Add runbook for recovering wedged units and new scripts for device management
- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
This commit is contained in:
Executable
+100
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units.
|
||||
#
|
||||
# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint
|
||||
# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and
|
||||
# closes — without ever reading an S3 response. Each TCP-won attempt is
|
||||
# ~50ms of wire activity instead of the multi-frame handshake the regular
|
||||
# rescue endpoint does, so windows that are too small for the full rescue
|
||||
# can still land a stop-monitoring command.
|
||||
#
|
||||
# Usage:
|
||||
# ./blind_stop.sh <host> [tcp_port]
|
||||
#
|
||||
# Env:
|
||||
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
|
||||
# Set to http://localhost:8001/api/sfm to route through
|
||||
# Terra-View's proxy.
|
||||
# MAX_ATTEMPTS Default: 600
|
||||
# SLEEP_S Default: 0 (no backoff — hammer it)
|
||||
# MAX_TIME_S Default: 15
|
||||
# CONNECT_TIMEOUT Default: 5
|
||||
# REPEAT Frames per TCP session (default 3 — increases hit rate
|
||||
# if the device is busy reading its own buffer).
|
||||
# STOP_ON_OK Default: 1. Set to 0 to keep hammering indefinitely
|
||||
# even after successful sends (every 503 means the device
|
||||
# is in *another* session, every 200 means our bytes got
|
||||
# through — but the device may not have processed them).
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
max_attempts="${MAX_ATTEMPTS:-600}"
|
||||
sleep_s="${SLEEP_S:-0}"
|
||||
max_time_s="${MAX_TIME_S:-15}"
|
||||
connect_timeout="${CONNECT_TIMEOUT:-5}"
|
||||
repeat="${REPEAT:-3}"
|
||||
stop_on_ok="${STOP_ON_OK:-1}"
|
||||
|
||||
url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}"
|
||||
|
||||
echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}"
|
||||
echo "blind_stop: POST ${url}"
|
||||
echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
|
||||
echo "blind_stop: stop_on_ok=${stop_on_ok}"
|
||||
echo
|
||||
|
||||
ok_count=0
|
||||
busy_count=0
|
||||
err_count=0
|
||||
started=$(date +%s)
|
||||
|
||||
for ((i=1; i<=max_attempts; i++)); do
|
||||
printf "[%4d] %s " "$i" "$(date +%H:%M:%S)"
|
||||
http_code=$(curl -sS -o /tmp/blind_resp.$$ -w "%{http_code}" \
|
||||
--max-time "$max_time_s" \
|
||||
-X POST "$url" || echo "000")
|
||||
body=$(cat /tmp/blind_resp.$$ 2>/dev/null || true)
|
||||
rm -f /tmp/blind_resp.$$
|
||||
|
||||
case "$http_code" in
|
||||
200|201)
|
||||
ok_count=$((ok_count + 1))
|
||||
echo "SENT $body"
|
||||
if [[ "$stop_on_ok" == "1" ]]; then
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo
|
||||
echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}"
|
||||
echo "blind_stop: NEXT — wait ~10s, then try the full rescue:"
|
||||
echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}"
|
||||
exit 0
|
||||
fi
|
||||
;;
|
||||
503)
|
||||
busy_count=$((busy_count + 1))
|
||||
echo "busy (503)"
|
||||
;;
|
||||
000)
|
||||
err_count=$((err_count + 1))
|
||||
echo "curl error"
|
||||
;;
|
||||
*)
|
||||
err_count=$((err_count + 1))
|
||||
echo "HTTP $http_code $body" | head -c 400
|
||||
echo
|
||||
;;
|
||||
esac
|
||||
[[ "$sleep_s" != "0" ]] && sleep "$sleep_s"
|
||||
done
|
||||
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo
|
||||
echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2
|
||||
exit 1
|
||||
Executable
+99
@@ -0,0 +1,99 @@
|
||||
#!/usr/bin/env bash
|
||||
# Rescue an uncooperative MiniMate that's busy with another ACH session.
|
||||
#
|
||||
# Hammers POST /device/rescue in a tight loop with a short timeout. When the
|
||||
# device is in an ACH session our SYN either gets refused or silently dropped
|
||||
# (5s connect timeout inside the endpoint) and we retry immediately. When the
|
||||
# device is between sessions, our TCP wins, the endpoint disables Auto Call
|
||||
# Home and erases events inside the same session, then returns success.
|
||||
#
|
||||
# Usage:
|
||||
# ./rescue_device.sh <host> [tcp_port] [--no-erase] [--no-disable-ach]
|
||||
#
|
||||
# Examples:
|
||||
# ./rescue_device.sh 166.246.130.1 9034
|
||||
# ./rescue_device.sh 166.246.130.1 9034 --no-erase # just silence it
|
||||
#
|
||||
# Environment:
|
||||
# SFM_BASE_URL Defaults to http://localhost:8200 (SFM direct).
|
||||
# Set to http://localhost:8001/api/sfm to route through
|
||||
# Terra-View's proxy. Direct mode avoids the proxy's
|
||||
# 60s timeout, which matters for long-running endpoints.
|
||||
# MAX_ATTEMPTS Cap on retries (default 600 ≈ 30+ min).
|
||||
# SLEEP_S Backoff between attempts (default 1).
|
||||
# MAX_TIME_S Per-request timeout (default 60).
|
||||
# CONNECT_TIMEOUT TCP connect timeout (default 5).
|
||||
# RECV_TIMEOUT Per-frame S3 recv timeout (default 5). If POLL or any
|
||||
# subsequent frame doesn't respond within this window, the
|
||||
# rescue endpoint bails and this script retries.
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
shift 2 2>/dev/null || shift $# 2>/dev/null
|
||||
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port] [--no-erase] [--no-disable-ach]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
disable_ach="true"
|
||||
erase="true"
|
||||
for arg in "$@"; do
|
||||
case "$arg" in
|
||||
--no-erase) erase="false" ;;
|
||||
--no-disable-ach) disable_ach="false" ;;
|
||||
*) echo "unknown flag: $arg" >&2; exit 2 ;;
|
||||
esac
|
||||
done
|
||||
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
max_attempts="${MAX_ATTEMPTS:-600}"
|
||||
sleep_s="${SLEEP_S:-1}"
|
||||
max_time_s="${MAX_TIME_S:-60}"
|
||||
connect_timeout="${CONNECT_TIMEOUT:-5}"
|
||||
recv_timeout="${RECV_TIMEOUT:-5}"
|
||||
|
||||
url="${base}/device/rescue?host=${host}&tcp_port=${tcp_port}&disable_ach=${disable_ach}&erase=${erase}&connect_timeout=${connect_timeout}&recv_timeout=${recv_timeout}"
|
||||
|
||||
echo "rescue: target ${host}:${tcp_port} disable_ach=${disable_ach} erase=${erase}"
|
||||
echo "rescue: connect_timeout=${connect_timeout}s recv_timeout=${recv_timeout}s"
|
||||
echo "rescue: POST ${url}"
|
||||
echo "rescue: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
|
||||
echo
|
||||
|
||||
started=$(date +%s)
|
||||
for ((i=1; i<=max_attempts; i++)); do
|
||||
printf "[%3d] %s " "$i" "$(date +%H:%M:%S)"
|
||||
http_code=$(curl -sS -o /tmp/rescue_resp.$$ -w "%{http_code}" \
|
||||
--max-time "$max_time_s" \
|
||||
-X POST "$url" || echo "000")
|
||||
body=$(cat /tmp/rescue_resp.$$ 2>/dev/null || true)
|
||||
rm -f /tmp/rescue_resp.$$
|
||||
|
||||
case "$http_code" in
|
||||
200|201)
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo "OK (${elapsed}s total)"
|
||||
echo "$body"
|
||||
exit 0
|
||||
;;
|
||||
503)
|
||||
# Connection refused / timeout — device busy in another session. Retry fast.
|
||||
echo "busy (503)"
|
||||
;;
|
||||
000)
|
||||
echo "curl error (network)"
|
||||
;;
|
||||
*)
|
||||
echo "HTTP $http_code"
|
||||
echo " $body" | head -c 400
|
||||
echo
|
||||
;;
|
||||
esac
|
||||
sleep "$sleep_s"
|
||||
done
|
||||
|
||||
echo "rescue: gave up after ${max_attempts} attempts" >&2
|
||||
exit 1
|
||||
Executable
+44
@@ -0,0 +1,44 @@
|
||||
#!/usr/bin/env bash
|
||||
# Hold a single TCP session open and drip stop-monitoring frames at a slow
|
||||
# rate, so the device's UART RX FIFO has time to drain between sends.
|
||||
#
|
||||
# Use when high-rate spam isn't landing — typically because the device's
|
||||
# firmware is too busy to drain its serial buffer fast enough and bytes
|
||||
# are being lost to UART overrun.
|
||||
#
|
||||
# Usage:
|
||||
# ./slow_drip.sh <host> [tcp_port] [duration_s]
|
||||
#
|
||||
# Env:
|
||||
# DURATION Default: 120 (seconds; arg 3 overrides). Clamped 1..600.
|
||||
# INTERVAL Seconds between drip sends (default 3). Lower = more
|
||||
# aggressive, more risk of FIFO overrun. Higher = safer
|
||||
# but fewer total drips per duration.
|
||||
# CONNECT_TIMEOUT Default: 5
|
||||
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
duration="${3:-${DURATION:-120}}"
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port] [duration_s]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
interval="${INTERVAL:-3}"
|
||||
connect_timeout="${CONNECT_TIMEOUT:-5}"
|
||||
|
||||
url="${base}/device/stop_monitoring_slow_drip?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&interval_s=${interval}&connect_timeout=${connect_timeout}"
|
||||
|
||||
echo "slow_drip: target ${host}:${tcp_port} duration=${duration}s interval=${interval}s connect_timeout=${connect_timeout}s"
|
||||
echo "slow_drip: POST ${url}"
|
||||
echo
|
||||
|
||||
# Give curl enough slack to wait out the duration plus a buffer
|
||||
max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 30 }')
|
||||
|
||||
curl -sS --max-time "$max_time" -X POST "$url"
|
||||
echo
|
||||
Executable
+48
@@ -0,0 +1,48 @@
|
||||
#!/usr/bin/env bash
|
||||
# Hammer a device with blind stop-monitoring sessions as fast as possible.
|
||||
# Single HTTP call kicks off the burst inside SFM (no per-attempt HTTP
|
||||
# overhead). Default: 10 seconds, ~500 ms per attempt = ~20 attempts/sec.
|
||||
#
|
||||
# Usage:
|
||||
# ./spam_stop.sh <host> [tcp_port] [duration_s]
|
||||
#
|
||||
# Examples:
|
||||
# ./spam_stop.sh 166.246.130.1 # 10s burst
|
||||
# ./spam_stop.sh 166.246.130.1 9034 30 # 30s burst
|
||||
# DURATION=60 CONNECT_TIMEOUT=0.2 ./spam_stop.sh 166.246.130.1
|
||||
#
|
||||
# Env:
|
||||
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
|
||||
# Set to http://localhost:8001/api/sfm to route through
|
||||
# Terra-View's proxy — but note the proxy has a 60s
|
||||
# timeout, so long bursts need direct mode.
|
||||
# DURATION Default: 10 (seconds; arg 3 overrides)
|
||||
# CONNECT_TIMEOUT Default: 0.5 (seconds)
|
||||
# REPEAT Default: 3 (stop frames per TCP session)
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
duration="${3:-${DURATION:-10}}"
|
||||
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port] [duration_s]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
connect_timeout="${CONNECT_TIMEOUT:-0.5}"
|
||||
repeat="${REPEAT:-3}"
|
||||
|
||||
url="${base}/device/stop_monitoring_spam?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&connect_timeout=${connect_timeout}&repeat=${repeat}"
|
||||
|
||||
echo "spam_stop: target ${host}:${tcp_port} duration=${duration}s connect_timeout=${connect_timeout}s repeat=${repeat}"
|
||||
echo "spam_stop: POST ${url}"
|
||||
echo
|
||||
|
||||
# Give curl enough slack to wait out the duration plus a buffer
|
||||
max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 10 }')
|
||||
|
||||
curl -sS --max-time "$max_time" -X POST "$url"
|
||||
echo
|
||||
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env bash
|
||||
# Passive monitor for a misbehaving unit. Every INTERVAL seconds, attempts
|
||||
# a single short TCP probe + storage_range read and logs the result. Designed
|
||||
# to run unattended for hours/days and tell you when the unit comes back.
|
||||
#
|
||||
# Usage:
|
||||
# ./watch_unit.sh <host> [tcp_port]
|
||||
#
|
||||
# Env:
|
||||
# INTERVAL Seconds between checks (default 300 = 5 min)
|
||||
# LOG_FILE Append results here (default /tmp/watch_<host>.log)
|
||||
# SFM_BASE_URL Default: http://localhost:8200
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
interval="${INTERVAL:-300}"
|
||||
log_file="${LOG_FILE:-/tmp/watch_${host}.log}"
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
|
||||
url="${base}/device/events/storage_range?host=${host}&tcp_port=${tcp_port}"
|
||||
|
||||
echo "watch_unit: target ${host}:${tcp_port} interval=${interval}s log=${log_file}"
|
||||
echo "watch_unit: Ctrl-C to stop"
|
||||
|
||||
while true; do
|
||||
ts=$(date '+%Y-%m-%d %H:%M:%S')
|
||||
http_code=$(curl -sS -o /tmp/watch_resp.$$ -w "%{http_code}" \
|
||||
--max-time 20 "$url" || echo "000")
|
||||
body=$(cat /tmp/watch_resp.$$ 2>/dev/null || true)
|
||||
rm -f /tmp/watch_resp.$$
|
||||
|
||||
case "$http_code" in
|
||||
200|201)
|
||||
# Strip the raw_hex for readability
|
||||
summary=$(echo "$body" | sed 's/"raw_hex":"[^"]*",*//; s/,*$//' | head -c 200)
|
||||
echo "$ts REACHABLE $summary" | tee -a "$log_file"
|
||||
;;
|
||||
502|503)
|
||||
err=$(echo "$body" | head -c 150)
|
||||
echo "$ts ERROR_$http_code $err" | tee -a "$log_file"
|
||||
;;
|
||||
000)
|
||||
echo "$ts CURL_FAIL (network/timeout)" | tee -a "$log_file"
|
||||
;;
|
||||
*)
|
||||
echo "$ts HTTP_$http_code $(echo "$body" | head -c 150)" | tee -a "$log_file"
|
||||
;;
|
||||
esac
|
||||
|
||||
sleep "$interval"
|
||||
done
|
||||
Reference in New Issue
Block a user