Add runbook for recovering wedged units and new scripts for device management

- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode.
- Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices.
- Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device.
- Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun.
- Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device.
- Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
This commit is contained in:
2026-05-17 07:58:13 +00:00
parent ae7edac83f
commit 1fff8179d6
8 changed files with 1401 additions and 10 deletions
+100
View File
@@ -0,0 +1,100 @@
#!/usr/bin/env bash
# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units.
#
# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint
# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and
# closes — without ever reading an S3 response. Each TCP-won attempt is
# ~50ms of wire activity instead of the multi-frame handshake the regular
# rescue endpoint does, so windows that are too small for the full rescue
# can still land a stop-monitoring command.
#
# Usage:
# ./blind_stop.sh <host> [tcp_port]
#
# Env:
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
# Set to http://localhost:8001/api/sfm to route through
# Terra-View's proxy.
# MAX_ATTEMPTS Default: 600
# SLEEP_S Default: 0 (no backoff — hammer it)
# MAX_TIME_S Default: 15
# CONNECT_TIMEOUT Default: 5
# REPEAT Frames per TCP session (default 3 — increases hit rate
# if the device is busy reading its own buffer).
# STOP_ON_OK Default: 1. Set to 0 to keep hammering indefinitely
# even after successful sends (every 503 means the device
# is in *another* session, every 200 means our bytes got
# through — but the device may not have processed them).
set -u

# --- arguments -------------------------------------------------------------
host="${1:-}"
tcp_port="${2:-9034}"
if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port]" >&2
  exit 2
fi

# --- tunables (all overridable from the environment) -----------------------
base="${SFM_BASE_URL:-http://localhost:8200}"
max_attempts="${MAX_ATTEMPTS:-600}"
sleep_s="${SLEEP_S:-0}"
max_time_s="${MAX_TIME_S:-15}"
connect_timeout="${CONNECT_TIMEOUT:-5}"
repeat="${REPEAT:-3}"
stop_on_ok="${STOP_ON_OK:-1}"

url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}"

echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}"
echo "blind_stop: POST ${url}"
echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
echo "blind_stop: stop_on_ok=${stop_on_ok}"
echo

# Response bodies go to a private mktemp file rather than a predictable
# /tmp/<name>.$$ path; the EXIT trap removes it on every exit path.
resp_file=$(mktemp "${TMPDIR:-/tmp}/blind_resp.XXXXXX") || {
  echo "blind_stop: mktemp failed" >&2
  exit 1
}
trap 'rm -f -- "$resp_file"' EXIT

ok_count=0
busy_count=0
err_count=0
started=$(date +%s)

for ((i = 1; i <= max_attempts; i++)); do
  printf "[%4d] %s " "$i" "$(date +%H:%M:%S)"

  # Start each attempt with an empty body so a failed request never reports
  # the previous attempt's response.
  : >"$resp_file"

  # curl prints "000" through -w even when the transfer fails, so appending
  # another "000" inside the substitution would yield "000000" and miss the
  # 000) arm below. Overwrite the value on failure instead.
  if ! http_code=$(curl -sS -o "$resp_file" -w "%{http_code}" \
    --max-time "$max_time_s" \
    -X POST "$url"); then
    http_code="000"
  fi
  body=$(cat -- "$resp_file" 2>/dev/null || true)

  case "$http_code" in
    200 | 201)
      # Bytes were written to the device; it may still not have acted on them.
      ok_count=$((ok_count + 1))
      echo "SENT $body"
      if [[ "$stop_on_ok" == "1" ]]; then
        elapsed=$(($(date +%s) - started))
        echo
        echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}"
        echo "blind_stop: NEXT — wait ~10s, then try the full rescue:"
        echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}"
        exit 0
      fi
      ;;
    503)
      # Endpoint could not get a TCP session to the device; retry.
      busy_count=$((busy_count + 1))
      echo "busy (503)"
      ;;
    000)
      # curl itself failed (connection error, timeout, ...).
      err_count=$((err_count + 1))
      echo "curl error"
      ;;
    *)
      err_count=$((err_count + 1))
      # Cap the echoed body so an unexpected large response can't flood the log.
      echo "HTTP $http_code $body" | head -c 400
      echo
      ;;
  esac

  if [[ "$sleep_s" != "0" ]]; then
    sleep "$sleep_s"
  fi
done

elapsed=$(($(date +%s) - started))
echo
echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2
exit 1
+99
View File
@@ -0,0 +1,99 @@
#!/usr/bin/env bash
# Rescue an uncooperative MiniMate that's busy with another ACH session.
#
# Hammers POST /device/rescue in a tight loop with a short timeout. When the
# device is in an ACH session our SYN either gets refused or silently dropped
# (5s connect timeout inside the endpoint) and we retry immediately. When the
# device is between sessions, our TCP wins, the endpoint disables Auto Call
# Home and erases events inside the same session, then returns success.
#
# Usage:
# ./rescue_device.sh <host> [tcp_port] [--no-erase] [--no-disable-ach]
#
# Examples:
# ./rescue_device.sh 166.246.130.1 9034
# ./rescue_device.sh 166.246.130.1 9034 --no-erase # just silence it
#
# Environment:
# SFM_BASE_URL Defaults to http://localhost:8200 (SFM direct).
# Set to http://localhost:8001/api/sfm to route through
# Terra-View's proxy. Direct mode avoids the proxy's
# 60s timeout, which matters for long-running endpoints.
# MAX_ATTEMPTS Cap on retries (default 600 ≈ 30+ min).
# SLEEP_S Backoff between attempts (default 1).
# MAX_TIME_S Per-request timeout (default 60).
# CONNECT_TIMEOUT TCP connect timeout (default 5).
# RECV_TIMEOUT Per-frame S3 recv timeout (default 5). If POLL or any
# subsequent frame doesn't respond within this window, the
# rescue endpoint bails and this script retries.
set -u

# --- arguments -------------------------------------------------------------
host="${1:-}"
tcp_port="${2:-9034}"
# Drop the two positionals; if fewer than two were given, drop whatever exists
# so only flags (if any) remain in "$@".
shift 2 2>/dev/null || shift "$#" 2>/dev/null

if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port] [--no-erase] [--no-disable-ach]" >&2
  exit 2
fi

disable_ach="true"
erase="true"
for arg in "$@"; do
  case "$arg" in
    --no-erase) erase="false" ;;
    --no-disable-ach) disable_ach="false" ;;
    *)
      echo "unknown flag: $arg" >&2
      exit 2
      ;;
  esac
done

# --- tunables (all overridable from the environment) -----------------------
base="${SFM_BASE_URL:-http://localhost:8200}"
max_attempts="${MAX_ATTEMPTS:-600}"
sleep_s="${SLEEP_S:-1}"
max_time_s="${MAX_TIME_S:-60}"
connect_timeout="${CONNECT_TIMEOUT:-5}"
recv_timeout="${RECV_TIMEOUT:-5}"

url="${base}/device/rescue?host=${host}&tcp_port=${tcp_port}&disable_ach=${disable_ach}&erase=${erase}&connect_timeout=${connect_timeout}&recv_timeout=${recv_timeout}"

echo "rescue: target ${host}:${tcp_port} disable_ach=${disable_ach} erase=${erase}"
echo "rescue: connect_timeout=${connect_timeout}s recv_timeout=${recv_timeout}s"
echo "rescue: POST ${url}"
echo "rescue: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
echo

# Response bodies go to a private mktemp file rather than a predictable
# /tmp/<name>.$$ path; the EXIT trap removes it on every exit path.
resp_file=$(mktemp "${TMPDIR:-/tmp}/rescue_resp.XXXXXX") || {
  echo "rescue: mktemp failed" >&2
  exit 1
}
trap 'rm -f -- "$resp_file"' EXIT

started=$(date +%s)

for ((i = 1; i <= max_attempts; i++)); do
  printf "[%3d] %s " "$i" "$(date +%H:%M:%S)"

  # Start each attempt with an empty body so a failed request never reports
  # the previous attempt's response.
  : >"$resp_file"

  # curl prints "000" through -w even when the transfer fails, so appending
  # another "000" inside the substitution would yield "000000" and miss the
  # 000) arm below. Overwrite the value on failure instead.
  if ! http_code=$(curl -sS -o "$resp_file" -w "%{http_code}" \
    --max-time "$max_time_s" \
    -X POST "$url"); then
    http_code="000"
  fi
  body=$(cat -- "$resp_file" 2>/dev/null || true)

  case "$http_code" in
    200 | 201)
      elapsed=$(($(date +%s) - started))
      echo "OK (${elapsed}s total)"
      echo "$body"
      exit 0
      ;;
    503)
      # Connection refused / timeout — device busy in another session. Retry.
      echo "busy (503)"
      ;;
    000)
      # curl itself failed (connection error, timeout, ...).
      echo "curl error (network)"
      ;;
    *)
      echo "HTTP $http_code"
      # Cap the echoed body so an unexpected large response can't flood the log.
      echo " $body" | head -c 400
      echo
      ;;
  esac

  # No point backing off after the final attempt — we are about to give up.
  if ((i < max_attempts)); then
    sleep "$sleep_s"
  fi
done

echo "rescue: gave up after ${max_attempts} attempts" >&2
exit 1
+44
View File
@@ -0,0 +1,44 @@
#!/usr/bin/env bash
# Hold a single TCP session open and drip stop-monitoring frames at a slow
# rate, so the device's UART RX FIFO has time to drain between sends.
#
# Use when high-rate spam isn't landing — typically because the device's
# firmware is too busy to drain its serial buffer fast enough and bytes
# are being lost to UART overrun.
#
# Usage:
# ./slow_drip.sh <host> [tcp_port] [duration_s]
#
# Env:
# DURATION Default: 120 (seconds; arg 3 overrides). Clamped 1..600.
# INTERVAL Seconds between drip sends (default 3). Lower = more
# aggressive, more risk of FIFO overrun. Higher = safer
# but fewer total drips per duration.
# CONNECT_TIMEOUT Default: 5
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
set -u

# --- arguments -------------------------------------------------------------
host="${1:-}"
tcp_port="${2:-9034}"
# Positional arg 3 wins over $DURATION, which wins over the built-in default.
duration="${3:-${DURATION:-120}}"
if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port] [duration_s]" >&2
  exit 2
fi

# --- tunables (all overridable from the environment) -----------------------
base="${SFM_BASE_URL:-http://localhost:8200}"
interval="${INTERVAL:-3}"
connect_timeout="${CONNECT_TIMEOUT:-5}"

url="${base}/device/stop_monitoring_slow_drip?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&interval_s=${interval}&connect_timeout=${connect_timeout}"

echo "slow_drip: target ${host}:${tcp_port} duration=${duration}s interval=${interval}s connect_timeout=${connect_timeout}s"
echo "slow_drip: POST ${url}"
echo

# Give curl enough slack to wait out the duration plus a buffer.
# awk does the addition so a fractional duration is accepted; the result is
# truncated to whole seconds.
max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 30 }')

curl -sS --max-time "$max_time" -X POST "$url"
rc=$?
# Terminate the response line; the body may not end with a newline.
echo

# Propagate curl's status so callers can tell a failed request from a
# completed one (a trailing bare `echo` would always report success).
exit "$rc"
+48
View File
@@ -0,0 +1,48 @@
#!/usr/bin/env bash
# Hammer a device with blind stop-monitoring sessions as fast as possible.
# Single HTTP call kicks off the burst inside SFM (no per-attempt HTTP
# overhead). Default: 10 seconds, ~500 ms per attempt = ~2 attempts/sec
# (~20 attempts per default burst).
#
# Usage:
# ./spam_stop.sh <host> [tcp_port] [duration_s]
#
# Examples:
# ./spam_stop.sh 166.246.130.1 # 10s burst
# ./spam_stop.sh 166.246.130.1 9034 30 # 30s burst
# DURATION=60 CONNECT_TIMEOUT=0.2 ./spam_stop.sh 166.246.130.1
#
# Env:
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
# Set to http://localhost:8001/api/sfm to route through
# Terra-View's proxy — but note the proxy has a 60s
# timeout, so long bursts need direct mode.
# DURATION Default: 10 (seconds; arg 3 overrides)
# CONNECT_TIMEOUT Default: 0.5 (seconds)
# REPEAT Default: 3 (stop frames per TCP session)
set -u

# --- arguments -------------------------------------------------------------
host="${1:-}"
tcp_port="${2:-9034}"
# Positional arg 3 wins over $DURATION, which wins over the built-in default.
duration="${3:-${DURATION:-10}}"
if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port] [duration_s]" >&2
  exit 2
fi

# --- tunables (all overridable from the environment) -----------------------
base="${SFM_BASE_URL:-http://localhost:8200}"
connect_timeout="${CONNECT_TIMEOUT:-0.5}"
repeat="${REPEAT:-3}"

url="${base}/device/stop_monitoring_spam?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&connect_timeout=${connect_timeout}&repeat=${repeat}"

echo "spam_stop: target ${host}:${tcp_port} duration=${duration}s connect_timeout=${connect_timeout}s repeat=${repeat}"
echo "spam_stop: POST ${url}"
echo

# Give curl enough slack to wait out the duration plus a buffer.
# awk does the addition so a fractional duration is accepted; the result is
# truncated to whole seconds.
max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 10 }')

curl -sS --max-time "$max_time" -X POST "$url"
rc=$?
# Terminate the response line; the body may not end with a newline.
echo

# Propagate curl's status so callers can tell a failed request from a
# completed one (a trailing bare `echo` would always report success).
exit "$rc"
+58
View File
@@ -0,0 +1,58 @@
#!/usr/bin/env bash
# Passive monitor for a misbehaving unit. Every INTERVAL seconds, attempts
# a single short TCP probe + storage_range read and logs the result. Designed
# to run unattended for hours/days and tell you when the unit comes back.
#
# Usage:
# ./watch_unit.sh <host> [tcp_port]
#
# Env:
# INTERVAL Seconds between checks (default 300 = 5 min)
# LOG_FILE Append results here (default /tmp/watch_<host>.log)
# SFM_BASE_URL Default: http://localhost:8200
set -u

# --- arguments -------------------------------------------------------------
host="${1:-}"
tcp_port="${2:-9034}"
if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port]" >&2
  exit 2
fi

# --- tunables (all overridable from the environment) -----------------------
interval="${INTERVAL:-300}"
log_file="${LOG_FILE:-/tmp/watch_${host}.log}"
base="${SFM_BASE_URL:-http://localhost:8200}"

url="${base}/device/events/storage_range?host=${host}&tcp_port=${tcp_port}"

echo "watch_unit: target ${host}:${tcp_port} interval=${interval}s log=${log_file}"
echo "watch_unit: Ctrl-C to stop"

# Response bodies go to a private mktemp file rather than a predictable
# /tmp/<name>.$$ path; the EXIT trap removes it when the loop is interrupted.
resp_file=$(mktemp "${TMPDIR:-/tmp}/watch_resp.XXXXXX") || {
  echo "watch_unit: mktemp failed" >&2
  exit 1
}
trap 'rm -f -- "$resp_file"' EXIT

while true; do
  ts=$(date '+%Y-%m-%d %H:%M:%S')

  # Start each probe with an empty body so a failed request never logs the
  # previous probe's response.
  : >"$resp_file"

  # curl prints "000" through -w even when the transfer fails, so appending
  # another "000" inside the substitution would yield "000000" and the
  # failure would be logged as HTTP_000000 instead of CURL_FAIL. Overwrite
  # the value on failure instead.
  if ! http_code=$(curl -sS -o "$resp_file" -w "%{http_code}" \
    --max-time 20 "$url"); then
    http_code="000"
  fi
  body=$(cat -- "$resp_file" 2>/dev/null || true)

  # Every outcome is printed to the terminal and appended to the log file.
  case "$http_code" in
    200 | 201)
      # Strip the raw_hex field for readability and cap the line length.
      summary=$(printf '%s\n' "$body" | sed 's/"raw_hex":"[^"]*",*//; s/,*$//' | head -c 200)
      echo "$ts REACHABLE $summary" | tee -a "$log_file"
      ;;
    502 | 503)
      err=$(printf '%s\n' "$body" | head -c 150)
      echo "$ts ERROR_$http_code $err" | tee -a "$log_file"
      ;;
    000)
      # curl itself failed (connection error, timeout, ...).
      echo "$ts CURL_FAIL (network/timeout)" | tee -a "$log_file"
      ;;
    *)
      echo "$ts HTTP_$http_code $(printf '%s\n' "$body" | head -c 150)" | tee -a "$log_file"
      ;;
  esac

  sleep "$interval"
done