Add runbook for recovering wedged units and new scripts for device management
- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
This commit is contained in:
Executable
+100
@@ -0,0 +1,100 @@
|
||||
#!/usr/bin/env bash
|
||||
# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units.
|
||||
#
|
||||
# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint
|
||||
# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and
|
||||
# closes — without ever reading an S3 response. Each TCP-won attempt is
|
||||
# ~50ms of wire activity instead of the multi-frame handshake the regular
|
||||
# rescue endpoint does, so windows that are too small for the full rescue
|
||||
# can still land a stop-monitoring command.
|
||||
#
|
||||
# Usage:
|
||||
# ./blind_stop.sh <host> [tcp_port]
|
||||
#
|
||||
# Env:
|
||||
# SFM_BASE_URL Default: http://localhost:8200 (SFM direct).
|
||||
# Set to http://localhost:8001/api/sfm to route through
|
||||
# Terra-View's proxy.
|
||||
# MAX_ATTEMPTS Default: 600
|
||||
# SLEEP_S Default: 0 (no backoff — hammer it)
|
||||
# MAX_TIME_S Default: 15
|
||||
# CONNECT_TIMEOUT Default: 5
|
||||
# REPEAT Frames per TCP session (default 3 — increases hit rate
|
||||
# if the device is busy reading its own buffer).
|
||||
# STOP_ON_OK Default: 1. Set to 0 to keep hammering indefinitely
|
||||
# even after successful sends (every 503 means the device
|
||||
# is in *another* session, every 200 means our bytes got
|
||||
# through — but the device may not have processed them).
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
max_attempts="${MAX_ATTEMPTS:-600}"
|
||||
sleep_s="${SLEEP_S:-0}"
|
||||
max_time_s="${MAX_TIME_S:-15}"
|
||||
connect_timeout="${CONNECT_TIMEOUT:-5}"
|
||||
repeat="${REPEAT:-3}"
|
||||
stop_on_ok="${STOP_ON_OK:-1}"
|
||||
|
||||
url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}"
|
||||
|
||||
echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}"
|
||||
echo "blind_stop: POST ${url}"
|
||||
echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
|
||||
echo "blind_stop: stop_on_ok=${stop_on_ok}"
|
||||
echo
|
||||
|
||||
ok_count=0
|
||||
busy_count=0
|
||||
err_count=0
|
||||
started=$(date +%s)
|
||||
|
||||
for ((i=1; i<=max_attempts; i++)); do
|
||||
printf "[%4d] %s " "$i" "$(date +%H:%M:%S)"
|
||||
http_code=$(curl -sS -o /tmp/blind_resp.$$ -w "%{http_code}" \
|
||||
--max-time "$max_time_s" \
|
||||
-X POST "$url" || echo "000")
|
||||
body=$(cat /tmp/blind_resp.$$ 2>/dev/null || true)
|
||||
rm -f /tmp/blind_resp.$$
|
||||
|
||||
case "$http_code" in
|
||||
200|201)
|
||||
ok_count=$((ok_count + 1))
|
||||
echo "SENT $body"
|
||||
if [[ "$stop_on_ok" == "1" ]]; then
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo
|
||||
echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}"
|
||||
echo "blind_stop: NEXT — wait ~10s, then try the full rescue:"
|
||||
echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}"
|
||||
exit 0
|
||||
fi
|
||||
;;
|
||||
503)
|
||||
busy_count=$((busy_count + 1))
|
||||
echo "busy (503)"
|
||||
;;
|
||||
000)
|
||||
err_count=$((err_count + 1))
|
||||
echo "curl error"
|
||||
;;
|
||||
*)
|
||||
err_count=$((err_count + 1))
|
||||
echo "HTTP $http_code $body" | head -c 400
|
||||
echo
|
||||
;;
|
||||
esac
|
||||
[[ "$sleep_s" != "0" ]] && sleep "$sleep_s"
|
||||
done
|
||||
|
||||
elapsed=$(( $(date +%s) - started ))
|
||||
echo
|
||||
echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2
|
||||
exit 1
|
||||
Reference in New Issue
Block a user