1fff8179d6
- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
101 lines
3.4 KiB
Bash
Executable File
101 lines
3.4 KiB
Bash
Executable File
#!/usr/bin/env bash
# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units.
#
# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint
# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and
# closes — without ever reading an S3 response. Each TCP-won attempt is
# ~50ms of wire activity instead of the multi-frame handshake the regular
# rescue endpoint does, so windows that are too small for the full rescue
# can still land a stop-monitoring command.
#
# Usage:
#   ./blind_stop.sh <host> [tcp_port]
#
#   host        Device address, passed to the endpoint as the `host` query
#               parameter (required).
#   tcp_port    Device TCP port, passed as `tcp_port` (default: 9034).
#
# Env:
#   SFM_BASE_URL     Default: http://localhost:8200 (SFM direct).
#                    Set to http://localhost:8001/api/sfm to route through
#                    Terra-View's proxy.
#   MAX_ATTEMPTS     Default: 600
#   SLEEP_S          Default: 0 (no backoff — hammer it)
#   MAX_TIME_S       Default: 15. Passed to curl as --max-time (per request).
#   CONNECT_TIMEOUT  Default: 5. Passed to the endpoint as the
#                    `connect_timeout` query parameter.
#   REPEAT           Frames per TCP session (default 3 — increases hit rate
#                    if the device is busy reading its own buffer).
#   STOP_ON_OK       Default: 1. Set to 0 to keep hammering until
#                    MAX_ATTEMPTS is reached, even after successful sends
#                    (every 503 means the device is in *another* session,
#                    every 200 means our bytes got through — but the device
#                    may not have processed them).
#
# Exit status:
#   0   a request returned 200/201 and STOP_ON_OK=1
#   1   MAX_ATTEMPTS requests were made without stopping early, or the
#       temporary response file could not be created
#   2   usage error (missing <host>)

# Deliberately no `set -e`: a failing curl is an expected outcome here and is
# counted, not treated as fatal.
set -u

host="${1:-}"
tcp_port="${2:-9034}"
if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port]" >&2
  exit 2
fi

base="${SFM_BASE_URL:-http://localhost:8200}"
max_attempts="${MAX_ATTEMPTS:-600}"
sleep_s="${SLEEP_S:-0}"
max_time_s="${MAX_TIME_S:-15}"
connect_timeout="${CONNECT_TIMEOUT:-5}"
repeat="${REPEAT:-3}"
stop_on_ok="${STOP_ON_OK:-1}"

url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}"

# Response bodies go to a private, unpredictable temp file that is removed on
# every exit path (normal exit, Ctrl-C, TERM) instead of a guessable
# /tmp/<name>.<pid> path that could be pre-created by another user or leaked
# on interrupt.
resp_file=$(mktemp "${TMPDIR:-/tmp}/blind_resp.XXXXXX") || {
  echo "blind_stop: could not create temporary file" >&2
  exit 1
}
cleanup() { rm -f -- "$resp_file"; }
trap cleanup EXIT
trap 'exit 130' INT
trap 'exit 143' TERM

echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}"
echo "blind_stop: POST ${url}"
echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
echo "blind_stop: stop_on_ok=${stop_on_ok}"
echo

ok_count=0
busy_count=0
err_count=0
started=$(date +%s)

for ((i = 1; i <= max_attempts; i++)); do
  printf "[%4d] %s " "$i" "$(date +%H:%M:%S)"

  # Start every attempt with an empty response file so a failed request can
  # never report the body of a previous attempt.
  : > "$resp_file"

  # curl prints the -w output even when the transfer fails, so the fallback
  # must be applied *outside* the command substitution; appending a second
  # "000" inside it would yield "000000" and miss the 000) arm below.
  http_code=$(curl -sS -o "$resp_file" -w "%{http_code}" \
    --max-time "$max_time_s" \
    -X POST "$url") || http_code="000"
  body=$(cat "$resp_file" 2>/dev/null || true)

  case "$http_code" in
    200 | 201)
      ok_count=$((ok_count + 1))
      echo "SENT $body"
      if [[ "$stop_on_ok" == "1" ]]; then
        elapsed=$(($(date +%s) - started))
        echo
        echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}"
        echo "blind_stop: NEXT — wait ~10s, then try the full rescue:"
        echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}"
        exit 0
      fi
      ;;
    503)
      busy_count=$((busy_count + 1))
      echo "busy (503)"
      ;;
    000)
      # curl itself failed (exit status non-zero); its own message was
      # already written to stderr by -S.
      err_count=$((err_count + 1))
      echo "curl error"
      ;;
    *)
      err_count=$((err_count + 1))
      # Cap the echoed body so a large error page does not flood the log.
      echo "HTTP $http_code $body" | head -c 400
      echo
      ;;
  esac

  if [[ "$sleep_s" != "0" ]]; then
    sleep "$sleep_s"
  fi
done

elapsed=$(($(date +%s) - started))
echo
echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2
exit 1