Add runbook for recovering wedged units and new scripts for device management
- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
This commit is contained in:
Executable
+58
@@ -0,0 +1,58 @@
|
||||
#!/usr/bin/env bash
|
||||
# Passive monitor for a misbehaving unit. Every INTERVAL seconds, attempts
|
||||
# a single short TCP probe + storage_range read and logs the result. Designed
|
||||
# to run unattended for hours/days and tell you when the unit comes back.
|
||||
#
|
||||
# Usage:
|
||||
# ./watch_unit.sh <host> [tcp_port]
|
||||
#
|
||||
# Env:
|
||||
# INTERVAL Seconds between checks (default 300 = 5 min)
|
||||
# LOG_FILE Append results here (default /tmp/watch_<host>.log)
|
||||
# SFM_BASE_URL Default: http://localhost:8200
|
||||
|
||||
set -u
|
||||
|
||||
host="${1:-}"
|
||||
tcp_port="${2:-9034}"
|
||||
if [[ -z "$host" ]]; then
|
||||
echo "usage: $0 <host> [tcp_port]" >&2
|
||||
exit 2
|
||||
fi
|
||||
|
||||
interval="${INTERVAL:-300}"
|
||||
log_file="${LOG_FILE:-/tmp/watch_${host}.log}"
|
||||
base="${SFM_BASE_URL:-http://localhost:8200}"
|
||||
|
||||
url="${base}/device/events/storage_range?host=${host}&tcp_port=${tcp_port}"
|
||||
|
||||
echo "watch_unit: target ${host}:${tcp_port} interval=${interval}s log=${log_file}"
|
||||
echo "watch_unit: Ctrl-C to stop"
|
||||
|
||||
while true; do
|
||||
ts=$(date '+%Y-%m-%d %H:%M:%S')
|
||||
http_code=$(curl -sS -o /tmp/watch_resp.$$ -w "%{http_code}" \
|
||||
--max-time 20 "$url" || echo "000")
|
||||
body=$(cat /tmp/watch_resp.$$ 2>/dev/null || true)
|
||||
rm -f /tmp/watch_resp.$$
|
||||
|
||||
case "$http_code" in
|
||||
200|201)
|
||||
# Strip the raw_hex for readability
|
||||
summary=$(echo "$body" | sed 's/"raw_hex":"[^"]*",*//; s/,*$//' | head -c 200)
|
||||
echo "$ts REACHABLE $summary" | tee -a "$log_file"
|
||||
;;
|
||||
502|503)
|
||||
err=$(echo "$body" | head -c 150)
|
||||
echo "$ts ERROR_$http_code $err" | tee -a "$log_file"
|
||||
;;
|
||||
000)
|
||||
echo "$ts CURL_FAIL (network/timeout)" | tee -a "$log_file"
|
||||
;;
|
||||
*)
|
||||
echo "$ts HTTP_$http_code $(echo "$body" | head -c 150)" | tee -a "$log_file"
|
||||
;;
|
||||
esac
|
||||
|
||||
sleep "$interval"
|
||||
done
|
||||
Reference in New Issue
Block a user