1fff8179d6
- Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time.
100 lines
3.3 KiB
Bash
Executable File
100 lines
3.3 KiB
Bash
Executable File
#!/usr/bin/env bash
# Rescue an uncooperative MiniMate that's busy with another Auto Call Home
# (ACH) session.
#
# Repeatedly POSTs /device/rescue, pausing SLEEP_S seconds between attempts.
# While the device is in an ACH session, our SYN is either refused or silently
# dropped (the endpoint gives up after CONNECT_TIMEOUT seconds), and we try
# again. When the device is between sessions, our TCP connection gets through;
# the endpoint then disables Auto Call Home and erases events inside that same
# session and returns success.
#
# Usage:
#   ./rescue_device.sh <host> [tcp_port] [--no-erase] [--no-disable-ach]
#
# Examples:
#   ./rescue_device.sh 166.246.130.1 9034
#   ./rescue_device.sh 166.246.130.1 9034 --no-erase   # just silence it
#
# Environment:
#   SFM_BASE_URL     Defaults to http://localhost:8200 (SFM direct).
#                    Set to http://localhost:8001/api/sfm to route through
#                    Terra-View's proxy. Direct mode avoids the proxy's
#                    60s timeout, which matters for long-running endpoints.
#   MAX_ATTEMPTS     Cap on retries (default 600 ≈ 30+ min).
#   SLEEP_S          Delay between attempts, in seconds (default 1).
#   MAX_TIME_S       Per-request timeout, in seconds, given to curl as
#                    --max-time (default 60).
#   CONNECT_TIMEOUT  TCP connect timeout, in seconds, passed to the endpoint
#                    as the connect_timeout query parameter (default 5).
#   RECV_TIMEOUT     Per-frame S3 recv timeout, in seconds, passed to the
#                    endpoint as the recv_timeout query parameter (default 5).
#                    If POLL or any subsequent frame doesn't respond within
#                    this window, the rescue endpoint bails and this script
#                    retries.
|
|
set -u

# ---- Command-line parsing ----
# Positionals, in order: <host> (required), [tcp_port] (optional, default 9034).
# The two --no-* flags may appear anywhere on the command line. Parsing in a
# single pass means an omitted tcp_port followed by a flag
# (e.g. `rescue_device.sh HOST --no-erase`) is handled correctly rather than
# the flag being swallowed as the port and silently ignored.
host=""
tcp_port=""
disable_ach="true"
erase="true"
positional_count=0

for arg in "$@"; do
  case "$arg" in
    --no-erase) erase="false" ;;
    --no-disable-ach) disable_ach="false" ;;
    -*) echo "unknown flag: $arg" >&2; exit 2 ;;
    *)
      positional_count=$((positional_count + 1))
      case "$positional_count" in
        1) host="$arg" ;;
        2) tcp_port="$arg" ;;
        # A third positional is reported the same way a stray flag is.
        *) echo "unknown flag: $arg" >&2; exit 2 ;;
      esac
      ;;
  esac
done

if [[ -z "$host" ]]; then
  echo "usage: $0 <host> [tcp_port] [--no-erase] [--no-disable-ach]" >&2
  exit 2
fi

# An empty or missing port falls back to the default.
tcp_port="${tcp_port:-9034}"

# Reject anything that is not a decimal TCP port before it is pasted into the
# request URL. `10#` forces base 10 so a leading zero is not read as octal.
if ! [[ "$tcp_port" =~ ^[0-9]{1,5}$ ]] \
  || (( 10#$tcp_port < 1 || 10#$tcp_port > 65535 )); then
  echo "invalid tcp_port: $tcp_port" >&2
  exit 2
fi
|
|
# ---- Configuration (environment overrides with defaults) ----

# Print $1 as a canonical base-10 non-negative integer.
# Returns 1 (printing nothing) when $1 is not made up solely of digits.
# Canonicalising strips leading zeros so the value is never read as octal
# when it is later used in (( )) arithmetic.
parse_uint() {
  local value="$1"
  [[ "$value" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$((10#$value))"
}

# Print the rescue endpoint URL.
# Arguments: $1 base URL   $2 host   $3 tcp_port   $4 disable_ach   $5 erase
#            $6 connect_timeout   $7 recv_timeout
# Trailing slashes on the base URL are dropped so the path never starts with
# a double slash.
build_rescue_url() {
  local base_url="$1"
  while [[ "$base_url" == */ ]]; do
    base_url="${base_url%/}"
  done
  printf '%s/device/rescue?host=%s&tcp_port=%s&disable_ach=%s&erase=%s&connect_timeout=%s&recv_timeout=%s\n' \
    "$base_url" "$2" "$3" "$4" "$5" "$6" "$7"
}

base="${SFM_BASE_URL:-http://localhost:8200}"
max_attempts="${MAX_ATTEMPTS:-600}"
sleep_s="${SLEEP_S:-1}"
max_time_s="${MAX_TIME_S:-60}"
connect_timeout="${CONNECT_TIMEOUT:-5}"
recv_timeout="${RECV_TIMEOUT:-5}"

# max_attempts drives an arithmetic for-loop, so it must be a plain integer.
if ! max_attempts="$(parse_uint "$max_attempts")"; then
  echo "rescue: MAX_ATTEMPTS must be a non-negative integer (got '${MAX_ATTEMPTS:-}')" >&2
  exit 2
fi

url="$(build_rescue_url "$base" "${host:-}" "${tcp_port:-}" \
  "${disable_ach:-}" "${erase:-}" "$connect_timeout" "$recv_timeout")"

# Startup banner: echo the effective settings before the first attempt.
echo "rescue: target ${host:-}:${tcp_port:-} disable_ach=${disable_ach:-} erase=${erase:-}"
echo "rescue: connect_timeout=${connect_timeout}s recv_timeout=${recv_timeout}s"
echo "rescue: POST ${url}"
echo "rescue: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request"
echo
|
|
# ---- Retry loop ----
# POST to the rescue endpoint until it answers 200/201 or the attempt budget
# runs out. Exit status: 0 on success, 1 after giving up.

# Response bodies go to a private temp file. The EXIT trap removes it on every
# exit path, including an interrupt in the middle of a request.
resp_file="$(mktemp "${TMPDIR:-/tmp}/rescue_resp.XXXXXX")" || {
  echo "rescue: could not create temp file" >&2
  exit 1
}
trap 'rm -f -- "$resp_file"' EXIT

started=$(date +%s)
for ((i=1; i<=max_attempts; i++)); do
  printf "[%3d] %s " "$i" "$(date +%H:%M:%S)"

  # Start each attempt with an empty file so a failed request can never show
  # the previous attempt's body.
  : >"$resp_file"

  curl_rc=0
  http_code=$(curl -sS -o "$resp_file" -w "%{http_code}" \
    --max-time "$max_time_s" \
    -X POST "$url") || curl_rc=$?
  # curl prints the -w value even when the transfer fails, so appending a
  # fallback string to its stdout would yield "000000". Decide from the exit
  # status instead: any curl failure is reported as the sentinel "000".
  if (( curl_rc != 0 )); then
    http_code="000"
  fi
  body=$(cat "$resp_file" 2>/dev/null || true)

  case "$http_code" in
    200|201)
      elapsed=$(( $(date +%s) - started ))
      echo "OK (${elapsed}s total)"
      echo "$body"
      exit 0
      ;;
    503)
      # Connection refused / timeout — device busy in another session. Retry fast.
      echo "busy (503)"
      ;;
    000)
      echo "curl error (network)"
      ;;
    *)
      # Unexpected status: show it with the first 400 bytes of the body.
      echo "HTTP $http_code"
      echo " $body" | head -c 400
      echo
      ;;
  esac

  # Nothing follows the final attempt, so don't make the caller wait for it.
  if (( i < max_attempts )); then
    sleep "$sleep_s"
  fi
done

echo "rescue: gave up after ${max_attempts} attempts" >&2
exit 1