From aac1c8e06df73c2d4fdfaa7881504c904c9818b5 Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 14 May 2026 21:09:21 +0000 Subject: [PATCH 1/5] fix(import): derive record_type from filename suffix instead of hardcoding "Waveform" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The BW ACH ingest path was inserting every event with record_type="Waveform" regardless of the actual type because read_blastware_file() had `ev.record_type = "Waveform"` hardcoded, and the live watcher-forward path parses files from a tmp path (suffix ".bw") that doesn't carry the original extension. V10.72+ MiniMate Plus firmware encodes the event type as the last character of the AB0T extension scheme (H=Histogram, W=Waveform, M=Manual, E=Event, C=Combo). This change: 1. Adds derive_record_type_from_filename() public helper in minimateplus/event_file_io.py 2. Uses it inside read_blastware_file() so direct callers (the --dry-run path of scripts/import_bw.py, tests, ad-hoc scripts) get correct types automatically 3. Overrides ev.record_type in WaveformStore.save_imported_bw() using the ORIGINAL filename (source_path.name) — required because the parser sees only the tmp file Old S338 firmware (3-char extensions ending in `0`) and any unrecognized suffix fall back to "Waveform". Existing DB rows ingested before this fix are stuck with record_type="Waveform" — a one-off SQL backfill would fix them retroactively if desired. Terra-view's event modal also derives client-side from the filename, so the UI already shows the correct type for old events even without the backfill. Version bumped to 0.16.1 in pyproject.toml, event_file_io.py TOOL_VERSION, sfm/server.py FastAPI version, and CHANGELOG.md. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 14 +++++++++ minimateplus/event_file_io.py | 53 +++++++++++++++++++++++++++++++++-- pyproject.toml | 2 +- sfm/server.py | 2 +- sfm/waveform_store.py | 10 +++++++ 5 files changed, 77 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a9f718..0bdfdf4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,20 @@ All notable changes to seismo-relay are documented here. --- +## v0.16.1 — 2026-05-14 + +### Fixed + +- **`record_type` always "Waveform" for forwarded events.** `read_blastware_file()` hardcoded `ev.record_type = "Waveform"` regardless of the file's actual type. The watcher-forward pipeline (the main BW ACH ingest path) compounds this by parsing files from a tmp path with a `.bw` suffix, so even a filename-based fallback inside the parser still wouldn't see the original extension. Now: + + 1. New `derive_record_type_from_filename(filename)` helper in `minimateplus/event_file_io.py` derives the type from the LAST character of the filename's extension (V10.72+ AB0T scheme: `H`=Histogram, `W`=Waveform, `M`=Manual, `E`=Event, `C`=Combo). Falls back to `"Waveform"` for old S338 firmware (3-char extensions ending in `0`) and any unrecognized suffix. + 2. `read_blastware_file()` now calls the helper with its `path.name` so direct callers (the `--dry-run` path in `scripts/import_bw.py`, tests, ad-hoc scripts) get the right value automatically. + 3. `WaveformStore.save_imported_bw()` overrides `ev.record_type` with the **original** filename's derived type after parsing (the tmp file inside the parser doesn't carry the original extension). This is the path the live watcher-forwarder hits, so the DB column now reflects the actual event type going forward. + + Events ingested before this fix are stuck with `record_type="Waveform"` in the DB; a one-off backfill (`UPDATE events SET record_type = ... WHERE blastware_filename LIKE '%H'`) would fix them retroactively if desired. Terra-view's event modal also derives client-side from the filename, so the UI already shows the correct type for old events even without the backfill. + +--- + ## v0.16.0 — 2026-05-11 The "BW ACH ingestion" release. When paired with **series3-watcher v1.5.0**, every Blastware ACH event (binary + `_ASCII.TXT` report) lands in SeismoDb with device-authoritative peaks, project metadata, sensor self-check, and ZC/Time-of-Peak data — without depending on the still-undecoded waveform body codec. This is the end-to-end product win discussed in v0.15.0's "out of scope" notes: sortable / filterable monthly-summary review of historical events, populated from the BW ASCII export rather than re-decoded samples. diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index a415599..12ad2db 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -47,7 +47,7 @@ SIDECAR_KIND = "sfm.event" # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. -TOOL_VERSION = "0.16.0" +TOOL_VERSION = "0.16.1" try: # Best-effort: prefer the installed metadata when it's NEWER than the @@ -646,6 +646,50 @@ def _peaks_from_samples(samples: dict[str, list[int]]) -> PeakValues: ) +_RECORD_TYPE_BY_EXT_SUFFIX = { + 'H': 'Histogram', + 'W': 'Waveform', + 'M': 'Manual', + 'E': 'Event', + 'C': 'Combo', +} + + +def derive_record_type_from_filename(filename, default: str = "Waveform") -> str: + """Derive a BW Event's record_type from its filename's extension suffix. + + V10.72+ MiniMate Plus firmware encodes the event type as the LAST + character of the extension (the `T` in BW's `AB0T` scheme): + + ``M529LKIQ.G10H`` → H → ``"Histogram"`` + ``T350L385.VY0W`` → W → ``"Waveform"`` + ``...M`` → M → ``"Manual"`` + ``...E`` → E → ``"Event"`` + ``...C`` → C → ``"Combo"`` + + Old S338 firmware uses 3-char extensions ending in ``0`` whose + encoding is not yet known — those fall through to ``default``. + Micromate Series 4 uses a different scheme entirely (observed: + ``IDFH``, ``IDFW``) but the LAST-char convention (H / W) still holds + for the type code, so it works for both families. + + Returns ``default`` if filename is empty, has no extension, or the + suffix char isn't a recognized type code. + """ + if not filename: + return default + try: + name = Path(filename).name + except (TypeError, ValueError): + return default + if '.' not in name: + return default + ext = name.rsplit('.', 1)[1] + if not ext: + return default + return _RECORD_TYPE_BY_EXT_SUFFIX.get(ext[-1].upper(), default) + + def read_blastware_file(path: Union[str, Path]) -> Event: """ Parse a Blastware waveform file into an Event. @@ -727,7 +771,12 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ev = Event(index=-1) if strt_fields.get("waveform_key"): ev._waveform_key = bytes.fromhex(strt_fields["waveform_key"]) - ev.record_type = "Waveform" + # Derive record_type from the filename's extension suffix (H/W/M/E/C). + # When called from save_imported_bw the path here is a tmp file with a + # ".bw" suffix, so the derivation falls back to "Waveform" and the + # caller overrides ev.record_type using the original filename — see + # waveform_store.save_imported_bw. + ev.record_type = derive_record_type_from_filename(path.name) ev.rectime_seconds = strt_fields.get("rectime_seconds") ev.total_samples = strt_fields.get("total_samples") ev.pretrig_samples = strt_fields.get("pretrig_samples") diff --git a/pyproject.toml b/pyproject.toml index 70530de..c8abd3c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "seismo-relay" -version = "0.16.0" +version = "0.16.1" description = "Python client and REST server for MiniMate Plus seismographs" requires-python = ">=3.10" dependencies = [ diff --git a/sfm/server.py b/sfm/server.py index 772f922..9005652 100644 --- a/sfm/server.py +++ b/sfm/server.py @@ -86,7 +86,7 @@ app = FastAPI( "Implements the minimateplus RS-232 protocol library.\n" "Proxied by terra-view at /api/sfm/*." ), - version="0.16.0", + version="0.16.1", ) # Allow requests from the waveform viewer opened as a local file (file://) diff --git a/sfm/waveform_store.py b/sfm/waveform_store.py index 93cd970..0d04460 100644 --- a/sfm/waveform_store.py +++ b/sfm/waveform_store.py @@ -300,6 +300,16 @@ class WaveformStore: except FileNotFoundError: pass + # read_blastware_file derives record_type from its path arg, but + # that arg is the tmp file (suffix ".bw") — so override with the + # original filename's encoded type (H/W/M/E/C in the BW AB0T + # scheme). Without this override every BW-imported event lands + # in the DB with record_type="Waveform" regardless of the actual + # type (Histogram, Manual, etc.). + ev.record_type = event_file_io.derive_record_type_from_filename( + source_path.name + ) + # Parse the BW ASCII report if one was supplied. Failures here # are non-fatal: we still write the binary + sidecar without the # rich derived fields. -- 2.52.0 From b6911009ff5776b7578e7c577f678db083aadf4f Mon Sep 17 00:00:00 2001 From: serversdown Date: Fri, 15 May 2026 06:38:09 +0000 Subject: [PATCH 2/5] scripts: backfill record_type on legacy events imported with hardcoded "Waveform" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Pre-v0.16.1 (commit aac1c8e), every event ingested through read_blastware_file got record_type="Waveform" regardless of actual type because the field was hardcoded. New ingests derive correctly from the AB0T filename scheme (H/W/M/E/C). Existing rows still hold the wrong value. This script walks the events table, derives the correct record_type from each row's blastware_filename, and bulk-updates rows that differ. Idempotent + dry-run by default. Usage: python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db --apply Terra-view's event-detail modal already derives the record_type client-side from the filename for display, so operators see the correct type in the UI even before this backfill runs. This script brings the DB column in line with what the UI is already showing — matters for reporting and any downstream consumer that reads the column directly. --- scripts/backfill_record_type.py | 150 ++++++++++++++++++++++++++++++++ 1 file changed, 150 insertions(+) create mode 100644 scripts/backfill_record_type.py diff --git a/scripts/backfill_record_type.py b/scripts/backfill_record_type.py new file mode 100644 index 0000000..b2d5202 --- /dev/null +++ b/scripts/backfill_record_type.py @@ -0,0 +1,150 @@ +""" +scripts/backfill_record_type.py — fix `record_type` on legacy event +rows whose value was hardcoded to "Waveform" regardless of actual type. + +Why this is needed +────────────────── +Pre-v0.16.1 the BW file importer (`event_file_io.read_blastware_file`) +hardcoded `ev.record_type = "Waveform"` for every imported event. Fixed +in commit aac1c8e — new ingests now derive the type from the Blastware +filename's extension last character (H=Histogram, W=Waveform, M=Manual, +E=Event, C=Combo) per the V10.72+ MiniMate Plus AB0T filename scheme. + +Effect on a server that imported events under the old code: every +events row has `record_type = "Waveform"`, even for histograms, +manuals, etc. Visible in terra-view's event-detail modal under the +"Record Type" field. Terra-view also has a client-side workaround +that derives the type from the filename for display purposes, so +operators see the correct type in the UI even before this backfill. +This script makes the DB column match what the UI is already showing, +which matters for reporting and any downstream consumer that reads +events.record_type directly. + +This script +─────────── +Walks the `events` table and updates each row's `record_type` to the +derived value from its `blastware_filename`. Old S338 firmware files +(3-char extensions ending in `0`) and any unrecognized suffix get +left at the existing value (defaults to "Waveform"). + +Idempotent: re-running after a successful backfill finds zero rows +needing updates and exits cleanly (it always re-derives but only +writes when the value would change). + +Usage +───── + # Dry-run (default): print what would change, don't touch the DB + python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db + + # Apply the backfill + python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db --apply +""" + +from __future__ import annotations + +import argparse +import sqlite3 +import sys +from collections import Counter +from pathlib import Path + + +# Must stay in sync with minimateplus.event_file_io._RECORD_TYPE_BY_EXT_SUFFIX. +_TYPE_FROM_SUFFIX = { + "H": "Histogram", + "W": "Waveform", + "M": "Manual", + "E": "Event", + "C": "Combo", +} + + +def derive_record_type(filename: str | None, default: str = "Waveform") -> str: + """Mirror of minimateplus.event_file_io.derive_record_type_from_filename. + + Vendored here so this script runs without needing the seismo-relay + package on the Python path (useful on prod where you might be + running it via `docker exec` against a container's DB volume). + """ + if not filename: + return default + name = Path(filename).name + if "." not in name: + return default + ext = name.rsplit(".", 1)[1] + if not ext: + return default + return _TYPE_FROM_SUFFIX.get(ext[-1].upper(), default) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--db", required=True, help="Path to seismo_relay.db") + ap.add_argument("--apply", action="store_true", + help="Actually write changes (default is dry-run).") + ap.add_argument("--default", default="Waveform", + help="Fallback record_type when filename doesn't encode one. " + "Default: Waveform (matches the pre-fix bug's behavior).") + args = ap.parse_args() + + db_path = Path(args.db) + if not db_path.exists(): + print(f"ERROR: database not found at {db_path}", file=sys.stderr) + return 1 + + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + + cur.execute(""" + SELECT id, blastware_filename, record_type + FROM events + WHERE blastware_filename IS NOT NULL + AND blastware_filename != '' + """) + rows = cur.fetchall() + total = len(rows) + print(f"Scanning {total:,} event rows…") + print() + + # Tally proposed changes. + transitions: Counter[tuple[str, str]] = Counter() + update_ids: list[tuple[str, str]] = [] + unrecognized = 0 + + for row in rows: + derived = derive_record_type(row["blastware_filename"], default=args.default) + current = row["record_type"] or "" + if derived == current: + continue + transitions[(current, derived)] += 1 + update_ids.append((row["id"], derived)) + + if not update_ids: + print("Nothing to update — all rows already match.") + conn.close() + return 0 + + print(f"{len(update_ids):,} row(s) need updating:") + for (old, new), count in sorted(transitions.items(), key=lambda x: -x[1]): + print(f" {count:>6,} {old!r:14s} → {new!r}") + print() + + if not args.apply: + print("(dry-run — re-run with --apply to write changes)") + conn.close() + return 0 + + print("Applying changes…") + cur.executemany( + "UPDATE events SET record_type = ? WHERE id = ?", + [(new, eid) for eid, new in update_ids], + ) + conn.commit() + print(f"Done. Updated {cur.rowcount:,} row(s).") + conn.close() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) -- 2.52.0 From ae7edac83fe7936bdbb5be01f4191aab282d9a38 Mon Sep 17 00:00:00 2001 From: serversdown Date: Fri, 15 May 2026 23:35:35 +0000 Subject: [PATCH 3/5] chore(doc): bump to 0.16.1 --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fc92a77..e94c1cd 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# seismo-relay `v0.16.0` +# seismo-relay `v0.16.1` A ground-up replacement for **Blastware** — Instantel's aging Windows-only software for managing MiniMate Plus seismographs. -- 2.52.0 From 1fff8179d6639106c8152ce5199eb0e18edf97b2 Mon Sep 17 00:00:00 2001 From: serversdown Date: Sun, 17 May 2026 07:58:13 +0000 Subject: [PATCH 4/5] Add runbook for recovering wedged units and new scripts for device management - Created a comprehensive runbook (`wedged_unit_recovery.md`) detailing the recovery process for units stuck in a call-home loop, including symptoms, recovery steps, and explanations of the failure mode. - Added `blind_stop.sh` script to send stop-monitoring commands in a tight loop for unresponsive devices. - Introduced `rescue_device.sh` script to disable Auto Call Home and erase events from a busy device. - Implemented `slow_drip.sh` script to send stop-monitoring frames at a slow rate to prevent UART overrun. - Developed `spam_stop.sh` script to rapidly send stop-monitoring commands to a device. - Created `watch_unit.sh` script for passive monitoring of device reachability, logging results over time. --- docs/runbooks/wedged_unit_recovery.md | 255 +++++++++ scripts/blind_stop.sh | 100 ++++ scripts/rescue_device.sh | 99 ++++ scripts/slow_drip.sh | 44 ++ scripts/spam_stop.sh | 48 ++ scripts/watch_unit.sh | 58 ++ sfm/database.py | 69 +++ sfm/server.py | 738 +++++++++++++++++++++++++- 8 files changed, 1401 insertions(+), 10 deletions(-) create mode 100644 docs/runbooks/wedged_unit_recovery.md create mode 100755 scripts/blind_stop.sh create mode 100755 scripts/rescue_device.sh create mode 100755 scripts/slow_drip.sh create mode 100755 scripts/spam_stop.sh create mode 100755 scripts/watch_unit.sh diff --git a/docs/runbooks/wedged_unit_recovery.md b/docs/runbooks/wedged_unit_recovery.md new file mode 100644 index 0000000..8d27dd0 --- /dev/null +++ b/docs/runbooks/wedged_unit_recovery.md @@ -0,0 +1,255 @@ +# Runbook — Recovering a wedged unit stuck in a call-home loop + +**Original incident:** BE9558H at `166.246.130.1:9034`, recovered 2026-05-17. + +A field unit with a stuck-triggered geophone (or any hardware fault causing +constant event triggering) will record events back-to-back, and if Auto Call +Home is set to "After Event Recorded" the device will dial the office BW +ACH server in a tight loop. Combined with a Sierra Wireless modem in +bidirectional serial-TCP mode, this makes the unit effectively unreachable +from SFM — every TCP connection we open gets killed when the modem flips +from server-mode to client-mode to honor the device's next AT dial command. + +This runbook describes how to break the loop and recover control. + +--- + +## Symptoms + +- Terra-View / SFM `/device/info` either hangs or fails on `count_events()`. +- `/device/monitor/status` and `/device/rescue` return 502 (protocol timeout + waiting for POLL response) or 503 (TCP connect refused). +- ACEmanager serial log shows repeating + `Connect to IP: Port: ` → `Shutdown TCP socket` cycles + every 30-60 seconds. +- Spam-mode endpoints (`/device/stop_monitoring_spam`) report many + `sent_ok` but the device's monitoring state never changes. +- `slow_drip` reports `[Errno 32] Broken pipe` after sending the preamble + but before completing the drip loop. + +If you see *all* of these, the unit is in this exact failure mode. + +--- + +## Quick reference — how to recover + +You need **ACEmanager access** to the unit's modem. + +### Step 1: stop the modem's mode-flipping + +In ACEmanager → **Serial → Port Configuration**: + +| Field | Set to | +|---|---| +| **Destination Address** | clear (blank) | +| **Destination Port** | `0` | + +Click **Apply**. This removes the modem's auto-dial-out target. The device's +AT dial commands now error back at the modem instead of triggering a +mode-flip, so the modem stays in TCP-server mode permanently and our inbound +TCP sessions stay alive. + +*(Optional belt-and-suspenders: also add the BW server's port to +**Security → Port Filtering - Outbound** as a blocked port, with +Outbound Port Filtering Mode = Blocked Ports.)* + +### Step 2: stop monitoring on the device (slow drip) + +From the SFM host: + +```bash +/home/serversdown/seismo-relay/scripts/slow_drip.sh +``` + +Defaults are 120s duration with a drip every 3s. Watch the response: + +- `duration_s ≈ 120` and `drips_sent ≈ 40` → session held the full duration ✓ +- `bytes_received > 0` → device is responding ✓ (this is the success signal) + +If `duration_s` is small or `send_error: "Broken pipe"`, Step 1 didn't take +hold — re-check ACEmanager, may need to reboot the modem after Apply. + +### Step 3: confirm monitoring stopped + +```bash +curl 'http://localhost:8200/device/monitor/status?host=&tcp_port=&force=true' +# expect: {"is_monitoring": false, ...} +``` + +### Step 4: disable ACH at the device level + erase corrupted events + +Either fire the rescue endpoint: + +```bash +/home/serversdown/seismo-relay/scripts/rescue_device.sh +``` + +Or do the two steps manually: + +```bash +# Disable ACH in the device's compliance config +curl -X POST 'http://localhost:8200/device/call_home?host=&tcp_port=' \ + -H 'Content-Type: application/json' \ + -d '{"auto_call_home_enabled": false}' + +# Erase corrupted event chain +curl -X POST 'http://localhost:8200/device/events/erase?host=&tcp_port=' +``` + +You can also do this via the SFM standalone UI → **Call Home** tab → set +`Enable Auto Call Home` to `Disabled` → **Write to Device**. + +### Step 5: restore modem config (housekeeping) + +Once the device-side ACH is disabled, restore the modem's Destination +Address and Port to the original values (e.g. `50.197.32.92` / `12345`) in +ACEmanager. The modem will resume normal bidirectional behavior, but the +unit won't issue any dial commands until ACH is explicitly re-enabled on +the device. + +### Step 6: do NOT re-enable ACH on this unit until the underlying hardware +fault is repaired. If you do, the call-home loop starts again immediately +and you'll be running this runbook a second time. + +--- + +## Why this works — the failure mode explained + +The Sierra Wireless RV50/RV55 serial port operates in one of two TCP modes +at any moment: + +- **Server mode** — listens on `Device Port` (e.g. 9034), bridges inbound + TCP to the device's serial port. This is what we need to interact with + the device. +- **Client mode** — when the device sends an AT dial command on its serial + TX line, the modem opens an outbound TCP to `Destination Address:Port` + and bridges that to serial. + +A serial port in this configuration is **bidirectional**: the modem flips +between server and client modes on demand. When the device's firmware is +healthy and only dials occasionally, this works fine. + +When the unit is constantly triggering events and ACH is set to "After +Event Recorded", the device sends an AT dial command every few seconds. +Each one causes the modem to: + +1. Drop any active inbound TCP session +2. Flip to client mode +3. Attempt outbound TCP to `Destination Address:Port` +4. Hang for up to a minute waiting for it to succeed/fail +5. Drop back to server mode + +**During the entire hang, no inbound TCP can establish.** Even between +hangs, the modem closes any existing inbound session before flipping. So +any tool that needs more than a few seconds of held TCP (e.g. POLL + +config read + write) gets repeatedly kicked off. + +Clearing `Destination Address` removes step 3-4 from the cycle: the modem +has nowhere to dial, so it doesn't flip modes when it receives an AT dial +command. The serial port effectively becomes server-only, and inbound TCP +sessions can stay open as long as needed. + +**This is a modem-layer issue, not a device firmware issue.** The device +is alive and responsive the whole time — confirmed in the BE9558H +recovery by 990 bytes of S3 responses received over a 120s slow-drip +session once the modem was no longer mode-flipping. + +--- + +## Why simpler approaches don't work + +| Approach | Why it fails | +|---|---| +| Standard `/device/info` | Triggers `count_events()` 1E/1F walk, takes 90s+ and hits corrupted event chain in this scenario | +| `/device/rescue` race loop | Gets 502 (protocol timeout) because the modem closes the TCP before the POLL handshake can complete | +| `/device/stop_monitoring_blind` (single frame) | Even if the bytes leave the wire, the device's protocol parser ignores write commands without a preceding POLL handshake (early-version bug, now fixed by including POLL preamble in blind sends) | +| `/device/stop_monitoring_spam` (sub-second cadence) | Each session is killed by the modem's mode-flip before the device can drain its UART RX buffer; high-rate spam also risks UART FIFO overrun on the device side | +| Outbound port firewall block alone | Stops the outbound TCP from succeeding, but doesn't stop the modem from *trying* and mode-flipping. Reduces but doesn't eliminate the contention. | +| Modem reboot | Temporary — as soon as the device starts triggering again, the loop resumes within seconds | + +The combination of `slow_drip` + cleared `Destination Address` works because: + +1. The modem stops mode-flipping → TCP session stays open for the full + drip duration +2. Slow drip rate → device's UART RX FIFO never overflows even if + firmware is busy with event recording +3. The drip is `SESSION_RESET + STOP_MONITORING` every 3s → many + independent chances for the parser to land one valid frame +4. Once one Stop Monitoring is parsed, event recording halts → firmware + has CPU to spare → subsequent operations are trivially easy + +--- + +## Tooling reference + +All endpoints live in `seismo-relay/sfm/server.py`. All scripts live in +`seismo-relay/scripts/` and default to SFM direct (`http://localhost:8200`), +overridable via `SFM_BASE_URL`. + +### Endpoints added during BE9558H recovery + +| Endpoint | Purpose | +|---|---| +| `GET /device/events/storage_range` | SUB 0x06 — first/last event keys, `is_empty` flag. ~2s, no event walk. | +| `GET /device/events/index` | SUB 0x08 — lifetime event counter (does NOT decrement on erase). ~2s. | +| `POST /device/events/erase` | Full erase sequence 0xA3 → 0x1C → 0x06 → 0xA2. | +| `POST /device/rescue` | Disable ACH + erase in one TCP session. Short timeouts for race-loop usage. | +| `POST /device/stop_monitoring_blind` | Fire-and-forget Stop with full POLL preamble (single attempt). | +| `POST /device/stop_monitoring_spam` | Server-side tight retry loop, sub-second cadence, duration-bounded. | +| `POST /device/stop_monitoring_slow_drip` | One held TCP session, slow trickle of stop frames. **The endpoint that saved BE9558H.** | + +Also changed: default protocol recv timeout dropped from 30s → 10s in +`_build_client`. Added `connect_timeout` knob to same. Cleaned up +unhandled-exception path in `/device/monitor/status` so it returns 502 +instead of 500 on protocol timeouts. + +### Scripts + +| Script | Purpose | +|---|---| +| `scripts/rescue_device.sh` | Race-loop wrapper around `/device/rescue` | +| `scripts/blind_stop.sh` | Race-loop wrapper around `/device/stop_monitoring_blind` | +| `scripts/spam_stop.sh` | Single-call burst hammer (`/device/stop_monitoring_spam`) | +| `scripts/slow_drip.sh` | Single-call held-session drip (`/device/stop_monitoring_slow_drip`) | +| `scripts/watch_unit.sh` | Passive periodic reachability check, logs to file | + +--- + +## Incident log — BE9558H, 2026-05-16/17 + +What was wrong: Long-axis geophone developed an offset, constantly above +trigger threshold → constant event recording → after-event ACH set → +modem dialing office BW server (`50.197.32.92:12345`) every 30-60s. +Local event chain corrupted (`next_boundary 0x100EE exceeds uint16`). + +Diagnostic path: + +1. `/device/info` slow, choked on event walk +2. Built lightweight probe endpoints (`storage_range`, `index`) — useful + but didn't reach the wedged unit +3. Built `/device/rescue` with short timeouts — got 502 (POLL no response) +4. Built `/device/stop_monitoring_blind` — first version was a false + positive (no POLL preamble); fixed by including + `SESSION_RESET+POLL_PROBE+SESSION_RESET+POLL_DATA` in the dump +5. Verified blind stop works on bench unit +6. Built `/device/stop_monitoring_spam` — 420 successful sends over + 5 min, zero behavior change on field unit +7. Inspected ACEmanager logs → saw outbound dial-out attempts every ~30s, + confirmed device was not fully locked up +8. Added outbound port-12345 firewall block → outbound attempts now fail + instantly but contention persisted +9. Built `/device/stop_monitoring_slow_drip` — session died at 3s with + broken pipe (modem closing on us) +10. Looked at full ACEmanager Port Configuration → **found + `Destination Address: 50.197.32.92` configured**, realized every AT + dial command was triggering a modem mode-flip that killed our inbound +11. Cleared Destination Address + Port → slow_drip held 120s, device + responded with 990 bytes, 39 stop commands acked +12. Disabled ACH at device level via `/device/call_home`, erased events + +Final state: device IDLE, memory 958.1 / 960 KB free, ACH disabled at +device level, modem destination cleared (to be restored after physical +service). + +Total time from "i was wondering if its possible to" first attempt to +recovery: ~7 hours of intermittent debugging across one evening. diff --git a/scripts/blind_stop.sh b/scripts/blind_stop.sh new file mode 100755 index 0000000..5ab170b --- /dev/null +++ b/scripts/blind_stop.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units. +# +# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint +# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and +# closes — without ever reading an S3 response. Each TCP-won attempt is +# ~50ms of wire activity instead of the multi-frame handshake the regular +# rescue endpoint does, so windows that are too small for the full rescue +# can still land a stop-monitoring command. +# +# Usage: +# ./blind_stop.sh [tcp_port] +# +# Env: +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy. +# MAX_ATTEMPTS Default: 600 +# SLEEP_S Default: 0 (no backoff — hammer it) +# MAX_TIME_S Default: 15 +# CONNECT_TIMEOUT Default: 5 +# REPEAT Frames per TCP session (default 3 — increases hit rate +# if the device is busy reading its own buffer). +# STOP_ON_OK Default: 1. Set to 0 to keep hammering indefinitely +# even after successful sends (every 503 means the device +# is in *another* session, every 200 means our bytes got +# through — but the device may not have processed them). + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +max_attempts="${MAX_ATTEMPTS:-600}" +sleep_s="${SLEEP_S:-0}" +max_time_s="${MAX_TIME_S:-15}" +connect_timeout="${CONNECT_TIMEOUT:-5}" +repeat="${REPEAT:-3}" +stop_on_ok="${STOP_ON_OK:-1}" + +url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}" + +echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}" +echo "blind_stop: POST ${url}" +echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request" +echo "blind_stop: stop_on_ok=${stop_on_ok}" +echo + +ok_count=0 +busy_count=0 +err_count=0 +started=$(date +%s) + +for ((i=1; i<=max_attempts; i++)); do + printf "[%4d] %s " "$i" "$(date +%H:%M:%S)" + http_code=$(curl -sS -o /tmp/blind_resp.$$ -w "%{http_code}" \ + --max-time "$max_time_s" \ + -X POST "$url" || echo "000") + body=$(cat /tmp/blind_resp.$$ 2>/dev/null || true) + rm -f /tmp/blind_resp.$$ + + case "$http_code" in + 200|201) + ok_count=$((ok_count + 1)) + echo "SENT $body" + if [[ "$stop_on_ok" == "1" ]]; then + elapsed=$(( $(date +%s) - started )) + echo + echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" + echo "blind_stop: NEXT — wait ~10s, then try the full rescue:" + echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}" + exit 0 + fi + ;; + 503) + busy_count=$((busy_count + 1)) + echo "busy (503)" + ;; + 000) + err_count=$((err_count + 1)) + echo "curl error" + ;; + *) + err_count=$((err_count + 1)) + echo "HTTP $http_code $body" | head -c 400 + echo + ;; + esac + [[ "$sleep_s" != "0" ]] && sleep "$sleep_s" +done + +elapsed=$(( $(date +%s) - started )) +echo +echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2 +exit 1 diff --git a/scripts/rescue_device.sh b/scripts/rescue_device.sh new file mode 100755 index 0000000..3466926 --- /dev/null +++ b/scripts/rescue_device.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# Rescue an uncooperative MiniMate that's busy with another ACH session. +# +# Hammers POST /device/rescue in a tight loop with a short timeout. When the +# device is in an ACH session our SYN either gets refused or silently dropped +# (5s connect timeout inside the endpoint) and we retry immediately. When the +# device is between sessions, our TCP wins, the endpoint disables Auto Call +# Home and erases events inside the same session, then returns success. +# +# Usage: +# ./rescue_device.sh [tcp_port] [--no-erase] [--no-disable-ach] +# +# Examples: +# ./rescue_device.sh 166.246.130.1 9034 +# ./rescue_device.sh 166.246.130.1 9034 --no-erase # just silence it +# +# Environment: +# SFM_BASE_URL Defaults to http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy. Direct mode avoids the proxy's +# 60s timeout, which matters for long-running endpoints. +# MAX_ATTEMPTS Cap on retries (default 600 ≈ 30+ min). +# SLEEP_S Backoff between attempts (default 1). +# MAX_TIME_S Per-request timeout (default 60). +# CONNECT_TIMEOUT TCP connect timeout (default 5). +# RECV_TIMEOUT Per-frame S3 recv timeout (default 5). If POLL or any +# subsequent frame doesn't respond within this window, the +# rescue endpoint bails and this script retries. + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +shift 2 2>/dev/null || shift $# 2>/dev/null + +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [--no-erase] [--no-disable-ach]" >&2 + exit 2 +fi + +disable_ach="true" +erase="true" +for arg in "$@"; do + case "$arg" in + --no-erase) erase="false" ;; + --no-disable-ach) disable_ach="false" ;; + *) echo "unknown flag: $arg" >&2; exit 2 ;; + esac +done + +base="${SFM_BASE_URL:-http://localhost:8200}" +max_attempts="${MAX_ATTEMPTS:-600}" +sleep_s="${SLEEP_S:-1}" +max_time_s="${MAX_TIME_S:-60}" +connect_timeout="${CONNECT_TIMEOUT:-5}" +recv_timeout="${RECV_TIMEOUT:-5}" + +url="${base}/device/rescue?host=${host}&tcp_port=${tcp_port}&disable_ach=${disable_ach}&erase=${erase}&connect_timeout=${connect_timeout}&recv_timeout=${recv_timeout}" + +echo "rescue: target ${host}:${tcp_port} disable_ach=${disable_ach} erase=${erase}" +echo "rescue: connect_timeout=${connect_timeout}s recv_timeout=${recv_timeout}s" +echo "rescue: POST ${url}" +echo "rescue: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request" +echo + +started=$(date +%s) +for ((i=1; i<=max_attempts; i++)); do + printf "[%3d] %s " "$i" "$(date +%H:%M:%S)" + http_code=$(curl -sS -o /tmp/rescue_resp.$$ -w "%{http_code}" \ + --max-time "$max_time_s" \ + -X POST "$url" || echo "000") + body=$(cat /tmp/rescue_resp.$$ 2>/dev/null || true) + rm -f /tmp/rescue_resp.$$ + + case "$http_code" in + 200|201) + elapsed=$(( $(date +%s) - started )) + echo "OK (${elapsed}s total)" + echo "$body" + exit 0 + ;; + 503) + # Connection refused / timeout — device busy in another session. Retry fast. + echo "busy (503)" + ;; + 000) + echo "curl error (network)" + ;; + *) + echo "HTTP $http_code" + echo " $body" | head -c 400 + echo + ;; + esac + sleep "$sleep_s" +done + +echo "rescue: gave up after ${max_attempts} attempts" >&2 +exit 1 diff --git a/scripts/slow_drip.sh b/scripts/slow_drip.sh new file mode 100755 index 0000000..9c942d1 --- /dev/null +++ b/scripts/slow_drip.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Hold a single TCP session open and drip stop-monitoring frames at a slow +# rate, so the device's UART RX FIFO has time to drain between sends. +# +# Use when high-rate spam isn't landing — typically because the device's +# firmware is too busy to drain its serial buffer fast enough and bytes +# are being lost to UART overrun. +# +# Usage: +# ./slow_drip.sh [tcp_port] [duration_s] +# +# Env: +# DURATION Default: 120 (seconds; arg 3 overrides). Clamped 1..600. +# INTERVAL Seconds between drip sends (default 3). Lower = more +# aggressive, more risk of FIFO overrun. Higher = safer +# but fewer total drips per duration. +# CONNECT_TIMEOUT Default: 5 +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +duration="${3:-${DURATION:-120}}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [duration_s]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +interval="${INTERVAL:-3}" +connect_timeout="${CONNECT_TIMEOUT:-5}" + +url="${base}/device/stop_monitoring_slow_drip?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&interval_s=${interval}&connect_timeout=${connect_timeout}" + +echo "slow_drip: target ${host}:${tcp_port} duration=${duration}s interval=${interval}s connect_timeout=${connect_timeout}s" +echo "slow_drip: POST ${url}" +echo + +# Give curl enough slack to wait out the duration plus a buffer +max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 30 }') + +curl -sS --max-time "$max_time" -X POST "$url" +echo diff --git a/scripts/spam_stop.sh b/scripts/spam_stop.sh new file mode 100755 index 0000000..dda864d --- /dev/null +++ b/scripts/spam_stop.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Hammer a device with blind stop-monitoring sessions as fast as possible. +# Single HTTP call kicks off the burst inside SFM (no per-attempt HTTP +# overhead). Default: 10 seconds, ~500 ms per attempt = ~20 attempts/sec. +# +# Usage: +# ./spam_stop.sh [tcp_port] [duration_s] +# +# Examples: +# ./spam_stop.sh 166.246.130.1 # 10s burst +# ./spam_stop.sh 166.246.130.1 9034 30 # 30s burst +# DURATION=60 CONNECT_TIMEOUT=0.2 ./spam_stop.sh 166.246.130.1 +# +# Env: +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy — but note the proxy has a 60s +# timeout, so long bursts need direct mode. +# DURATION Default: 10 (seconds; arg 3 overrides) +# CONNECT_TIMEOUT Default: 0.5 (seconds) +# REPEAT Default: 3 (stop frames per TCP session) + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +duration="${3:-${DURATION:-10}}" + +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [duration_s]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +connect_timeout="${CONNECT_TIMEOUT:-0.5}" +repeat="${REPEAT:-3}" + +url="${base}/device/stop_monitoring_spam?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&connect_timeout=${connect_timeout}&repeat=${repeat}" + +echo "spam_stop: target ${host}:${tcp_port} duration=${duration}s connect_timeout=${connect_timeout}s repeat=${repeat}" +echo "spam_stop: POST ${url}" +echo + +# Give curl enough slack to wait out the duration plus a buffer +max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 10 }') + +curl -sS --max-time "$max_time" -X POST "$url" +echo diff --git a/scripts/watch_unit.sh b/scripts/watch_unit.sh new file mode 100755 index 0000000..2114936 --- /dev/null +++ b/scripts/watch_unit.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Passive monitor for a misbehaving unit. Every INTERVAL seconds, attempts +# a single short TCP probe + storage_range read and logs the result. Designed +# to run unattended for hours/days and tell you when the unit comes back. +# +# Usage: +# ./watch_unit.sh [tcp_port] +# +# Env: +# INTERVAL Seconds between checks (default 300 = 5 min) +# LOG_FILE Append results here (default /tmp/watch_.log) +# SFM_BASE_URL Default: http://localhost:8200 + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port]" >&2 + exit 2 +fi + +interval="${INTERVAL:-300}" +log_file="${LOG_FILE:-/tmp/watch_${host}.log}" +base="${SFM_BASE_URL:-http://localhost:8200}" + +url="${base}/device/events/storage_range?host=${host}&tcp_port=${tcp_port}" + +echo "watch_unit: target ${host}:${tcp_port} interval=${interval}s log=${log_file}" +echo "watch_unit: Ctrl-C to stop" + +while true; do + ts=$(date '+%Y-%m-%d %H:%M:%S') + http_code=$(curl -sS -o /tmp/watch_resp.$$ -w "%{http_code}" \ + --max-time 20 "$url" || echo "000") + body=$(cat /tmp/watch_resp.$$ 2>/dev/null || true) + rm -f /tmp/watch_resp.$$ + + case "$http_code" in + 200|201) + # Strip the raw_hex for readability + summary=$(echo "$body" | sed 's/"raw_hex":"[^"]*",*//; s/,*$//' | head -c 200) + echo "$ts REACHABLE $summary" | tee -a "$log_file" + ;; + 502|503) + err=$(echo "$body" | head -c 150) + echo "$ts ERROR_$http_code $err" | tee -a "$log_file" + ;; + 000) + echo "$ts CURL_FAIL (network/timeout)" | tee -a "$log_file" + ;; + *) + echo "$ts HTTP_$http_code $(echo "$body" | head -c 150)" | tee -a "$log_file" + ;; + esac + + sleep "$interval" +done diff --git a/sfm/database.py b/sfm/database.py index 88497d8..8228d6c 100644 --- a/sfm/database.py +++ b/sfm/database.py @@ -491,6 +491,75 @@ class SeismoDb: ) return cur.rowcount > 0 + def delete_event(self, event_id: str) -> Optional[dict]: + """ + Hard-delete one event row by id. Returns the deleted row (so the + caller can clean up any on-disk files referenced by it) or None + if no row matched. + """ + with self._connect() as conn: + row = conn.execute( + "SELECT * FROM events WHERE id = ?", (event_id,), + ).fetchone() + if row is None: + return None + conn.execute("DELETE FROM events WHERE id = ?", (event_id,)) + return dict(row) + + def delete_events_bulk( + self, + serial: Optional[str] = None, + from_dt: Optional[datetime.datetime] = None, + to_dt: Optional[datetime.datetime] = None, + false_trigger: Optional[bool] = None, + ids: Optional[list[str]] = None, + ) -> list[dict]: + """ + Hard-delete events matching the given filters. Returns the list + of deleted row dicts. Refuses to delete with no filters at all + (would wipe the whole table) — raises ValueError. + + Filter semantics match query_events: serial / from_dt / to_dt / + false_trigger combine with AND. `ids` is an additional inclusion + list (event_id IN (...)); if supplied alongside other filters, + only rows matching all conditions are deleted. + """ + clauses: list[str] = [] + params: list = [] + + if serial: + clauses.append("serial = ?") + params.append(serial) + if from_dt: + clauses.append("timestamp >= ?") + params.append(from_dt.isoformat()) + if to_dt: + clauses.append("timestamp <= ?") + params.append(to_dt.isoformat()) + if false_trigger is not None: + clauses.append("false_trigger = ?") + params.append(1 if false_trigger else 0) + if ids: + placeholders = ",".join("?" * len(ids)) + clauses.append(f"id IN ({placeholders})") + params.extend(ids) + + if not clauses: + raise ValueError( + "delete_events_bulk refuses to delete with no filters " + "(would wipe the entire events table)" + ) + + where = "WHERE " + " AND ".join(clauses) + + with self._connect() as conn: + rows = conn.execute( + f"SELECT * FROM events {where}", params, + ).fetchall() + if rows: + conn.execute(f"DELETE FROM events {where}", params) + return [dict(r) for r in rows] + def update_event_review(self, event_id: str, review: dict) -> bool: """ Sync derived index columns from a sidecar's `review` block. diff --git a/sfm/server.py b/sfm/server.py index 9005652..95937c6 100644 --- a/sfm/server.py +++ b/sfm/server.py @@ -36,6 +36,7 @@ from __future__ import annotations import datetime import logging +import socket import sys import tempfile import threading @@ -63,7 +64,9 @@ from minimateplus.protocol import ProtocolError from minimateplus.models import CallHomeConfig, ComplianceConfig, DeviceInfo, Event, PeakValues, ProjectInfo, Timestamp from minimateplus.transport import TcpTransport, DEFAULT_TCP_PORT from minimateplus.blastware_file import write_blastware_file, blastware_filename -from minimateplus.client import _decode_a5_metadata_into, _decode_a5_waveform +from minimateplus.client import _decode_a5_metadata_into, _decode_a5_waveform, _decode_event_count +from minimateplus.framing import build_bw_write_frame, SESSION_RESET, POLL_PROBE, POLL_DATA +from minimateplus.protocol import SUB_STOP_MONITORING from sfm import event_hdf5 from sfm.cache import SFMCache, get_cache from sfm.database import SeismoDb @@ -268,7 +271,8 @@ def _build_client( baud: int, host: Optional[str], tcp_port: int, - timeout: float = 30.0, + timeout: float = 10.0, + connect_timeout: Optional[float] = None, ) -> MiniMateClient: """ Return a MiniMateClient configured for either serial or TCP transport. @@ -276,12 +280,24 @@ def _build_client( TCP takes priority if *host* is supplied; otherwise *port* (serial) is used. Raises HTTPException(422) if neither is provided. + Default *timeout* is 10s — the device usually responds in well under a + second over cellular; 10s leaves comfortable headroom for retransmits + while still failing reasonably fast when a unit is wedged. + Use timeout=120.0 (or higher) for endpoints that perform a full 5A waveform download — a 70-second event at 1024 sps takes 2-3 minutes to transfer over cellular and each individual recv must complete within the timeout window. + + *connect_timeout* (TCP only) overrides the TcpTransport default (10s) for + the initial TCP SYN/handshake. Use a small value (e.g. 5s) in rescue/race + scenarios where the device is busy in another session and you want to + fail fast and retry quickly. """ if host: - transport = TcpTransport(host, port=tcp_port) + if connect_timeout is not None: + transport = TcpTransport(host, port=tcp_port, connect_timeout=connect_timeout) + else: + transport = TcpTransport(host, port=tcp_port) log.debug("TCP transport: %s:%d timeout=%.0fs", host, tcp_port, timeout) return MiniMateClient(transport=transport, timeout=timeout) elif port: @@ -1095,13 +1111,23 @@ def device_monitor_status( cached["_cached"] = True return cached - with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: - try: - client.poll() - except Exception as exc: - log.warning("monitor status poll retry: %s", exc) - client.poll() - status = client.get_monitor_status() + try: + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("monitor status poll retry: %s", exc) + client.poll() + status = client.get_monitor_status() + except HTTPException: + raise + except ProtocolError as exc: + # Includes minimateplus.protocol.TimeoutError ("device unresponsive"). + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc result: dict = {"is_monitoring": status.is_monitoring} if status.battery_v is not None: @@ -1117,6 +1143,529 @@ def device_monitor_status( return result +@app.get("/device/events/storage_range") +def device_events_storage_range( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Read the device's event storage range (SUB 0x06) — first and last + stored event keys. POLL handshake + one read; no connect(), no + config reads, no event walk. Completes in ~2 seconds. + + Useful for checking whether the device has any stored events + without invoking the slow count_events() 1E/1F chain. Both keys = + `01110000` means the device is empty. + """ + log.info("GET /device/events/storage_range host=%s tcp_port=%s", host, tcp_port) + try: + def _do(): + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("storage_range poll retry: %s", exc) + client.poll() + proto = client._require_proto() + return proto.read_event_storage_range() + rng = _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + data = bytes(rng.data) + result: dict = {"raw_len": len(data), "raw_hex": data.hex()} + if len(data) >= 8: + first_key = data[-8:-4].hex() + last_key = data[-4:].hex() + result["first_key"] = first_key + result["last_key"] = last_key + result["is_empty"] = (first_key == "01110000" and last_key == "01110000") + return result + + +@app.get("/device/events/index") +def device_events_index( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Read the device's event index (SUB 0x08) — returns the lifetime + event counter at data[10:12] (uint16 BE). POLL handshake + one + read; no connect(), no config reads, no event walk. ~2 seconds. + + Note: this is a LIFETIME counter (events ever recorded) — it does + NOT decrement when events are erased. After an erase, the device + counter resets to 0 only on the next recorded event. For "are + there stored events right now?" use /device/events/storage_range + instead. + """ + log.info("GET /device/events/index host=%s tcp_port=%s", host, tcp_port) + try: + def _do(): + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("event_index poll retry: %s", exc) + client.poll() + proto = client._require_proto() + return proto.read_event_index() + idx_raw = _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + raw = bytes(idx_raw) + result: dict = {"raw_len": len(raw), "raw_hex": raw.hex()} + try: + result["lifetime_count"] = _decode_event_count(raw) + except Exception as exc: + result["decode_error"] = str(exc) + return result + + +@app.post("/device/events/erase") +def device_events_erase( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Erase ALL stored events from the device memory. + + Sequence: SUB 0xA3 → 0x1C → 0x06 → 0xA2 (confirmed 2026-04-11). + After this call the unit's event memory is empty and event keys reset + to 0x01110000. The device returns to its normal operating state + automatically — no restart-monitoring call is needed. + + Note: this endpoint does NOT touch the ACH server's `ach_state.json`. + If a call-home subsequently lands on the ACH server, its post-erase + detection logic (max(device_keys) vs max_downloaded_key) handles the + key-counter rollback. + """ + log.info("POST /device/events/erase port=%s host=%s tcp_port=%s", port, host, tcp_port) + + try: + def _do(): + with _build_client(port, baud, host, tcp_port) as client: + client.connect() + client.delete_all_events() + _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + conn_key = SFMCache.make_conn_key(host, tcp_port, port, baud) + cleared = get_cache().clear_device(conn_key) + return { + "status": "ok", + "message": "Device event memory cleared", + "cache_cleared": cleared, + } + + +@app.post("/device/stop_monitoring_blind") +def device_stop_monitoring_blind( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + connect_timeout: float = Query(5.0, description="TCP connect timeout in seconds (default 5)"), + repeat: int = Query(3, description="How many times to send the frame within one TCP session (default 3)"), +) -> dict: + """ + Fire-and-forget Stop Monitoring (SUB 0x97). TCP-only. + + Opens a TCP session, dumps the FULL handshake the device's protocol + state machine expects — `SESSION_RESET + POLL_PROBE + SESSION_RESET + + POLL_DATA` — and then N back-to-back copies of the stop-monitoring + frame. Does NOT read any S3 response. Succeeds as long as the bytes + left the socket. + + The POLL handshake bytes are required: monitoring units ignore command + frames received without a preceding POLL exchange. Sending the POLL + bytes "blind" (without reading the responses) still works because the + device processes inbound bytes in order regardless of whether we drain + its outbound buffer. + + Idempotent: the device processes extra copies of SUB 0x97 the same as + one (already-stopped is a no-op). + + Returns the number of bytes sent. A 503 means the TCP connect failed + (device busy in another session — caller should retry). + """ + log.info( + "POST /device/stop_monitoring_blind host=%s tcp_port=%s connect_timeout=%.1fs repeat=%d", + host, tcp_port, connect_timeout, repeat, + ) + if repeat < 1: + repeat = 1 + + frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + payload = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + + (frame * repeat) + ) + t0 = time.monotonic() + + transport = TcpTransport(host, port=tcp_port, connect_timeout=connect_timeout) + try: + transport.connect() + except OSError as exc: + raise HTTPException(status_code=503, detail=f"Connection error: {exc}") from exc + + try: + transport.write(payload) + except OSError as exc: + transport.disconnect() + raise HTTPException(status_code=502, detail=f"Send error: {exc}") from exc + finally: + transport.disconnect() + + return { + "status": "sent", + "bytes_sent": len(payload), + "frame_size": len(frame), + "repeat": repeat, + "elapsed_s": round(time.monotonic() - t0, 3), + } + + +@app.post("/device/stop_monitoring_slow_drip") +def device_stop_monitoring_slow_drip( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + duration_s: float = Query(120.0, description="Total time to hold the session open (seconds)"), + interval_s: float = Query(3.0, description="Seconds between drip sends"), + connect_timeout: float = Query(5.0, description="TCP connect timeout"), +) -> dict: + """ + Hold a single TCP session open for *duration_s* seconds and drip + stop-monitoring frames into the device at a slow rate so its UART + RX FIFO has time to drain between sends. + + Sequence: + 1. Open TCP session. + 2. Send the wake preamble: SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA (so the device's protocol parser + is primed for a write command). + 3. Wait interval_s for the device to drain. + 4. Drip-send (SESSION_RESET + stop_monitoring_frame) every + interval_s until duration_s elapses. + 5. Opportunistically drain any bytes the device sends back (so + the modem's TX queue doesn't fill up). Successful drains are + counted in `bytes_received` — non-zero strongly suggests the + device has started responding to us. + 6. Close. + + Designed for units whose firmware is too busy with event-recording + to keep up with high-rate spam. Heavy spam overruns the UART FIFO; + slow drip stays under it. + + Compared to spam mode: ~40× fewer bytes/sec on the wire, but each + byte has a much higher chance of actually being parsed. + """ + log.info( + "POST /device/stop_monitoring_slow_drip host=%s tcp_port=%s duration=%.1fs interval=%.2fs connect_timeout=%.1fs", + host, tcp_port, duration_s, interval_s, connect_timeout, + ) + duration_s = max(1.0, min(duration_s, 600.0)) # clamp 1s..10min + interval_s = max(0.1, min(interval_s, 30.0)) + connect_timeout = max(0.1, connect_timeout) + + stop_frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + preamble = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + ) + + t0 = time.monotonic() + drips_sent = 0 + bytes_sent = 0 + bytes_received = 0 + + try: + sock = socket.create_connection((host, tcp_port), timeout=connect_timeout) + except OSError as exc: + raise HTTPException(status_code=503, detail=f"Connection error: {exc}") from exc + + # Short read timeout so opportunistic drains don't block. + sock.settimeout(0.1) + + try: + # Initial wake preamble. + try: + sock.sendall(preamble) + bytes_sent += len(preamble) + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Preamble send failed: {exc}") from exc + + # Initial settle. + time.sleep(interval_s) + + # Try a non-blocking drain of any response to the wake. + try: + data = sock.recv(4096) + if data: + bytes_received += len(data) + log.info("slow_drip: device responded to wake preamble (%d bytes)", len(data)) + except socket.timeout: + pass + except OSError: + pass + + deadline = t0 + duration_s + drip = SESSION_RESET + stop_frame # 2 + 21 = 23 bytes per drip + send_error: Optional[str] = None + + while time.monotonic() < deadline: + try: + sock.sendall(drip) + bytes_sent += len(drip) + drips_sent += 1 + except OSError as exc: + send_error = f"{exc}" + log.warning("slow_drip: send failed after %d drips: %s", drips_sent, exc) + break + + # Drain any inbound bytes; ignore timeouts. + try: + data = sock.recv(4096) + if data: + bytes_received += len(data) + except socket.timeout: + pass + except OSError: + pass + + # Sleep the interval, but don't oversleep past the deadline. + remaining = deadline - time.monotonic() + if remaining <= 0: + break + time.sleep(min(interval_s, remaining)) + finally: + try: + sock.shutdown(socket.SHUT_RDWR) + except OSError: + pass + try: + sock.close() + except OSError: + pass + + elapsed = time.monotonic() - t0 + log.info( + "slow_drip done — drips=%d bytes_sent=%d bytes_received=%d in %.1fs", + drips_sent, bytes_sent, bytes_received, elapsed, + ) + return { + "status": "done", + "duration_s": round(elapsed, 2), + "drips_sent": drips_sent, + "bytes_sent": bytes_sent, + "bytes_received": bytes_received, + "preamble_bytes": len(preamble), + "drip_bytes": len(drip), + "send_error": send_error, + } + + +@app.post("/device/stop_monitoring_spam") +def device_stop_monitoring_spam( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + duration_s: float = Query(10.0, description="How long to hammer the device for (seconds)"), + connect_timeout: float = Query(0.5, description="Per-attempt TCP connect timeout (default 0.5s)"), + repeat: int = Query(3, description="Stop frames per TCP session (default 3)"), +) -> dict: + """ + Hammer the device with blind stop-monitoring sessions as fast as + possible for `duration_s` seconds. Each attempt: open TCP → write + SESSION_RESET + POLL handshake + STOP frames × repeat → close. No + response is read. + + Designed for units that are aggressively calling home — short + connect_timeout (default 500 ms) means every failed attempt loses + only that much time before retrying, so we can fit several attempts + per second even when the modem is mostly busy with its own outbound + sessions. + + Single HTTP call kicks off the whole burst; counters are returned + when it finishes. No streaming; if you want live progress, watch + SFM logs. + """ + log.info( + "POST /device/stop_monitoring_spam host=%s tcp_port=%s duration=%.1fs connect_timeout=%.3fs repeat=%d", + host, tcp_port, duration_s, connect_timeout, repeat, + ) + if repeat < 1: + repeat = 1 + duration_s = max(0.1, min(duration_s, 300.0)) # clamp 0.1s..5min + connect_timeout = max(0.05, connect_timeout) + + frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + payload = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + + (frame * repeat) + ) + + t0 = time.monotonic() + deadline = t0 + duration_s + sent_ok = 0 + connect_failed = 0 + write_failed = 0 + + while time.monotonic() < deadline: + try: + sock = socket.create_connection((host, tcp_port), timeout=connect_timeout) + except OSError: + connect_failed += 1 + continue + try: + sock.sendall(payload) + sent_ok += 1 + except OSError: + write_failed += 1 + finally: + try: + sock.shutdown(socket.SHUT_RDWR) + except OSError: + pass + try: + sock.close() + except OSError: + pass + + elapsed = time.monotonic() - t0 + total = sent_ok + connect_failed + write_failed + log.info( + "stop_monitoring_spam done — sent=%d connect_failed=%d write_failed=%d in %.2fs", + sent_ok, connect_failed, write_failed, elapsed, + ) + return { + "status": "done", + "duration_s": round(elapsed, 2), + "sent_ok": sent_ok, + "connect_failed": connect_failed, + "write_failed": write_failed, + "total_attempts": total, + "rate_attempts_per_s": round(total / elapsed, 1) if elapsed > 0 else 0, + "payload_bytes": len(payload), + } + + +@app.post("/device/rescue") +def device_rescue( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + connect_timeout: float = Query(5.0, description="TCP connect timeout in seconds (default 5)"), + recv_timeout: float = Query(5.0, description="Per-frame S3 recv timeout in seconds (default 5)"), + disable_ach: bool = Query(True, description="Disable Auto Call Home on the device before erasing"), + erase: bool = Query(True, description="Erase all stored events after disabling ACH"), +) -> dict: + """ + Rescue an uncooperative unit by squeezing all maintenance work into a + single TCP session. + + Designed for devices that are actively calling home to a separate ACH + server (BW or otherwise). While we hold this TCP session open the + modem cannot accept an inbound ACH call, so the order matters: + + 1. Short-timeout TCP connect (fails fast if the device is busy in + another session — the caller should retry in a tight loop). + 2. POLL handshake. + 3. (optional) Write call_home config with auto_call_home_enabled=false + so the device stops calling out even after we drop the session. + 4. (optional) Erase all stored events (0xA3 → 0x1C → 0x06 → 0xA2). + 5. Close the TCP session. + + Both `disable_ach` and `erase` default to true. Pass `?erase=false` if + you only want to silence the unit without wiping its events. + + Caller pattern (bash): + + until curl -sS --max-time 30 -X POST \\ + "http://localhost:8001/api/sfm/device/rescue?host=$IP&tcp_port=$P"; do + sleep 1 + done + """ + log.info( + "POST /device/rescue host=%s tcp_port=%s connect_timeout=%.1fs recv_timeout=%.1fs disable_ach=%s erase=%s", + host, tcp_port, connect_timeout, recv_timeout, disable_ach, erase, + ) + + steps: list[dict] = [] + t0 = time.monotonic() + + try: + with _build_client( + port, baud, host, tcp_port, + timeout=recv_timeout, + connect_timeout=connect_timeout, + ) as client: + steps.append({"step": "tcp_connect", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + try: + client.poll() + except Exception as exc: + log.warning("rescue: poll retry: %s", exc) + client.poll() + steps.append({"step": "poll", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + if disable_ach: + client.set_call_home_config(auto_call_home_enabled=False) + steps.append({"step": "disable_ach", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + if erase: + client.delete_all_events() + steps.append({"step": "erase", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + except ProtocolError as exc: + steps.append({"step": "error", "ok": False, "detail": f"protocol: {exc}"}) + raise HTTPException(status_code=502, detail={"message": f"Protocol error: {exc}", "steps": steps}) from exc + except OSError as exc: + steps.append({"step": "error", "ok": False, "detail": f"socket: {exc}"}) + # Connection refused / timed out → device busy in another session. Caller should retry. + raise HTTPException(status_code=503, detail={"message": f"Connection error: {exc}", "steps": steps}) from exc + except Exception as exc: + steps.append({"step": "error", "ok": False, "detail": str(exc)}) + raise HTTPException(status_code=500, detail={"message": f"Device error: {exc}", "steps": steps}) from exc + + conn_key = SFMCache.make_conn_key(host, tcp_port, port, baud) + cleared = get_cache().clear_device(conn_key) + return { + "status": "ok", + "elapsed_s": round(time.monotonic() - t0, 2), + "disable_ach": disable_ach, + "erase": erase, + "steps": steps, + "cache_cleared": cleared, + } + + @app.post("/device/monitor/start") def device_monitor_start( port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), @@ -1403,6 +1952,175 @@ def db_set_false_trigger( return {"status": "ok", "event_id": event_id, "false_trigger": value} +def _cleanup_event_files(row: dict) -> dict: + """ + Best-effort cleanup of on-disk waveform / sidecar / pickle / hdf5 files + associated with a deleted event row. Returns a dict of {kind: bool} for + what was actually removed (true) vs not found / failed (false). + """ + serial = row.get("serial") + bw_name = row.get("blastware_filename") + a5_name = row.get("a5_pickle_filename") + sc_name = row.get("sidecar_filename") + removed: dict = {} + if not serial: + return removed + store = _get_store() + # blastware_filename is the "base" — other files derive their paths from it + # via WaveformStore helpers. Sidecar and a5 may also be stored under their + # own column values if they ever diverged historically. + base_name = bw_name or a5_name or sc_name + if base_name: + bw_path, a5_path = store.paths_for(serial, base_name) + sc_path = store.sidecar_path_for(serial, base_name) + h5_path = store.hdf5_path_for(serial, base_name) + for kind, p in [("blastware", bw_path), ("a5_pickle", a5_path), + ("sidecar", sc_path), ("hdf5", h5_path)]: + try: + if p.exists(): + p.unlink() + removed[kind] = True + except OSError as exc: + log.warning("file cleanup failed for %s (%s): %s", p, kind, exc) + removed[kind] = False + return removed + + +@app.delete("/db/events/{event_id}") +def db_delete_event(event_id: str) -> dict: + """ + Hard-delete a single event from the SFM events table and remove any + associated on-disk waveform/sidecar/pickle/hdf5 files. + + Returns 404 if the event_id is not found. + """ + log.info("DELETE /db/events/%s", event_id) + deleted = _get_db().delete_event(event_id) + if deleted is None: + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + files_removed = _cleanup_event_files(deleted) + return { + "status": "ok", + "event_id": event_id, + "files_removed": files_removed, + } + + +class BulkDeleteBody(BaseModel): + """Body for POST /db/events/delete_bulk.""" + serial: Optional[str] = None + from_dt: Optional[str] = None # ISO-8601 + to_dt: Optional[str] = None # ISO-8601 + false_trigger: Optional[bool] = None + ids: Optional[list[str]] = None + confirm: bool = False + # Safety: when no `ids` are supplied, require this many max rows to + # actually be deleted; if the matched count exceeds it, the endpoint + # returns a dry-run-style summary instead. Pass None to disable. + max_rows: Optional[int] = 10000 + + +@app.post("/db/events/delete_bulk") +def db_delete_events_bulk(body: BulkDeleteBody) -> dict: + """ + Hard-delete multiple events at once, by filter and/or by id list. + + Filters (`serial`, `from_dt`, `to_dt`, `false_trigger`) combine with AND, + matching the same semantics as `GET /db/events`. `ids` is an additional + inclusion list. At least one filter or non-empty `ids` MUST be supplied + — refusing to wipe the whole table. + + Safety knobs: + - `confirm` MUST be `true` to actually delete. When false (default), + returns the match count without deleting (dry-run). + - `max_rows` (default 10,000) caps how many rows can be deleted in one + call by-filter; if the match count exceeds it, the endpoint returns + a count summary without deleting. Ignored when only `ids` is used. + + Returns: + { + "status": "ok" | "dry_run" | "too_many", + "matched": , + "deleted": , # 0 unless status == "ok" + "files_removed": , # total file unlink successes + "sample_serials": [...], # up to 5 distinct serials touched + } + """ + log.info( + "POST /db/events/delete_bulk serial=%s from=%s to=%s ft=%s ids=%d confirm=%s max=%s", + body.serial, body.from_dt, body.to_dt, body.false_trigger, + len(body.ids or []), body.confirm, body.max_rows, + ) + + from_parsed = datetime.datetime.fromisoformat(body.from_dt) if body.from_dt else None + to_parsed = datetime.datetime.fromisoformat(body.to_dt) if body.to_dt else None + + db = _get_db() + + # Dry-run path: count matches without deleting. + rows = db.query_events( + serial=body.serial, + from_dt=from_parsed, + to_dt=to_parsed, + false_trigger=body.false_trigger, + limit=1_000_000, # we want a true count, not a page + offset=0, + ) + if body.ids: + id_set = set(body.ids) + rows = [r for r in rows if r["id"] in id_set] + matched = len(rows) + sample_serials = sorted({r.get("serial") for r in rows[:50] if r.get("serial")})[:5] + + if not body.confirm: + return { + "status": "dry_run", + "matched": matched, + "deleted": 0, + "files_removed": 0, + "sample_serials": sample_serials, + "hint": "Set confirm=true in the request body to actually delete.", + } + + if body.max_rows is not None and not body.ids and matched > body.max_rows: + return { + "status": "too_many", + "matched": matched, + "deleted": 0, + "files_removed": 0, + "sample_serials": sample_serials, + "hint": ( + f"Matched {matched} > max_rows={body.max_rows}. Either raise " + f"max_rows in the body, narrow the filter, or supply an " + f"explicit `ids` list." + ), + } + + try: + deleted_rows = db.delete_events_bulk( + serial=body.serial, + from_dt=from_parsed, + to_dt=to_parsed, + false_trigger=body.false_trigger, + ids=body.ids, + ) + except ValueError as exc: + raise HTTPException(status_code=422, detail=str(exc)) from exc + + files_removed = 0 + for row in deleted_rows: + result = _cleanup_event_files(row) + files_removed += sum(1 for ok in result.values() if ok) + + return { + "status": "ok", + "matched": matched, + "deleted": len(deleted_rows), + "files_removed": files_removed, + "sample_serials": sample_serials, + } + + # ── /db/events/{id} — waveform file accessors ───────────────────────────────── # # These endpoints serve files from the persistent WaveformStore, so a Blastware -- 2.52.0 From 57287a2ade186c12f17e82b528820fe4d90f797f Mon Sep 17 00:00:00 2001 From: serversdown Date: Sun, 17 May 2026 23:07:12 +0000 Subject: [PATCH 5/5] chore: update to 0.17.0 --- CHANGELOG.md | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++ CLAUDE.md | 2 +- README.md | 2 +- pyproject.toml | 2 +- sfm/server.py | 2 +- 5 files changed, 60 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0bdfdf4..a1c60e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,62 @@ All notable changes to seismo-relay are documented here. --- +## v0.17.0 — 2026-05-17 + +The "field rescue + DB management" release. Hardened against units that are stuck in a runaway call-home loop, and added an operator-facing path for purging bogus events that those same units dump into the DB before recovery. All work in this release was driven by the BE9558H incident (full incident log + recovery procedure at `docs/runbooks/wedged_unit_recovery.md`). + +### Added — wedged-unit recovery toolkit + +A toolkit for breaking the call-home loop on a misbehaving unit whose firmware is too busy to keep up with normal request/response handshakes. Tested in production against BE9558H (16 May 2026) — a unit with a stuck-triggered Long-axis geophone that had been call-homing the office BW ACH server every 30 seconds for hours. Endpoints layered from "single attempt" to "siege mode" to suit different contention levels: + +- **`GET /device/events/storage_range`** — SUB 0x06 probe. POLL + one read; ~2s. Returns first/last event keys and an `is_empty` flag. Use to triage whether a unit has stored events without invoking the slow `count_events()` 1E/1F chain (which choked on BE9558H's corrupted event chain). +- **`GET /device/events/index`** — SUB 0x08 probe. POLL + one read; ~2s. Returns the lifetime event counter (does NOT decrement on erase — use `storage_range` for "right now" state). +- **`POST /device/events/erase`** — full erase sequence `0xA3 → 0x1C → 0x06 → 0xA2` (confirmed 2026-04-11, see the protocol reference). Resets event keys to `0x01110000`. Caller's responsibility to disable ACH first if the underlying trigger condition will re-fill the buffer. +- **`POST /device/rescue`** — one TCP session, short connect+recv timeouts: POLL → disable ACH (compliance config write) → erase events → close. Designed for race-loop usage when the device is busy in another session. 503 on connect-refused, 502 on protocol failure, 200 on full sequence success. +- **`POST /device/stop_monitoring_blind`** — fire-and-forget Stop Monitoring (SUB 0x97), TCP-only. Dumps `SESSION_RESET + POLL_PROBE + SESSION_RESET + POLL_DATA + 0x97 × repeat` and closes without reading any S3 response. The full POLL preamble is required — write commands without it are silently ignored by the device's protocol parser (false-positive surface area that bit the first version of this endpoint). Use when the device's firmware can't keep up with full request/response but might process inbound bytes at its own pace. +- **`POST /device/stop_monitoring_spam`** — server-side hammer loop, duration-bounded. Open TCP → write the same blind payload → close → repeat as fast as possible until `duration_s` elapses. Configurable `connect_timeout` (default 500ms) and `repeat` (frames per session). Reports `sent_ok`, `connect_failed`, `write_failed`, `rate_attempts_per_s`. Clamped to 5min duration. +- **`POST /device/stop_monitoring_slow_drip`** — opposite of spam. Open ONE TCP session, drip the wake handshake + stop frames at `interval_s` (default 3s) for `duration_s` (default 120s, max 10min). Each drip is ~23 bytes — well under any UART FIFO size. Opportunistically drains any inbound bytes the device sends back; `bytes_received > 0` in the response strongly suggests the device has started talking and the session is healthy. **This is the endpoint that saved BE9558H.** Spam mode had been overrunning the device's UART FIFO; slow drip stayed under it. +- **Six rescue scripts** under `scripts/` — thin bash wrappers around the endpoints, default `SFM_BASE_URL=http://localhost:8200` (direct, not via Terra-View proxy whose 60s timeout would cut off the longer endpoints): + - `rescue_device.sh` — race-loop wrapper for `/device/rescue` + - `blind_stop.sh` — race-loop wrapper for `/device/stop_monitoring_blind` + - `spam_stop.sh` — single-call burst hammer + - `slow_drip.sh` — single-call held-session drip + - `watch_unit.sh` — passive periodic reachability check (every N min, logs to file), useful for unattended overnight monitoring of a wedged unit +- **`docs/runbooks/wedged_unit_recovery.md`** — symptoms, quick-reference recovery procedure, the modem-layer mechanism (Sierra Wireless serial-port mode-flipping is the real failure mode — not the device firmware), and a table of "why simpler approaches don't work" so the next incident skips the dead ends. + +### Added — operator event DB management + +Endpoints powering Terra-View's new `/admin/events` page (v0.12.0). Designed for purging bogus events from a unit that's been forwarding them in bulk (e.g. a stuck-triggered seismograph dumping hundreds of junk events before it's recovered). + +- **`DELETE /db/events/{event_id}`** — hard-delete one event row. Also unlinks the associated blastware binary (`.AB0*`), `.a5.pkl`, `.sfm.json` sidecar, and `.h5` clean-waveform files via the WaveformStore. Returns the per-file removal status. 404 if the event doesn't exist. +- **`POST /db/events/delete_bulk`** — filter-based or id-list-based bulk delete with safety rails: + - Filters (`serial`, `from_dt`, `to_dt`, `false_trigger`) combine with AND; same semantics as `GET /db/events`. `ids` is an additional inclusion list. Refuses to run with no filters (would wipe the whole table — raises 422). + - `confirm` must be `true` to actually delete. Otherwise returns a dry-run summary (`status: "dry_run"`, `matched: N`, `sample_serials: [...]`). + - `max_rows` (default 10,000) caps how many rows can be deleted by-filter in one call. If exceeded, returns `status: "too_many"` with a hint to narrow or raise the cap. Bypassed when only `ids` is supplied. +- **`_cleanup_event_files(row)`** helper in `sfm/server.py` — best-effort `unlink()` of all four sidecar paths derived from the row's `blastware_filename`. Logged at WARN if a path exists but unlink fails; the DB row deletion still proceeds. +- **`SeismoDb.delete_event(id)` and `SeismoDb.delete_events_bulk(...)`** in `sfm/database.py` — both return the deleted row dict(s) so callers can do file cleanup. `delete_events_bulk` raises `ValueError` if no filters are supplied. + +### Changed + +- **Default protocol recv timeout dropped from 30s → 10s** in `_build_client()`. The unit usually responds in well under a second over cellular; 10s leaves comfortable headroom for retransmits while failing reasonably fast when a unit is wedged. The two endpoints that perform full 5A waveform downloads still pass `timeout=120.0` explicitly so multi-minute event transfers are unaffected. +- **`_build_client()` now accepts an optional `connect_timeout`** (TCP-only) so rescue / race-loop endpoints can fail fast on busy modems without affecting the protocol-level recv timeout. + +### Fixed + +- **`GET /device/monitor/status` returned HTTP 500 + uncaught traceback when the device was unresponsive**. The retry-on-`Exception` inner block let the second `client.poll()`'s `ProtocolError` propagate out of the handler. Now wrapped in proper try/except — returns 502 with `{"detail": "Protocol error: No S3 frame received within 10.0s ..."}` on timeout, 502 on connection errors, 500 only for genuinely unexpected exceptions. + +### Migration + +No schema changes. No data migration required. + +If you've been running a previous version against a wedged unit and accumulated bogus events, the new `/admin/events` page in Terra-View v0.12.0 (or direct `POST /db/events/delete_bulk` with `confirm: true`) is the cleanup tool. Watcher state on the upstream DL2 PC does NOT need separate cleaning — the watcher's `sfm_forwarded.json` keys on file sha256 and won't re-forward the same files. + +### Pairing + +This release pairs with **Terra-View v0.12.0**, which adds the `/admin/events` UI that consumes the new bulk-delete endpoints, the bulk false-trigger flagging on `/unit/{id}`, and the field-deployment workflow that uses the same `series3-watcher` → SFM ingest path as before. + +--- + ## v0.16.1 — 2026-05-14 ### Fixed diff --git a/CLAUDE.md b/CLAUDE.md index 56a9e69..1f8d71a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ Ground-up Python replacement for **Blastware**, Instantel's Windows-only software for managing MiniMate Plus seismographs. Connects over direct RS-232 or cellular modem -(Sierra Wireless RV50 / RV55). Current version: **v0.14.3**. +(Sierra Wireless RV50 / RV55). Current version: **v0.17.0**. When new information about the protocol is discovered, please update the instantel_protocol_reference.md with the findings in addition to this document diff --git a/README.md b/README.md index e94c1cd..79534c8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# seismo-relay `v0.16.1` +# seismo-relay `v0.17.0` A ground-up replacement for **Blastware** — Instantel's aging Windows-only software for managing MiniMate Plus seismographs. diff --git a/pyproject.toml b/pyproject.toml index c8abd3c..922e5f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "seismo-relay" -version = "0.16.1" +version = "0.17.0" description = "Python client and REST server for MiniMate Plus seismographs" requires-python = ">=3.10" dependencies = [ diff --git a/sfm/server.py b/sfm/server.py index 95937c6..603e2d2 100644 --- a/sfm/server.py +++ b/sfm/server.py @@ -89,7 +89,7 @@ app = FastAPI( "Implements the minimateplus RS-232 protocol library.\n" "Proxied by terra-view at /api/sfm/*." ), - version="0.16.1", + version="0.17.0", ) # Allow requests from the waveform viewer opened as a local file (file://) -- 2.52.0