diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..39f70c8 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,28 @@ +.git +.gitignore + +.venv +venv +env +__pycache__ +*.pyc +*.pyo +*.pyd +.pytest_cache +.mypy_cache +.ruff_cache + +*.db +*.db-wal +*.db-shm +*.sqlite +*.sqlite3 + +sfm/data +bridges/captures +example-events +captures +logs + +.DS_Store +Thumbs.db \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a9f718..a1c60e2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,76 @@ All notable changes to seismo-relay are documented here. --- +## v0.17.0 — 2026-05-17 + +The "field rescue + DB management" release. Hardened against units that are stuck in a runaway call-home loop, and added an operator-facing path for purging bogus events that those same units dump into the DB before recovery. All work in this release was driven by the BE9558H incident (full incident log + recovery procedure at `docs/runbooks/wedged_unit_recovery.md`). + +### Added — wedged-unit recovery toolkit + +A toolkit for breaking the call-home loop on a misbehaving unit whose firmware is too busy to keep up with normal request/response handshakes. Tested in production against BE9558H (16 May 2026) — a unit with a stuck-triggered Long-axis geophone that had been call-homing the office BW ACH server every 30 seconds for hours. Endpoints layered from "single attempt" to "siege mode" to suit different contention levels: + +- **`GET /device/events/storage_range`** — SUB 0x06 probe. POLL + one read; ~2s. Returns first/last event keys and an `is_empty` flag. Use to triage whether a unit has stored events without invoking the slow `count_events()` 1E/1F chain (which choked on BE9558H's corrupted event chain). +- **`GET /device/events/index`** — SUB 0x08 probe. POLL + one read; ~2s. Returns the lifetime event counter (does NOT decrement on erase — use `storage_range` for "right now" state). +- **`POST /device/events/erase`** — full erase sequence `0xA3 → 0x1C → 0x06 → 0xA2` (confirmed 2026-04-11, see the protocol reference). Resets event keys to `0x01110000`. Caller's responsibility to disable ACH first if the underlying trigger condition will re-fill the buffer. +- **`POST /device/rescue`** — one TCP session, short connect+recv timeouts: POLL → disable ACH (compliance config write) → erase events → close. Designed for race-loop usage when the device is busy in another session. 503 on connect-refused, 502 on protocol failure, 200 on full sequence success. +- **`POST /device/stop_monitoring_blind`** — fire-and-forget Stop Monitoring (SUB 0x97), TCP-only. Dumps `SESSION_RESET + POLL_PROBE + SESSION_RESET + POLL_DATA + 0x97 × repeat` and closes without reading any S3 response. The full POLL preamble is required — write commands without it are silently ignored by the device's protocol parser (false-positive surface area that bit the first version of this endpoint). Use when the device's firmware can't keep up with full request/response but might process inbound bytes at its own pace. +- **`POST /device/stop_monitoring_spam`** — server-side hammer loop, duration-bounded. Open TCP → write the same blind payload → close → repeat as fast as possible until `duration_s` elapses. Configurable `connect_timeout` (default 500ms) and `repeat` (frames per session). Reports `sent_ok`, `connect_failed`, `write_failed`, `rate_attempts_per_s`. Clamped to 5min duration. +- **`POST /device/stop_monitoring_slow_drip`** — opposite of spam. Open ONE TCP session, drip the wake handshake + stop frames at `interval_s` (default 3s) for `duration_s` (default 120s, max 10min). Each drip is ~23 bytes — well under any UART FIFO size. Opportunistically drains any inbound bytes the device sends back; `bytes_received > 0` in the response strongly suggests the device has started talking and the session is healthy. **This is the endpoint that saved BE9558H.** Spam mode had been overrunning the device's UART FIFO; slow drip stayed under it. +- **Six rescue scripts** under `scripts/` — thin bash wrappers around the endpoints, default `SFM_BASE_URL=http://localhost:8200` (direct, not via Terra-View proxy whose 60s timeout would cut off the longer endpoints): + - `rescue_device.sh` — race-loop wrapper for `/device/rescue` + - `blind_stop.sh` — race-loop wrapper for `/device/stop_monitoring_blind` + - `spam_stop.sh` — single-call burst hammer + - `slow_drip.sh` — single-call held-session drip + - `watch_unit.sh` — passive periodic reachability check (every N min, logs to file), useful for unattended overnight monitoring of a wedged unit +- **`docs/runbooks/wedged_unit_recovery.md`** — symptoms, quick-reference recovery procedure, the modem-layer mechanism (Sierra Wireless serial-port mode-flipping is the real failure mode — not the device firmware), and a table of "why simpler approaches don't work" so the next incident skips the dead ends. + +### Added — operator event DB management + +Endpoints powering Terra-View's new `/admin/events` page (v0.12.0). Designed for purging bogus events from a unit that's been forwarding them in bulk (e.g. a stuck-triggered seismograph dumping hundreds of junk events before it's recovered). + +- **`DELETE /db/events/{event_id}`** — hard-delete one event row. Also unlinks the associated blastware binary (`.AB0*`), `.a5.pkl`, `.sfm.json` sidecar, and `.h5` clean-waveform files via the WaveformStore. Returns the per-file removal status. 404 if the event doesn't exist. +- **`POST /db/events/delete_bulk`** — filter-based or id-list-based bulk delete with safety rails: + - Filters (`serial`, `from_dt`, `to_dt`, `false_trigger`) combine with AND; same semantics as `GET /db/events`. `ids` is an additional inclusion list. Refuses to run with no filters (would wipe the whole table — raises 422). + - `confirm` must be `true` to actually delete. Otherwise returns a dry-run summary (`status: "dry_run"`, `matched: N`, `sample_serials: [...]`). + - `max_rows` (default 10,000) caps how many rows can be deleted by-filter in one call. If exceeded, returns `status: "too_many"` with a hint to narrow or raise the cap. Bypassed when only `ids` is supplied. +- **`_cleanup_event_files(row)`** helper in `sfm/server.py` — best-effort `unlink()` of all four sidecar paths derived from the row's `blastware_filename`. Logged at WARN if a path exists but unlink fails; the DB row deletion still proceeds. +- **`SeismoDb.delete_event(id)` and `SeismoDb.delete_events_bulk(...)`** in `sfm/database.py` — both return the deleted row dict(s) so callers can do file cleanup. `delete_events_bulk` raises `ValueError` if no filters are supplied. + +### Changed + +- **Default protocol recv timeout dropped from 30s → 10s** in `_build_client()`. The unit usually responds in well under a second over cellular; 10s leaves comfortable headroom for retransmits while failing reasonably fast when a unit is wedged. The two endpoints that perform full 5A waveform downloads still pass `timeout=120.0` explicitly so multi-minute event transfers are unaffected. +- **`_build_client()` now accepts an optional `connect_timeout`** (TCP-only) so rescue / race-loop endpoints can fail fast on busy modems without affecting the protocol-level recv timeout. + +### Fixed + +- **`GET /device/monitor/status` returned HTTP 500 + uncaught traceback when the device was unresponsive**. The retry-on-`Exception` inner block let the second `client.poll()`'s `ProtocolError` propagate out of the handler. Now wrapped in proper try/except — returns 502 with `{"detail": "Protocol error: No S3 frame received within 10.0s ..."}` on timeout, 502 on connection errors, 500 only for genuinely unexpected exceptions. + +### Migration + +No schema changes. No data migration required. + +If you've been running a previous version against a wedged unit and accumulated bogus events, the new `/admin/events` page in Terra-View v0.12.0 (or direct `POST /db/events/delete_bulk` with `confirm: true`) is the cleanup tool. Watcher state on the upstream DL2 PC does NOT need separate cleaning — the watcher's `sfm_forwarded.json` keys on file sha256 and won't re-forward the same files. + +### Pairing + +This release pairs with **Terra-View v0.12.0**, which adds the `/admin/events` UI that consumes the new bulk-delete endpoints, the bulk false-trigger flagging on `/unit/{id}`, and the field-deployment workflow that uses the same `series3-watcher` → SFM ingest path as before. + +--- + +## v0.16.1 — 2026-05-14 + +### Fixed + +- **`record_type` always "Waveform" for forwarded events.** `read_blastware_file()` hardcoded `ev.record_type = "Waveform"` regardless of the file's actual type. The watcher-forward pipeline (the main BW ACH ingest path) compounds this by parsing files from a tmp path with a `.bw` suffix, so even a filename-based fallback inside the parser still wouldn't see the original extension. Now: + + 1. New `derive_record_type_from_filename(filename)` helper in `minimateplus/event_file_io.py` derives the type from the LAST character of the filename's extension (V10.72+ AB0T scheme: `H`=Histogram, `W`=Waveform, `M`=Manual, `E`=Event, `C`=Combo). Falls back to `"Waveform"` for old S338 firmware (3-char extensions ending in `0`) and any unrecognized suffix. + 2. `read_blastware_file()` now calls the helper with its `path.name` so direct callers (the `--dry-run` path in `scripts/import_bw.py`, tests, ad-hoc scripts) get the right value automatically. + 3. `WaveformStore.save_imported_bw()` overrides `ev.record_type` with the **original** filename's derived type after parsing (the tmp file inside the parser doesn't carry the original extension). This is the path the live watcher-forwarder hits, so the DB column now reflects the actual event type going forward. + + Events ingested before this fix are stuck with `record_type="Waveform"` in the DB; a one-off backfill (`UPDATE events SET record_type = ... WHERE blastware_filename LIKE '%H'`) would fix them retroactively if desired. Terra-view's event modal also derives client-side from the filename, so the UI already shows the correct type for old events even without the backfill. + +--- + ## v0.16.0 — 2026-05-11 The "BW ACH ingestion" release. When paired with **series3-watcher v1.5.0**, every Blastware ACH event (binary + `_ASCII.TXT` report) lands in SeismoDb with device-authoritative peaks, project metadata, sensor self-check, and ZC/Time-of-Peak data — without depending on the still-undecoded waveform body codec. This is the end-to-end product win discussed in v0.15.0's "out of scope" notes: sortable / filterable monthly-summary review of historical events, populated from the BW ASCII export rather than re-decoded samples. diff --git a/CLAUDE.md b/CLAUDE.md index 56a9e69..1f8d71a 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -2,7 +2,7 @@ Ground-up Python replacement for **Blastware**, Instantel's Windows-only software for managing MiniMate Plus seismographs. Connects over direct RS-232 or cellular modem -(Sierra Wireless RV50 / RV55). Current version: **v0.14.3**. +(Sierra Wireless RV50 / RV55). Current version: **v0.17.0**. When new information about the protocol is discovered, please update the instantel_protocol_reference.md with the findings in addition to this document diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..8fb05f7 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,18 @@ +FROM python:3.11-slim + +WORKDIR /app + +RUN apt-get update && \ + apt-get install -y --no-install-recommends curl && \ + rm -rf /var/lib/apt/lists/* + +COPY pyproject.toml requirements.txt ./ +COPY minimateplus ./minimateplus +COPY sfm ./sfm +COPY bridges ./bridges + +RUN pip install --no-cache-dir -e . + +EXPOSE 8200 + +CMD ["python", "-m", "uvicorn", "sfm.server:app", "--host", "0.0.0.0", "--port", "8200"] \ No newline at end of file diff --git a/README.md b/README.md index fc92a77..79534c8 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# seismo-relay `v0.16.0` +# seismo-relay `v0.17.0` A ground-up replacement for **Blastware** — Instantel's aging Windows-only software for managing MiniMate Plus seismographs. diff --git a/docs/runbooks/wedged_unit_recovery.md b/docs/runbooks/wedged_unit_recovery.md new file mode 100644 index 0000000..8d27dd0 --- /dev/null +++ b/docs/runbooks/wedged_unit_recovery.md @@ -0,0 +1,255 @@ +# Runbook — Recovering a wedged unit stuck in a call-home loop + +**Original incident:** BE9558H at `166.246.130.1:9034`, recovered 2026-05-17. + +A field unit with a stuck-triggered geophone (or any hardware fault causing +constant event triggering) will record events back-to-back, and if Auto Call +Home is set to "After Event Recorded" the device will dial the office BW +ACH server in a tight loop. Combined with a Sierra Wireless modem in +bidirectional serial-TCP mode, this makes the unit effectively unreachable +from SFM — every TCP connection we open gets killed when the modem flips +from server-mode to client-mode to honor the device's next AT dial command. + +This runbook describes how to break the loop and recover control. + +--- + +## Symptoms + +- Terra-View / SFM `/device/info` either hangs or fails on `count_events()`. +- `/device/monitor/status` and `/device/rescue` return 502 (protocol timeout + waiting for POLL response) or 503 (TCP connect refused). +- ACEmanager serial log shows repeating + `Connect to IP: Port: ` → `Shutdown TCP socket` cycles + every 30-60 seconds. +- Spam-mode endpoints (`/device/stop_monitoring_spam`) report many + `sent_ok` but the device's monitoring state never changes. +- `slow_drip` reports `[Errno 32] Broken pipe` after sending the preamble + but before completing the drip loop. + +If you see *all* of these, the unit is in this exact failure mode. + +--- + +## Quick reference — how to recover + +You need **ACEmanager access** to the unit's modem. + +### Step 1: stop the modem's mode-flipping + +In ACEmanager → **Serial → Port Configuration**: + +| Field | Set to | +|---|---| +| **Destination Address** | clear (blank) | +| **Destination Port** | `0` | + +Click **Apply**. This removes the modem's auto-dial-out target. The device's +AT dial commands now error back at the modem instead of triggering a +mode-flip, so the modem stays in TCP-server mode permanently and our inbound +TCP sessions stay alive. + +*(Optional belt-and-suspenders: also add the BW server's port to +**Security → Port Filtering - Outbound** as a blocked port, with +Outbound Port Filtering Mode = Blocked Ports.)* + +### Step 2: stop monitoring on the device (slow drip) + +From the SFM host: + +```bash +/home/serversdown/seismo-relay/scripts/slow_drip.sh +``` + +Defaults are 120s duration with a drip every 3s. Watch the response: + +- `duration_s ≈ 120` and `drips_sent ≈ 40` → session held the full duration ✓ +- `bytes_received > 0` → device is responding ✓ (this is the success signal) + +If `duration_s` is small or `send_error: "Broken pipe"`, Step 1 didn't take +hold — re-check ACEmanager, may need to reboot the modem after Apply. + +### Step 3: confirm monitoring stopped + +```bash +curl 'http://localhost:8200/device/monitor/status?host=&tcp_port=&force=true' +# expect: {"is_monitoring": false, ...} +``` + +### Step 4: disable ACH at the device level + erase corrupted events + +Either fire the rescue endpoint: + +```bash +/home/serversdown/seismo-relay/scripts/rescue_device.sh +``` + +Or do the two steps manually: + +```bash +# Disable ACH in the device's compliance config +curl -X POST 'http://localhost:8200/device/call_home?host=&tcp_port=' \ + -H 'Content-Type: application/json' \ + -d '{"auto_call_home_enabled": false}' + +# Erase corrupted event chain +curl -X POST 'http://localhost:8200/device/events/erase?host=&tcp_port=' +``` + +You can also do this via the SFM standalone UI → **Call Home** tab → set +`Enable Auto Call Home` to `Disabled` → **Write to Device**. + +### Step 5: restore modem config (housekeeping) + +Once the device-side ACH is disabled, restore the modem's Destination +Address and Port to the original values (e.g. `50.197.32.92` / `12345`) in +ACEmanager. The modem will resume normal bidirectional behavior, but the +unit won't issue any dial commands until ACH is explicitly re-enabled on +the device. + +### Step 6: do NOT re-enable ACH on this unit until the underlying hardware +fault is repaired. If you do, the call-home loop starts again immediately +and you'll be running this runbook a second time. + +--- + +## Why this works — the failure mode explained + +The Sierra Wireless RV50/RV55 serial port operates in one of two TCP modes +at any moment: + +- **Server mode** — listens on `Device Port` (e.g. 9034), bridges inbound + TCP to the device's serial port. This is what we need to interact with + the device. +- **Client mode** — when the device sends an AT dial command on its serial + TX line, the modem opens an outbound TCP to `Destination Address:Port` + and bridges that to serial. + +A serial port in this configuration is **bidirectional**: the modem flips +between server and client modes on demand. When the device's firmware is +healthy and only dials occasionally, this works fine. + +When the unit is constantly triggering events and ACH is set to "After +Event Recorded", the device sends an AT dial command every few seconds. +Each one causes the modem to: + +1. Drop any active inbound TCP session +2. Flip to client mode +3. Attempt outbound TCP to `Destination Address:Port` +4. Hang for up to a minute waiting for it to succeed/fail +5. Drop back to server mode + +**During the entire hang, no inbound TCP can establish.** Even between +hangs, the modem closes any existing inbound session before flipping. So +any tool that needs more than a few seconds of held TCP (e.g. POLL + +config read + write) gets repeatedly kicked off. + +Clearing `Destination Address` removes step 3-4 from the cycle: the modem +has nowhere to dial, so it doesn't flip modes when it receives an AT dial +command. The serial port effectively becomes server-only, and inbound TCP +sessions can stay open as long as needed. + +**This is a modem-layer issue, not a device firmware issue.** The device +is alive and responsive the whole time — confirmed in the BE9558H +recovery by 990 bytes of S3 responses received over a 120s slow-drip +session once the modem was no longer mode-flipping. + +--- + +## Why simpler approaches don't work + +| Approach | Why it fails | +|---|---| +| Standard `/device/info` | Triggers `count_events()` 1E/1F walk, takes 90s+ and hits corrupted event chain in this scenario | +| `/device/rescue` race loop | Gets 502 (protocol timeout) because the modem closes the TCP before the POLL handshake can complete | +| `/device/stop_monitoring_blind` (single frame) | Even if the bytes leave the wire, the device's protocol parser ignores write commands without a preceding POLL handshake (early-version bug, now fixed by including POLL preamble in blind sends) | +| `/device/stop_monitoring_spam` (sub-second cadence) | Each session is killed by the modem's mode-flip before the device can drain its UART RX buffer; high-rate spam also risks UART FIFO overrun on the device side | +| Outbound port firewall block alone | Stops the outbound TCP from succeeding, but doesn't stop the modem from *trying* and mode-flipping. Reduces but doesn't eliminate the contention. | +| Modem reboot | Temporary — as soon as the device starts triggering again, the loop resumes within seconds | + +The combination of `slow_drip` + cleared `Destination Address` works because: + +1. The modem stops mode-flipping → TCP session stays open for the full + drip duration +2. Slow drip rate → device's UART RX FIFO never overflows even if + firmware is busy with event recording +3. The drip is `SESSION_RESET + STOP_MONITORING` every 3s → many + independent chances for the parser to land one valid frame +4. Once one Stop Monitoring is parsed, event recording halts → firmware + has CPU to spare → subsequent operations are trivially easy + +--- + +## Tooling reference + +All endpoints live in `seismo-relay/sfm/server.py`. All scripts live in +`seismo-relay/scripts/` and default to SFM direct (`http://localhost:8200`), +overridable via `SFM_BASE_URL`. + +### Endpoints added during BE9558H recovery + +| Endpoint | Purpose | +|---|---| +| `GET /device/events/storage_range` | SUB 0x06 — first/last event keys, `is_empty` flag. ~2s, no event walk. | +| `GET /device/events/index` | SUB 0x08 — lifetime event counter (does NOT decrement on erase). ~2s. | +| `POST /device/events/erase` | Full erase sequence 0xA3 → 0x1C → 0x06 → 0xA2. | +| `POST /device/rescue` | Disable ACH + erase in one TCP session. Short timeouts for race-loop usage. | +| `POST /device/stop_monitoring_blind` | Fire-and-forget Stop with full POLL preamble (single attempt). | +| `POST /device/stop_monitoring_spam` | Server-side tight retry loop, sub-second cadence, duration-bounded. | +| `POST /device/stop_monitoring_slow_drip` | One held TCP session, slow trickle of stop frames. **The endpoint that saved BE9558H.** | + +Also changed: default protocol recv timeout dropped from 30s → 10s in +`_build_client`. Added `connect_timeout` knob to same. Cleaned up +unhandled-exception path in `/device/monitor/status` so it returns 502 +instead of 500 on protocol timeouts. + +### Scripts + +| Script | Purpose | +|---|---| +| `scripts/rescue_device.sh` | Race-loop wrapper around `/device/rescue` | +| `scripts/blind_stop.sh` | Race-loop wrapper around `/device/stop_monitoring_blind` | +| `scripts/spam_stop.sh` | Single-call burst hammer (`/device/stop_monitoring_spam`) | +| `scripts/slow_drip.sh` | Single-call held-session drip (`/device/stop_monitoring_slow_drip`) | +| `scripts/watch_unit.sh` | Passive periodic reachability check, logs to file | + +--- + +## Incident log — BE9558H, 2026-05-16/17 + +What was wrong: Long-axis geophone developed an offset, constantly above +trigger threshold → constant event recording → after-event ACH set → +modem dialing office BW server (`50.197.32.92:12345`) every 30-60s. +Local event chain corrupted (`next_boundary 0x100EE exceeds uint16`). + +Diagnostic path: + +1. `/device/info` slow, choked on event walk +2. Built lightweight probe endpoints (`storage_range`, `index`) — useful + but didn't reach the wedged unit +3. Built `/device/rescue` with short timeouts — got 502 (POLL no response) +4. Built `/device/stop_monitoring_blind` — first version was a false + positive (no POLL preamble); fixed by including + `SESSION_RESET+POLL_PROBE+SESSION_RESET+POLL_DATA` in the dump +5. Verified blind stop works on bench unit +6. Built `/device/stop_monitoring_spam` — 420 successful sends over + 5 min, zero behavior change on field unit +7. Inspected ACEmanager logs → saw outbound dial-out attempts every ~30s, + confirmed device was not fully locked up +8. Added outbound port-12345 firewall block → outbound attempts now fail + instantly but contention persisted +9. Built `/device/stop_monitoring_slow_drip` — session died at 3s with + broken pipe (modem closing on us) +10. Looked at full ACEmanager Port Configuration → **found + `Destination Address: 50.197.32.92` configured**, realized every AT + dial command was triggering a modem mode-flip that killed our inbound +11. Cleared Destination Address + Port → slow_drip held 120s, device + responded with 990 bytes, 39 stop commands acked +12. Disabled ACH at device level via `/device/call_home`, erased events + +Final state: device IDLE, memory 958.1 / 960 KB free, ACH disabled at +device level, modem destination cleared (to be restored after physical +service). + +Total time from "i was wondering if its possible to" first attempt to +recovery: ~7 hours of intermittent debugging across one evening. diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index a415599..12ad2db 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -47,7 +47,7 @@ SIDECAR_KIND = "sfm.event" # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. -TOOL_VERSION = "0.16.0" +TOOL_VERSION = "0.16.1" try: # Best-effort: prefer the installed metadata when it's NEWER than the @@ -646,6 +646,50 @@ def _peaks_from_samples(samples: dict[str, list[int]]) -> PeakValues: ) +_RECORD_TYPE_BY_EXT_SUFFIX = { + 'H': 'Histogram', + 'W': 'Waveform', + 'M': 'Manual', + 'E': 'Event', + 'C': 'Combo', +} + + +def derive_record_type_from_filename(filename, default: str = "Waveform") -> str: + """Derive a BW Event's record_type from its filename's extension suffix. + + V10.72+ MiniMate Plus firmware encodes the event type as the LAST + character of the extension (the `T` in BW's `AB0T` scheme): + + ``M529LKIQ.G10H`` → H → ``"Histogram"`` + ``T350L385.VY0W`` → W → ``"Waveform"`` + ``...M`` → M → ``"Manual"`` + ``...E`` → E → ``"Event"`` + ``...C`` → C → ``"Combo"`` + + Old S338 firmware uses 3-char extensions ending in ``0`` whose + encoding is not yet known — those fall through to ``default``. + Micromate Series 4 uses a different scheme entirely (observed: + ``IDFH``, ``IDFW``) but the LAST-char convention (H / W) still holds + for the type code, so it works for both families. + + Returns ``default`` if filename is empty, has no extension, or the + suffix char isn't a recognized type code. + """ + if not filename: + return default + try: + name = Path(filename).name + except (TypeError, ValueError): + return default + if '.' not in name: + return default + ext = name.rsplit('.', 1)[1] + if not ext: + return default + return _RECORD_TYPE_BY_EXT_SUFFIX.get(ext[-1].upper(), default) + + def read_blastware_file(path: Union[str, Path]) -> Event: """ Parse a Blastware waveform file into an Event. @@ -727,7 +771,12 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ev = Event(index=-1) if strt_fields.get("waveform_key"): ev._waveform_key = bytes.fromhex(strt_fields["waveform_key"]) - ev.record_type = "Waveform" + # Derive record_type from the filename's extension suffix (H/W/M/E/C). + # When called from save_imported_bw the path here is a tmp file with a + # ".bw" suffix, so the derivation falls back to "Waveform" and the + # caller overrides ev.record_type using the original filename — see + # waveform_store.save_imported_bw. + ev.record_type = derive_record_type_from_filename(path.name) ev.rectime_seconds = strt_fields.get("rectime_seconds") ev.total_samples = strt_fields.get("total_samples") ev.pretrig_samples = strt_fields.get("pretrig_samples") diff --git a/pyproject.toml b/pyproject.toml index 70530de..922e5f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "seismo-relay" -version = "0.16.0" +version = "0.17.0" description = "Python client and REST server for MiniMate Plus seismographs" requires-python = ">=3.10" dependencies = [ diff --git a/scripts/backfill_record_type.py b/scripts/backfill_record_type.py new file mode 100644 index 0000000..b2d5202 --- /dev/null +++ b/scripts/backfill_record_type.py @@ -0,0 +1,150 @@ +""" +scripts/backfill_record_type.py — fix `record_type` on legacy event +rows whose value was hardcoded to "Waveform" regardless of actual type. + +Why this is needed +────────────────── +Pre-v0.16.1 the BW file importer (`event_file_io.read_blastware_file`) +hardcoded `ev.record_type = "Waveform"` for every imported event. Fixed +in commit aac1c8e — new ingests now derive the type from the Blastware +filename's extension last character (H=Histogram, W=Waveform, M=Manual, +E=Event, C=Combo) per the V10.72+ MiniMate Plus AB0T filename scheme. + +Effect on a server that imported events under the old code: every +events row has `record_type = "Waveform"`, even for histograms, +manuals, etc. Visible in terra-view's event-detail modal under the +"Record Type" field. Terra-view also has a client-side workaround +that derives the type from the filename for display purposes, so +operators see the correct type in the UI even before this backfill. +This script makes the DB column match what the UI is already showing, +which matters for reporting and any downstream consumer that reads +events.record_type directly. + +This script +─────────── +Walks the `events` table and updates each row's `record_type` to the +derived value from its `blastware_filename`. Old S338 firmware files +(3-char extensions ending in `0`) and any unrecognized suffix get +left at the existing value (defaults to "Waveform"). + +Idempotent: re-running after a successful backfill finds zero rows +needing updates and exits cleanly (it always re-derives but only +writes when the value would change). + +Usage +───── + # Dry-run (default): print what would change, don't touch the DB + python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db + + # Apply the backfill + python -m scripts.backfill_record_type --db bridges/captures/seismo_relay.db --apply +""" + +from __future__ import annotations + +import argparse +import sqlite3 +import sys +from collections import Counter +from pathlib import Path + + +# Must stay in sync with minimateplus.event_file_io._RECORD_TYPE_BY_EXT_SUFFIX. +_TYPE_FROM_SUFFIX = { + "H": "Histogram", + "W": "Waveform", + "M": "Manual", + "E": "Event", + "C": "Combo", +} + + +def derive_record_type(filename: str | None, default: str = "Waveform") -> str: + """Mirror of minimateplus.event_file_io.derive_record_type_from_filename. + + Vendored here so this script runs without needing the seismo-relay + package on the Python path (useful on prod where you might be + running it via `docker exec` against a container's DB volume). + """ + if not filename: + return default + name = Path(filename).name + if "." not in name: + return default + ext = name.rsplit(".", 1)[1] + if not ext: + return default + return _TYPE_FROM_SUFFIX.get(ext[-1].upper(), default) + + +def main() -> int: + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--db", required=True, help="Path to seismo_relay.db") + ap.add_argument("--apply", action="store_true", + help="Actually write changes (default is dry-run).") + ap.add_argument("--default", default="Waveform", + help="Fallback record_type when filename doesn't encode one. " + "Default: Waveform (matches the pre-fix bug's behavior).") + args = ap.parse_args() + + db_path = Path(args.db) + if not db_path.exists(): + print(f"ERROR: database not found at {db_path}", file=sys.stderr) + return 1 + + conn = sqlite3.connect(str(db_path)) + conn.row_factory = sqlite3.Row + cur = conn.cursor() + + cur.execute(""" + SELECT id, blastware_filename, record_type + FROM events + WHERE blastware_filename IS NOT NULL + AND blastware_filename != '' + """) + rows = cur.fetchall() + total = len(rows) + print(f"Scanning {total:,} event rows…") + print() + + # Tally proposed changes. + transitions: Counter[tuple[str, str]] = Counter() + update_ids: list[tuple[str, str]] = [] + unrecognized = 0 + + for row in rows: + derived = derive_record_type(row["blastware_filename"], default=args.default) + current = row["record_type"] or "" + if derived == current: + continue + transitions[(current, derived)] += 1 + update_ids.append((row["id"], derived)) + + if not update_ids: + print("Nothing to update — all rows already match.") + conn.close() + return 0 + + print(f"{len(update_ids):,} row(s) need updating:") + for (old, new), count in sorted(transitions.items(), key=lambda x: -x[1]): + print(f" {count:>6,} {old!r:14s} → {new!r}") + print() + + if not args.apply: + print("(dry-run — re-run with --apply to write changes)") + conn.close() + return 0 + + print("Applying changes…") + cur.executemany( + "UPDATE events SET record_type = ? WHERE id = ?", + [(new, eid) for eid, new in update_ids], + ) + conn.commit() + print(f"Done. Updated {cur.rowcount:,} row(s).") + conn.close() + return 0 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/scripts/blind_stop.sh b/scripts/blind_stop.sh new file mode 100755 index 0000000..5ab170b --- /dev/null +++ b/scripts/blind_stop.sh @@ -0,0 +1,100 @@ +#!/usr/bin/env bash +# Fire-and-forget Stop Monitoring loop — for wedged or constantly-triggering units. +# +# Hammers POST /device/stop_monitoring_blind in a tight loop. The endpoint +# opens TCP, dumps SESSION_RESET + a few copies of the SUB 0x97 frame, and +# closes — without ever reading an S3 response. Each TCP-won attempt is +# ~50ms of wire activity instead of the multi-frame handshake the regular +# rescue endpoint does, so windows that are too small for the full rescue +# can still land a stop-monitoring command. +# +# Usage: +# ./blind_stop.sh [tcp_port] +# +# Env: +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy. +# MAX_ATTEMPTS Default: 600 +# SLEEP_S Default: 0 (no backoff — hammer it) +# MAX_TIME_S Default: 15 +# CONNECT_TIMEOUT Default: 5 +# REPEAT Frames per TCP session (default 3 — increases hit rate +# if the device is busy reading its own buffer). +# STOP_ON_OK Default: 1. Set to 0 to keep hammering indefinitely +# even after successful sends (every 503 means the device +# is in *another* session, every 200 means our bytes got +# through — but the device may not have processed them). + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +max_attempts="${MAX_ATTEMPTS:-600}" +sleep_s="${SLEEP_S:-0}" +max_time_s="${MAX_TIME_S:-15}" +connect_timeout="${CONNECT_TIMEOUT:-5}" +repeat="${REPEAT:-3}" +stop_on_ok="${STOP_ON_OK:-1}" + +url="${base}/device/stop_monitoring_blind?host=${host}&tcp_port=${tcp_port}&connect_timeout=${connect_timeout}&repeat=${repeat}" + +echo "blind_stop: target ${host}:${tcp_port} connect_timeout=${connect_timeout}s repeat=${repeat}" +echo "blind_stop: POST ${url}" +echo "blind_stop: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request" +echo "blind_stop: stop_on_ok=${stop_on_ok}" +echo + +ok_count=0 +busy_count=0 +err_count=0 +started=$(date +%s) + +for ((i=1; i<=max_attempts; i++)); do + printf "[%4d] %s " "$i" "$(date +%H:%M:%S)" + http_code=$(curl -sS -o /tmp/blind_resp.$$ -w "%{http_code}" \ + --max-time "$max_time_s" \ + -X POST "$url" || echo "000") + body=$(cat /tmp/blind_resp.$$ 2>/dev/null || true) + rm -f /tmp/blind_resp.$$ + + case "$http_code" in + 200|201) + ok_count=$((ok_count + 1)) + echo "SENT $body" + if [[ "$stop_on_ok" == "1" ]]; then + elapsed=$(( $(date +%s) - started )) + echo + echo "blind_stop: success after ${i} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" + echo "blind_stop: NEXT — wait ~10s, then try the full rescue:" + echo " /home/serversdown/seismo-relay/scripts/rescue_device.sh ${host} ${tcp_port}" + exit 0 + fi + ;; + 503) + busy_count=$((busy_count + 1)) + echo "busy (503)" + ;; + 000) + err_count=$((err_count + 1)) + echo "curl error" + ;; + *) + err_count=$((err_count + 1)) + echo "HTTP $http_code $body" | head -c 400 + echo + ;; + esac + [[ "$sleep_s" != "0" ]] && sleep "$sleep_s" +done + +elapsed=$(( $(date +%s) - started )) +echo +echo "blind_stop: gave up after ${max_attempts} attempts (${elapsed}s). ok=${ok_count} busy=${busy_count} err=${err_count}" >&2 +exit 1 diff --git a/scripts/rescue_device.sh b/scripts/rescue_device.sh new file mode 100755 index 0000000..3466926 --- /dev/null +++ b/scripts/rescue_device.sh @@ -0,0 +1,99 @@ +#!/usr/bin/env bash +# Rescue an uncooperative MiniMate that's busy with another ACH session. +# +# Hammers POST /device/rescue in a tight loop with a short timeout. When the +# device is in an ACH session our SYN either gets refused or silently dropped +# (5s connect timeout inside the endpoint) and we retry immediately. When the +# device is between sessions, our TCP wins, the endpoint disables Auto Call +# Home and erases events inside the same session, then returns success. +# +# Usage: +# ./rescue_device.sh [tcp_port] [--no-erase] [--no-disable-ach] +# +# Examples: +# ./rescue_device.sh 166.246.130.1 9034 +# ./rescue_device.sh 166.246.130.1 9034 --no-erase # just silence it +# +# Environment: +# SFM_BASE_URL Defaults to http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy. Direct mode avoids the proxy's +# 60s timeout, which matters for long-running endpoints. +# MAX_ATTEMPTS Cap on retries (default 600 ≈ 30+ min). +# SLEEP_S Backoff between attempts (default 1). +# MAX_TIME_S Per-request timeout (default 60). +# CONNECT_TIMEOUT TCP connect timeout (default 5). +# RECV_TIMEOUT Per-frame S3 recv timeout (default 5). If POLL or any +# subsequent frame doesn't respond within this window, the +# rescue endpoint bails and this script retries. + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +shift 2 2>/dev/null || shift $# 2>/dev/null + +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [--no-erase] [--no-disable-ach]" >&2 + exit 2 +fi + +disable_ach="true" +erase="true" +for arg in "$@"; do + case "$arg" in + --no-erase) erase="false" ;; + --no-disable-ach) disable_ach="false" ;; + *) echo "unknown flag: $arg" >&2; exit 2 ;; + esac +done + +base="${SFM_BASE_URL:-http://localhost:8200}" +max_attempts="${MAX_ATTEMPTS:-600}" +sleep_s="${SLEEP_S:-1}" +max_time_s="${MAX_TIME_S:-60}" +connect_timeout="${CONNECT_TIMEOUT:-5}" +recv_timeout="${RECV_TIMEOUT:-5}" + +url="${base}/device/rescue?host=${host}&tcp_port=${tcp_port}&disable_ach=${disable_ach}&erase=${erase}&connect_timeout=${connect_timeout}&recv_timeout=${recv_timeout}" + +echo "rescue: target ${host}:${tcp_port} disable_ach=${disable_ach} erase=${erase}" +echo "rescue: connect_timeout=${connect_timeout}s recv_timeout=${recv_timeout}s" +echo "rescue: POST ${url}" +echo "rescue: up to ${max_attempts} attempts, ${sleep_s}s between, ${max_time_s}s per request" +echo + +started=$(date +%s) +for ((i=1; i<=max_attempts; i++)); do + printf "[%3d] %s " "$i" "$(date +%H:%M:%S)" + http_code=$(curl -sS -o /tmp/rescue_resp.$$ -w "%{http_code}" \ + --max-time "$max_time_s" \ + -X POST "$url" || echo "000") + body=$(cat /tmp/rescue_resp.$$ 2>/dev/null || true) + rm -f /tmp/rescue_resp.$$ + + case "$http_code" in + 200|201) + elapsed=$(( $(date +%s) - started )) + echo "OK (${elapsed}s total)" + echo "$body" + exit 0 + ;; + 503) + # Connection refused / timeout — device busy in another session. Retry fast. + echo "busy (503)" + ;; + 000) + echo "curl error (network)" + ;; + *) + echo "HTTP $http_code" + echo " $body" | head -c 400 + echo + ;; + esac + sleep "$sleep_s" +done + +echo "rescue: gave up after ${max_attempts} attempts" >&2 +exit 1 diff --git a/scripts/slow_drip.sh b/scripts/slow_drip.sh new file mode 100755 index 0000000..9c942d1 --- /dev/null +++ b/scripts/slow_drip.sh @@ -0,0 +1,44 @@ +#!/usr/bin/env bash +# Hold a single TCP session open and drip stop-monitoring frames at a slow +# rate, so the device's UART RX FIFO has time to drain between sends. +# +# Use when high-rate spam isn't landing — typically because the device's +# firmware is too busy to drain its serial buffer fast enough and bytes +# are being lost to UART overrun. +# +# Usage: +# ./slow_drip.sh [tcp_port] [duration_s] +# +# Env: +# DURATION Default: 120 (seconds; arg 3 overrides). Clamped 1..600. +# INTERVAL Seconds between drip sends (default 3). Lower = more +# aggressive, more risk of FIFO overrun. Higher = safer +# but fewer total drips per duration. +# CONNECT_TIMEOUT Default: 5 +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +duration="${3:-${DURATION:-120}}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [duration_s]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +interval="${INTERVAL:-3}" +connect_timeout="${CONNECT_TIMEOUT:-5}" + +url="${base}/device/stop_monitoring_slow_drip?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&interval_s=${interval}&connect_timeout=${connect_timeout}" + +echo "slow_drip: target ${host}:${tcp_port} duration=${duration}s interval=${interval}s connect_timeout=${connect_timeout}s" +echo "slow_drip: POST ${url}" +echo + +# Give curl enough slack to wait out the duration plus a buffer +max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 30 }') + +curl -sS --max-time "$max_time" -X POST "$url" +echo diff --git a/scripts/spam_stop.sh b/scripts/spam_stop.sh new file mode 100755 index 0000000..dda864d --- /dev/null +++ b/scripts/spam_stop.sh @@ -0,0 +1,48 @@ +#!/usr/bin/env bash +# Hammer a device with blind stop-monitoring sessions as fast as possible. +# Single HTTP call kicks off the burst inside SFM (no per-attempt HTTP +# overhead). Default: 10 seconds, ~500 ms per attempt = ~20 attempts/sec. +# +# Usage: +# ./spam_stop.sh [tcp_port] [duration_s] +# +# Examples: +# ./spam_stop.sh 166.246.130.1 # 10s burst +# ./spam_stop.sh 166.246.130.1 9034 30 # 30s burst +# DURATION=60 CONNECT_TIMEOUT=0.2 ./spam_stop.sh 166.246.130.1 +# +# Env: +# SFM_BASE_URL Default: http://localhost:8200 (SFM direct). +# Set to http://localhost:8001/api/sfm to route through +# Terra-View's proxy — but note the proxy has a 60s +# timeout, so long bursts need direct mode. +# DURATION Default: 10 (seconds; arg 3 overrides) +# CONNECT_TIMEOUT Default: 0.5 (seconds) +# REPEAT Default: 3 (stop frames per TCP session) + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +duration="${3:-${DURATION:-10}}" + +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port] [duration_s]" >&2 + exit 2 +fi + +base="${SFM_BASE_URL:-http://localhost:8200}" +connect_timeout="${CONNECT_TIMEOUT:-0.5}" +repeat="${REPEAT:-3}" + +url="${base}/device/stop_monitoring_spam?host=${host}&tcp_port=${tcp_port}&duration_s=${duration}&connect_timeout=${connect_timeout}&repeat=${repeat}" + +echo "spam_stop: target ${host}:${tcp_port} duration=${duration}s connect_timeout=${connect_timeout}s repeat=${repeat}" +echo "spam_stop: POST ${url}" +echo + +# Give curl enough slack to wait out the duration plus a buffer +max_time=$(awk -v d="$duration" 'BEGIN { printf "%d", d + 10 }') + +curl -sS --max-time "$max_time" -X POST "$url" +echo diff --git a/scripts/watch_unit.sh b/scripts/watch_unit.sh new file mode 100755 index 0000000..2114936 --- /dev/null +++ b/scripts/watch_unit.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +# Passive monitor for a misbehaving unit. Every INTERVAL seconds, attempts +# a single short TCP probe + storage_range read and logs the result. Designed +# to run unattended for hours/days and tell you when the unit comes back. +# +# Usage: +# ./watch_unit.sh [tcp_port] +# +# Env: +# INTERVAL Seconds between checks (default 300 = 5 min) +# LOG_FILE Append results here (default /tmp/watch_.log) +# SFM_BASE_URL Default: http://localhost:8200 + +set -u + +host="${1:-}" +tcp_port="${2:-9034}" +if [[ -z "$host" ]]; then + echo "usage: $0 [tcp_port]" >&2 + exit 2 +fi + +interval="${INTERVAL:-300}" +log_file="${LOG_FILE:-/tmp/watch_${host}.log}" +base="${SFM_BASE_URL:-http://localhost:8200}" + +url="${base}/device/events/storage_range?host=${host}&tcp_port=${tcp_port}" + +echo "watch_unit: target ${host}:${tcp_port} interval=${interval}s log=${log_file}" +echo "watch_unit: Ctrl-C to stop" + +while true; do + ts=$(date '+%Y-%m-%d %H:%M:%S') + http_code=$(curl -sS -o /tmp/watch_resp.$$ -w "%{http_code}" \ + --max-time 20 "$url" || echo "000") + body=$(cat /tmp/watch_resp.$$ 2>/dev/null || true) + rm -f /tmp/watch_resp.$$ + + case "$http_code" in + 200|201) + # Strip the raw_hex for readability + summary=$(echo "$body" | sed 's/"raw_hex":"[^"]*",*//; s/,*$//' | head -c 200) + echo "$ts REACHABLE $summary" | tee -a "$log_file" + ;; + 502|503) + err=$(echo "$body" | head -c 150) + echo "$ts ERROR_$http_code $err" | tee -a "$log_file" + ;; + 000) + echo "$ts CURL_FAIL (network/timeout)" | tee -a "$log_file" + ;; + *) + echo "$ts HTTP_$http_code $(echo "$body" | head -c 150)" | tee -a "$log_file" + ;; + esac + + sleep "$interval" +done diff --git a/sfm/database.py b/sfm/database.py index 88497d8..8228d6c 100644 --- a/sfm/database.py +++ b/sfm/database.py @@ -491,6 +491,75 @@ class SeismoDb: ) return cur.rowcount > 0 + def delete_event(self, event_id: str) -> Optional[dict]: + """ + Hard-delete one event row by id. Returns the deleted row (so the + caller can clean up any on-disk files referenced by it) or None + if no row matched. + """ + with self._connect() as conn: + row = conn.execute( + "SELECT * FROM events WHERE id = ?", (event_id,), + ).fetchone() + if row is None: + return None + conn.execute("DELETE FROM events WHERE id = ?", (event_id,)) + return dict(row) + + def delete_events_bulk( + self, + serial: Optional[str] = None, + from_dt: Optional[datetime.datetime] = None, + to_dt: Optional[datetime.datetime] = None, + false_trigger: Optional[bool] = None, + ids: Optional[list[str]] = None, + ) -> list[dict]: + """ + Hard-delete events matching the given filters. Returns the list + of deleted row dicts. Refuses to delete with no filters at all + (would wipe the whole table) — raises ValueError. + + Filter semantics match query_events: serial / from_dt / to_dt / + false_trigger combine with AND. `ids` is an additional inclusion + list (event_id IN (...)); if supplied alongside other filters, + only rows matching all conditions are deleted. + """ + clauses: list[str] = [] + params: list = [] + + if serial: + clauses.append("serial = ?") + params.append(serial) + if from_dt: + clauses.append("timestamp >= ?") + params.append(from_dt.isoformat()) + if to_dt: + clauses.append("timestamp <= ?") + params.append(to_dt.isoformat()) + if false_trigger is not None: + clauses.append("false_trigger = ?") + params.append(1 if false_trigger else 0) + if ids: + placeholders = ",".join("?" * len(ids)) + clauses.append(f"id IN ({placeholders})") + params.extend(ids) + + if not clauses: + raise ValueError( + "delete_events_bulk refuses to delete with no filters " + "(would wipe the entire events table)" + ) + + where = "WHERE " + " AND ".join(clauses) + + with self._connect() as conn: + rows = conn.execute( + f"SELECT * FROM events {where}", params, + ).fetchall() + if rows: + conn.execute(f"DELETE FROM events {where}", params) + return [dict(r) for r in rows] + def update_event_review(self, event_id: str, review: dict) -> bool: """ Sync derived index columns from a sidecar's `review` block. diff --git a/sfm/server.py b/sfm/server.py index a7b8cae..603e2d2 100644 --- a/sfm/server.py +++ b/sfm/server.py @@ -36,6 +36,7 @@ from __future__ import annotations import datetime import logging +import socket import sys import tempfile import threading @@ -63,7 +64,9 @@ from minimateplus.protocol import ProtocolError from minimateplus.models import CallHomeConfig, ComplianceConfig, DeviceInfo, Event, PeakValues, ProjectInfo, Timestamp from minimateplus.transport import TcpTransport, DEFAULT_TCP_PORT from minimateplus.blastware_file import write_blastware_file, blastware_filename -from minimateplus.client import _decode_a5_metadata_into, _decode_a5_waveform +from minimateplus.client import _decode_a5_metadata_into, _decode_a5_waveform, _decode_event_count +from minimateplus.framing import build_bw_write_frame, SESSION_RESET, POLL_PROBE, POLL_DATA +from minimateplus.protocol import SUB_STOP_MONITORING from sfm import event_hdf5 from sfm.cache import SFMCache, get_cache from sfm.database import SeismoDb @@ -86,7 +89,7 @@ app = FastAPI( "Implements the minimateplus RS-232 protocol library.\n" "Proxied by terra-view at /api/sfm/*." ), - version="0.1.0", + version="0.17.0", ) # Allow requests from the waveform viewer opened as a local file (file://) @@ -268,7 +271,8 @@ def _build_client( baud: int, host: Optional[str], tcp_port: int, - timeout: float = 30.0, + timeout: float = 10.0, + connect_timeout: Optional[float] = None, ) -> MiniMateClient: """ Return a MiniMateClient configured for either serial or TCP transport. @@ -276,12 +280,24 @@ def _build_client( TCP takes priority if *host* is supplied; otherwise *port* (serial) is used. Raises HTTPException(422) if neither is provided. + Default *timeout* is 10s — the device usually responds in well under a + second over cellular; 10s leaves comfortable headroom for retransmits + while still failing reasonably fast when a unit is wedged. + Use timeout=120.0 (or higher) for endpoints that perform a full 5A waveform download — a 70-second event at 1024 sps takes 2-3 minutes to transfer over cellular and each individual recv must complete within the timeout window. + + *connect_timeout* (TCP only) overrides the TcpTransport default (10s) for + the initial TCP SYN/handshake. Use a small value (e.g. 5s) in rescue/race + scenarios where the device is busy in another session and you want to + fail fast and retry quickly. """ if host: - transport = TcpTransport(host, port=tcp_port) + if connect_timeout is not None: + transport = TcpTransport(host, port=tcp_port, connect_timeout=connect_timeout) + else: + transport = TcpTransport(host, port=tcp_port) log.debug("TCP transport: %s:%d timeout=%.0fs", host, tcp_port, timeout) return MiniMateClient(transport=transport, timeout=timeout) elif port: @@ -1095,13 +1111,23 @@ def device_monitor_status( cached["_cached"] = True return cached - with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: - try: - client.poll() - except Exception as exc: - log.warning("monitor status poll retry: %s", exc) - client.poll() - status = client.get_monitor_status() + try: + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("monitor status poll retry: %s", exc) + client.poll() + status = client.get_monitor_status() + except HTTPException: + raise + except ProtocolError as exc: + # Includes minimateplus.protocol.TimeoutError ("device unresponsive"). + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc result: dict = {"is_monitoring": status.is_monitoring} if status.battery_v is not None: @@ -1117,6 +1143,529 @@ def device_monitor_status( return result +@app.get("/device/events/storage_range") +def device_events_storage_range( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Read the device's event storage range (SUB 0x06) — first and last + stored event keys. POLL handshake + one read; no connect(), no + config reads, no event walk. Completes in ~2 seconds. + + Useful for checking whether the device has any stored events + without invoking the slow count_events() 1E/1F chain. Both keys = + `01110000` means the device is empty. + """ + log.info("GET /device/events/storage_range host=%s tcp_port=%s", host, tcp_port) + try: + def _do(): + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("storage_range poll retry: %s", exc) + client.poll() + proto = client._require_proto() + return proto.read_event_storage_range() + rng = _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + data = bytes(rng.data) + result: dict = {"raw_len": len(data), "raw_hex": data.hex()} + if len(data) >= 8: + first_key = data[-8:-4].hex() + last_key = data[-4:].hex() + result["first_key"] = first_key + result["last_key"] = last_key + result["is_empty"] = (first_key == "01110000" and last_key == "01110000") + return result + + +@app.get("/device/events/index") +def device_events_index( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Read the device's event index (SUB 0x08) — returns the lifetime + event counter at data[10:12] (uint16 BE). POLL handshake + one + read; no connect(), no config reads, no event walk. ~2 seconds. + + Note: this is a LIFETIME counter (events ever recorded) — it does + NOT decrement when events are erased. After an erase, the device + counter resets to 0 only on the next recorded event. For "are + there stored events right now?" use /device/events/storage_range + instead. + """ + log.info("GET /device/events/index host=%s tcp_port=%s", host, tcp_port) + try: + def _do(): + with _build_client(port=port, baud=baud, host=host, tcp_port=tcp_port) as client: + try: + client.poll() + except Exception as exc: + log.warning("event_index poll retry: %s", exc) + client.poll() + proto = client._require_proto() + return proto.read_event_index() + idx_raw = _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + raw = bytes(idx_raw) + result: dict = {"raw_len": len(raw), "raw_hex": raw.hex()} + try: + result["lifetime_count"] = _decode_event_count(raw) + except Exception as exc: + result["decode_error"] = str(exc) + return result + + +@app.post("/device/events/erase") +def device_events_erase( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), +) -> dict: + """ + Erase ALL stored events from the device memory. + + Sequence: SUB 0xA3 → 0x1C → 0x06 → 0xA2 (confirmed 2026-04-11). + After this call the unit's event memory is empty and event keys reset + to 0x01110000. The device returns to its normal operating state + automatically — no restart-monitoring call is needed. + + Note: this endpoint does NOT touch the ACH server's `ach_state.json`. + If a call-home subsequently lands on the ACH server, its post-erase + detection logic (max(device_keys) vs max_downloaded_key) handles the + key-counter rollback. + """ + log.info("POST /device/events/erase port=%s host=%s tcp_port=%s", port, host, tcp_port) + + try: + def _do(): + with _build_client(port, baud, host, tcp_port) as client: + client.connect() + client.delete_all_events() + _run_with_retry(_do, is_tcp=_is_tcp(host)) + except HTTPException: + raise + except ProtocolError as exc: + raise HTTPException(status_code=502, detail=f"Protocol error: {exc}") from exc + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Connection error: {exc}") from exc + except Exception as exc: + raise HTTPException(status_code=500, detail=f"Device error: {exc}") from exc + + conn_key = SFMCache.make_conn_key(host, tcp_port, port, baud) + cleared = get_cache().clear_device(conn_key) + return { + "status": "ok", + "message": "Device event memory cleared", + "cache_cleared": cleared, + } + + +@app.post("/device/stop_monitoring_blind") +def device_stop_monitoring_blind( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + connect_timeout: float = Query(5.0, description="TCP connect timeout in seconds (default 5)"), + repeat: int = Query(3, description="How many times to send the frame within one TCP session (default 3)"), +) -> dict: + """ + Fire-and-forget Stop Monitoring (SUB 0x97). TCP-only. + + Opens a TCP session, dumps the FULL handshake the device's protocol + state machine expects — `SESSION_RESET + POLL_PROBE + SESSION_RESET + + POLL_DATA` — and then N back-to-back copies of the stop-monitoring + frame. Does NOT read any S3 response. Succeeds as long as the bytes + left the socket. + + The POLL handshake bytes are required: monitoring units ignore command + frames received without a preceding POLL exchange. Sending the POLL + bytes "blind" (without reading the responses) still works because the + device processes inbound bytes in order regardless of whether we drain + its outbound buffer. + + Idempotent: the device processes extra copies of SUB 0x97 the same as + one (already-stopped is a no-op). + + Returns the number of bytes sent. A 503 means the TCP connect failed + (device busy in another session — caller should retry). + """ + log.info( + "POST /device/stop_monitoring_blind host=%s tcp_port=%s connect_timeout=%.1fs repeat=%d", + host, tcp_port, connect_timeout, repeat, + ) + if repeat < 1: + repeat = 1 + + frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + payload = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + + (frame * repeat) + ) + t0 = time.monotonic() + + transport = TcpTransport(host, port=tcp_port, connect_timeout=connect_timeout) + try: + transport.connect() + except OSError as exc: + raise HTTPException(status_code=503, detail=f"Connection error: {exc}") from exc + + try: + transport.write(payload) + except OSError as exc: + transport.disconnect() + raise HTTPException(status_code=502, detail=f"Send error: {exc}") from exc + finally: + transport.disconnect() + + return { + "status": "sent", + "bytes_sent": len(payload), + "frame_size": len(frame), + "repeat": repeat, + "elapsed_s": round(time.monotonic() - t0, 3), + } + + +@app.post("/device/stop_monitoring_slow_drip") +def device_stop_monitoring_slow_drip( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + duration_s: float = Query(120.0, description="Total time to hold the session open (seconds)"), + interval_s: float = Query(3.0, description="Seconds between drip sends"), + connect_timeout: float = Query(5.0, description="TCP connect timeout"), +) -> dict: + """ + Hold a single TCP session open for *duration_s* seconds and drip + stop-monitoring frames into the device at a slow rate so its UART + RX FIFO has time to drain between sends. + + Sequence: + 1. Open TCP session. + 2. Send the wake preamble: SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA (so the device's protocol parser + is primed for a write command). + 3. Wait interval_s for the device to drain. + 4. Drip-send (SESSION_RESET + stop_monitoring_frame) every + interval_s until duration_s elapses. + 5. Opportunistically drain any bytes the device sends back (so + the modem's TX queue doesn't fill up). Successful drains are + counted in `bytes_received` — non-zero strongly suggests the + device has started responding to us. + 6. Close. + + Designed for units whose firmware is too busy with event-recording + to keep up with high-rate spam. Heavy spam overruns the UART FIFO; + slow drip stays under it. + + Compared to spam mode: ~40× fewer bytes/sec on the wire, but each + byte has a much higher chance of actually being parsed. + """ + log.info( + "POST /device/stop_monitoring_slow_drip host=%s tcp_port=%s duration=%.1fs interval=%.2fs connect_timeout=%.1fs", + host, tcp_port, duration_s, interval_s, connect_timeout, + ) + duration_s = max(1.0, min(duration_s, 600.0)) # clamp 1s..10min + interval_s = max(0.1, min(interval_s, 30.0)) + connect_timeout = max(0.1, connect_timeout) + + stop_frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + preamble = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + ) + + t0 = time.monotonic() + drips_sent = 0 + bytes_sent = 0 + bytes_received = 0 + + try: + sock = socket.create_connection((host, tcp_port), timeout=connect_timeout) + except OSError as exc: + raise HTTPException(status_code=503, detail=f"Connection error: {exc}") from exc + + # Short read timeout so opportunistic drains don't block. + sock.settimeout(0.1) + + try: + # Initial wake preamble. + try: + sock.sendall(preamble) + bytes_sent += len(preamble) + except OSError as exc: + raise HTTPException(status_code=502, detail=f"Preamble send failed: {exc}") from exc + + # Initial settle. + time.sleep(interval_s) + + # Try a non-blocking drain of any response to the wake. + try: + data = sock.recv(4096) + if data: + bytes_received += len(data) + log.info("slow_drip: device responded to wake preamble (%d bytes)", len(data)) + except socket.timeout: + pass + except OSError: + pass + + deadline = t0 + duration_s + drip = SESSION_RESET + stop_frame # 2 + 21 = 23 bytes per drip + send_error: Optional[str] = None + + while time.monotonic() < deadline: + try: + sock.sendall(drip) + bytes_sent += len(drip) + drips_sent += 1 + except OSError as exc: + send_error = f"{exc}" + log.warning("slow_drip: send failed after %d drips: %s", drips_sent, exc) + break + + # Drain any inbound bytes; ignore timeouts. + try: + data = sock.recv(4096) + if data: + bytes_received += len(data) + except socket.timeout: + pass + except OSError: + pass + + # Sleep the interval, but don't oversleep past the deadline. + remaining = deadline - time.monotonic() + if remaining <= 0: + break + time.sleep(min(interval_s, remaining)) + finally: + try: + sock.shutdown(socket.SHUT_RDWR) + except OSError: + pass + try: + sock.close() + except OSError: + pass + + elapsed = time.monotonic() - t0 + log.info( + "slow_drip done — drips=%d bytes_sent=%d bytes_received=%d in %.1fs", + drips_sent, bytes_sent, bytes_received, elapsed, + ) + return { + "status": "done", + "duration_s": round(elapsed, 2), + "drips_sent": drips_sent, + "bytes_sent": bytes_sent, + "bytes_received": bytes_received, + "preamble_bytes": len(preamble), + "drip_bytes": len(drip), + "send_error": send_error, + } + + +@app.post("/device/stop_monitoring_spam") +def device_stop_monitoring_spam( + host: str = Query(..., description="TCP host — modem IP"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + duration_s: float = Query(10.0, description="How long to hammer the device for (seconds)"), + connect_timeout: float = Query(0.5, description="Per-attempt TCP connect timeout (default 0.5s)"), + repeat: int = Query(3, description="Stop frames per TCP session (default 3)"), +) -> dict: + """ + Hammer the device with blind stop-monitoring sessions as fast as + possible for `duration_s` seconds. Each attempt: open TCP → write + SESSION_RESET + POLL handshake + STOP frames × repeat → close. No + response is read. + + Designed for units that are aggressively calling home — short + connect_timeout (default 500 ms) means every failed attempt loses + only that much time before retrying, so we can fit several attempts + per second even when the modem is mostly busy with its own outbound + sessions. + + Single HTTP call kicks off the whole burst; counters are returned + when it finishes. No streaming; if you want live progress, watch + SFM logs. + """ + log.info( + "POST /device/stop_monitoring_spam host=%s tcp_port=%s duration=%.1fs connect_timeout=%.3fs repeat=%d", + host, tcp_port, duration_s, connect_timeout, repeat, + ) + if repeat < 1: + repeat = 1 + duration_s = max(0.1, min(duration_s, 300.0)) # clamp 0.1s..5min + connect_timeout = max(0.05, connect_timeout) + + frame = build_bw_write_frame(SUB_STOP_MONITORING, b"") + payload = ( + SESSION_RESET + POLL_PROBE + + SESSION_RESET + POLL_DATA + + (frame * repeat) + ) + + t0 = time.monotonic() + deadline = t0 + duration_s + sent_ok = 0 + connect_failed = 0 + write_failed = 0 + + while time.monotonic() < deadline: + try: + sock = socket.create_connection((host, tcp_port), timeout=connect_timeout) + except OSError: + connect_failed += 1 + continue + try: + sock.sendall(payload) + sent_ok += 1 + except OSError: + write_failed += 1 + finally: + try: + sock.shutdown(socket.SHUT_RDWR) + except OSError: + pass + try: + sock.close() + except OSError: + pass + + elapsed = time.monotonic() - t0 + total = sent_ok + connect_failed + write_failed + log.info( + "stop_monitoring_spam done — sent=%d connect_failed=%d write_failed=%d in %.2fs", + sent_ok, connect_failed, write_failed, elapsed, + ) + return { + "status": "done", + "duration_s": round(elapsed, 2), + "sent_ok": sent_ok, + "connect_failed": connect_failed, + "write_failed": write_failed, + "total_attempts": total, + "rate_attempts_per_s": round(total / elapsed, 1) if elapsed > 0 else 0, + "payload_bytes": len(payload), + } + + +@app.post("/device/rescue") +def device_rescue( + port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), + baud: int = Query(38400, description="Serial baud rate"), + host: Optional[str] = Query(None, description="TCP host — modem IP or ACH relay"), + tcp_port: int = Query(DEFAULT_TCP_PORT, description=f"TCP port (default {DEFAULT_TCP_PORT})"), + connect_timeout: float = Query(5.0, description="TCP connect timeout in seconds (default 5)"), + recv_timeout: float = Query(5.0, description="Per-frame S3 recv timeout in seconds (default 5)"), + disable_ach: bool = Query(True, description="Disable Auto Call Home on the device before erasing"), + erase: bool = Query(True, description="Erase all stored events after disabling ACH"), +) -> dict: + """ + Rescue an uncooperative unit by squeezing all maintenance work into a + single TCP session. + + Designed for devices that are actively calling home to a separate ACH + server (BW or otherwise). While we hold this TCP session open the + modem cannot accept an inbound ACH call, so the order matters: + + 1. Short-timeout TCP connect (fails fast if the device is busy in + another session — the caller should retry in a tight loop). + 2. POLL handshake. + 3. (optional) Write call_home config with auto_call_home_enabled=false + so the device stops calling out even after we drop the session. + 4. (optional) Erase all stored events (0xA3 → 0x1C → 0x06 → 0xA2). + 5. Close the TCP session. + + Both `disable_ach` and `erase` default to true. Pass `?erase=false` if + you only want to silence the unit without wiping its events. + + Caller pattern (bash): + + until curl -sS --max-time 30 -X POST \\ + "http://localhost:8001/api/sfm/device/rescue?host=$IP&tcp_port=$P"; do + sleep 1 + done + """ + log.info( + "POST /device/rescue host=%s tcp_port=%s connect_timeout=%.1fs recv_timeout=%.1fs disable_ach=%s erase=%s", + host, tcp_port, connect_timeout, recv_timeout, disable_ach, erase, + ) + + steps: list[dict] = [] + t0 = time.monotonic() + + try: + with _build_client( + port, baud, host, tcp_port, + timeout=recv_timeout, + connect_timeout=connect_timeout, + ) as client: + steps.append({"step": "tcp_connect", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + try: + client.poll() + except Exception as exc: + log.warning("rescue: poll retry: %s", exc) + client.poll() + steps.append({"step": "poll", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + if disable_ach: + client.set_call_home_config(auto_call_home_enabled=False) + steps.append({"step": "disable_ach", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + if erase: + client.delete_all_events() + steps.append({"step": "erase", "ok": True, "elapsed_s": round(time.monotonic() - t0, 2)}) + + except ProtocolError as exc: + steps.append({"step": "error", "ok": False, "detail": f"protocol: {exc}"}) + raise HTTPException(status_code=502, detail={"message": f"Protocol error: {exc}", "steps": steps}) from exc + except OSError as exc: + steps.append({"step": "error", "ok": False, "detail": f"socket: {exc}"}) + # Connection refused / timed out → device busy in another session. Caller should retry. + raise HTTPException(status_code=503, detail={"message": f"Connection error: {exc}", "steps": steps}) from exc + except Exception as exc: + steps.append({"step": "error", "ok": False, "detail": str(exc)}) + raise HTTPException(status_code=500, detail={"message": f"Device error: {exc}", "steps": steps}) from exc + + conn_key = SFMCache.make_conn_key(host, tcp_port, port, baud) + cleared = get_cache().clear_device(conn_key) + return { + "status": "ok", + "elapsed_s": round(time.monotonic() - t0, 2), + "disable_ach": disable_ach, + "erase": erase, + "steps": steps, + "cache_cleared": cleared, + } + + @app.post("/device/monitor/start") def device_monitor_start( port: Optional[str] = Query(None, description="Serial port (e.g. COM5)"), @@ -1403,6 +1952,175 @@ def db_set_false_trigger( return {"status": "ok", "event_id": event_id, "false_trigger": value} +def _cleanup_event_files(row: dict) -> dict: + """ + Best-effort cleanup of on-disk waveform / sidecar / pickle / hdf5 files + associated with a deleted event row. Returns a dict of {kind: bool} for + what was actually removed (true) vs not found / failed (false). + """ + serial = row.get("serial") + bw_name = row.get("blastware_filename") + a5_name = row.get("a5_pickle_filename") + sc_name = row.get("sidecar_filename") + removed: dict = {} + if not serial: + return removed + store = _get_store() + # blastware_filename is the "base" — other files derive their paths from it + # via WaveformStore helpers. Sidecar and a5 may also be stored under their + # own column values if they ever diverged historically. + base_name = bw_name or a5_name or sc_name + if base_name: + bw_path, a5_path = store.paths_for(serial, base_name) + sc_path = store.sidecar_path_for(serial, base_name) + h5_path = store.hdf5_path_for(serial, base_name) + for kind, p in [("blastware", bw_path), ("a5_pickle", a5_path), + ("sidecar", sc_path), ("hdf5", h5_path)]: + try: + if p.exists(): + p.unlink() + removed[kind] = True + except OSError as exc: + log.warning("file cleanup failed for %s (%s): %s", p, kind, exc) + removed[kind] = False + return removed + + +@app.delete("/db/events/{event_id}") +def db_delete_event(event_id: str) -> dict: + """ + Hard-delete a single event from the SFM events table and remove any + associated on-disk waveform/sidecar/pickle/hdf5 files. + + Returns 404 if the event_id is not found. + """ + log.info("DELETE /db/events/%s", event_id) + deleted = _get_db().delete_event(event_id) + if deleted is None: + raise HTTPException(status_code=404, detail=f"Event {event_id} not found") + files_removed = _cleanup_event_files(deleted) + return { + "status": "ok", + "event_id": event_id, + "files_removed": files_removed, + } + + +class BulkDeleteBody(BaseModel): + """Body for POST /db/events/delete_bulk.""" + serial: Optional[str] = None + from_dt: Optional[str] = None # ISO-8601 + to_dt: Optional[str] = None # ISO-8601 + false_trigger: Optional[bool] = None + ids: Optional[list[str]] = None + confirm: bool = False + # Safety: when no `ids` are supplied, require this many max rows to + # actually be deleted; if the matched count exceeds it, the endpoint + # returns a dry-run-style summary instead. Pass None to disable. + max_rows: Optional[int] = 10000 + + +@app.post("/db/events/delete_bulk") +def db_delete_events_bulk(body: BulkDeleteBody) -> dict: + """ + Hard-delete multiple events at once, by filter and/or by id list. + + Filters (`serial`, `from_dt`, `to_dt`, `false_trigger`) combine with AND, + matching the same semantics as `GET /db/events`. `ids` is an additional + inclusion list. At least one filter or non-empty `ids` MUST be supplied + — refusing to wipe the whole table. + + Safety knobs: + - `confirm` MUST be `true` to actually delete. When false (default), + returns the match count without deleting (dry-run). + - `max_rows` (default 10,000) caps how many rows can be deleted in one + call by-filter; if the match count exceeds it, the endpoint returns + a count summary without deleting. Ignored when only `ids` is used. + + Returns: + { + "status": "ok" | "dry_run" | "too_many", + "matched": , + "deleted": , # 0 unless status == "ok" + "files_removed": , # total file unlink successes + "sample_serials": [...], # up to 5 distinct serials touched + } + """ + log.info( + "POST /db/events/delete_bulk serial=%s from=%s to=%s ft=%s ids=%d confirm=%s max=%s", + body.serial, body.from_dt, body.to_dt, body.false_trigger, + len(body.ids or []), body.confirm, body.max_rows, + ) + + from_parsed = datetime.datetime.fromisoformat(body.from_dt) if body.from_dt else None + to_parsed = datetime.datetime.fromisoformat(body.to_dt) if body.to_dt else None + + db = _get_db() + + # Dry-run path: count matches without deleting. + rows = db.query_events( + serial=body.serial, + from_dt=from_parsed, + to_dt=to_parsed, + false_trigger=body.false_trigger, + limit=1_000_000, # we want a true count, not a page + offset=0, + ) + if body.ids: + id_set = set(body.ids) + rows = [r for r in rows if r["id"] in id_set] + matched = len(rows) + sample_serials = sorted({r.get("serial") for r in rows[:50] if r.get("serial")})[:5] + + if not body.confirm: + return { + "status": "dry_run", + "matched": matched, + "deleted": 0, + "files_removed": 0, + "sample_serials": sample_serials, + "hint": "Set confirm=true in the request body to actually delete.", + } + + if body.max_rows is not None and not body.ids and matched > body.max_rows: + return { + "status": "too_many", + "matched": matched, + "deleted": 0, + "files_removed": 0, + "sample_serials": sample_serials, + "hint": ( + f"Matched {matched} > max_rows={body.max_rows}. Either raise " + f"max_rows in the body, narrow the filter, or supply an " + f"explicit `ids` list." + ), + } + + try: + deleted_rows = db.delete_events_bulk( + serial=body.serial, + from_dt=from_parsed, + to_dt=to_parsed, + false_trigger=body.false_trigger, + ids=body.ids, + ) + except ValueError as exc: + raise HTTPException(status_code=422, detail=str(exc)) from exc + + files_removed = 0 + for row in deleted_rows: + result = _cleanup_event_files(row) + files_removed += sum(1 for ok in result.values() if ok) + + return { + "status": "ok", + "matched": matched, + "deleted": len(deleted_rows), + "files_removed": files_removed, + "sample_serials": sample_serials, + } + + # ── /db/events/{id} — waveform file accessors ───────────────────────────────── # # These endpoints serve files from the persistent WaveformStore, so a Blastware diff --git a/sfm/waveform_store.py b/sfm/waveform_store.py index 93cd970..0d04460 100644 --- a/sfm/waveform_store.py +++ b/sfm/waveform_store.py @@ -300,6 +300,16 @@ class WaveformStore: except FileNotFoundError: pass + # read_blastware_file derives record_type from its path arg, but + # that arg is the tmp file (suffix ".bw") — so override with the + # original filename's encoded type (H/W/M/E/C in the BW AB0T + # scheme). Without this override every BW-imported event lands + # in the DB with record_type="Waveform" regardless of the actual + # type (Histogram, Manual, etc.). + ev.record_type = event_file_io.derive_record_type_from_filename( + source_path.name + ) + # Parse the BW ASCII report if one was supplied. Failures here # are non-fatal: we still write the binary + sidecar without the # rich derived fields.