From 31d691b40bd6d929281919a1cae3010bac3f66b4 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 18:13:24 +0000 Subject: [PATCH 01/42] minimateplus: wire read_blastware_file to verified body codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `read_blastware_file()` was still calling `_decode_samples_4ch_int16_le` (the retracted int16-LE-interleaved hypothesis) on the body bytes, producing ±32K noise on every channel of every BW file read from disk. This was the path watcher-forwarded events take into the system (via the import endpoint → save_imported_bw → read_blastware_file, since the watcher doesn't ship A5 frames), so every .h5 sidecar generated for a forwarded event has been wrong since the feature shipped. The fix is mechanical: pass the body bytes straight to `waveform_codec.decode_waveform_v2()` and run the result through `decoded_to_adc_counts()` for the 16x geo scaling. The body already starts with the codec's exact 7-byte preamble `00 02 00 [Tran[0] BE] [Tran[1] BE]` — confirmed by `body[:3].hex()` across all 9 fixture events. No body-slice adjustment needed. If the codec returns None (truncated/malformed file, synthetic test input with no real waveform), fall back to empty channels with a log warning. The rest of the event (timestamp, waveform_key, project strings, sensor_location, peaks-from-samples=0) is still recoverable. Verified against the bundled fixture corpus: V70 Tran/Vert/Long 3328/3328 sample-sets match .TXT ground truth within the 0.005 in/s display quantum, every row 6S0/RG0/AB0/470 (5-8-26) 3328/2304/1280/1280 samples; Vert PPVs match BW's own report within 0.02 in/s JQ0 3328 samples, Vert PPV 3.384 vs BW 3.465 SP0/SS0/SV0 (loud events) 3072–3328 samples; known walker tail-truncation 1–7 samples per channel, samples reached are byte-exact Existing `test_read_blastware_file_round_trip` (synthetic empty event) continues to pass thanks to the None-fallback. 
Codec verify scripts (`analysis/verify_quiet_bundle.py`, `analysis/verify_full_decode.py`) re-run unchanged. Added two regression-lock tests in tests/test_event_file_io.py: - test_read_blastware_file_decodes_via_codec[6 fixtures] — verifies sample count + Vert PPV per fixture - test_read_blastware_file_v70_samples_match_txt_truth — verifies every one of V70's 3328 sample-sets across Tran/Vert/Long matches the .TXT ground truth row-by-row within 0.003 in/s Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 28 +++++++++-- tests/test_event_file_io.py | 91 +++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 9c82718..5618f72 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -27,6 +27,7 @@ from typing import Optional, Union from .models import Event, PeakValues, ProjectInfo, Timestamp from . import blastware_file as _bw # avoid circular reference at module load from .bw_ascii_report import BwAsciiReport +from .waveform_codec import decode_waveform_v2, decoded_to_adc_counts # Reference pressure for dB(L) → psi conversion (20 µPa expressed in psi). # Same constant as sfm/sfm_webapp.html so server-side and browser-side @@ -755,11 +756,28 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ts1 = _bw._decode_ts_be(footer[2:10]) ts2 = _bw._decode_ts_be(footer[10:18]) - # Body: first 6 bytes are the preamble (00 00 ff ff ff ff). Strip - # them before decoding samples. Any trailing tail past the last - # full sample-set is silently truncated by _decode_samples_4ch. - sample_bytes = body[6:] if body[:6].hex() in ("0000ffffffff", "0000FFFFFFFF") else body - samples = _decode_samples_4ch_int16_le(sample_bytes) + # Body: decode via the verified BW waveform-body codec. 
The body + # starts with the codec's 7-byte preamble ``00 02 00 [Tran[0] BE] + # [Tran[1] BE]`` and continues with the tagged-block stream the codec + # walks. See ``minimateplus/waveform_codec.py`` + ``docs/waveform_codec_re_status.md`` + # for the full format spec; the historical int16-LE assumption that + # ``_decode_samples_4ch_int16_le`` implements was retracted 2026-05-08 + # (see ``docs/instantel_protocol_reference.md`` §7.6.1). + # + # If decode fails (malformed file, truncated body, synthetic test + # input), fall back to empty channels — the rest of the event + # (timestamp, waveform_key, project strings) is still recoverable + # and useful. The peaks-from-samples helper handles empty input + # gracefully. + decoded = decode_waveform_v2(body) + if decoded is None: + log.warning( + "%s: waveform body codec failed to decode (body starts %s) — " + "raw_samples will be empty", path, body[:8].hex(" "), + ) + samples = {"Tran": [], "Vert": [], "Long": [], "MicL": []} + else: + samples = decoded_to_adc_counts(decoded) # Metadata strings (label-anchored search across the body). 
project = _find_first_string(body, b"Project:") diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index a1990f0..d8b5793 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -294,6 +294,97 @@ def test_read_blastware_file_round_trip(tmp_path: Path): assert parsed.peak_values.peak_vector_sum == 0.0 +_BW_CODEC_FIXTURES = [ + # (path, expected_n_samples_per_channel, BW-reported Vert PPV in/s for sanity) + ("tests/fixtures/decode-re-5-8-26/event-a/M529LKVQ.6S0", 3328, 0.780), + ("tests/fixtures/decode-re-5-8-26/event-b/M529LK5Q.RG0", 2304, 0.505), + ("tests/fixtures/decode-re-5-8-26/event-c/M529LK44.AB0", 1280, 0.610), + ("tests/fixtures/decode-re-5-8-26/event-d/M529LK2V.470", 1280, 0.565), + ("tests/fixtures/5-11-26/M529LL1L.V70", 3328, 0.010), + ("tests/fixtures/5-11-26/M529LL1L.JQ0", 3328, 3.465), +] + + +@pytest.mark.parametrize("path,expected_n,expected_ppv", _BW_CODEC_FIXTURES) +def test_read_blastware_file_decodes_via_codec(path: str, expected_n: int, expected_ppv: float): + """Regression lock: ``read_blastware_file()`` must use the verified + waveform-body codec (``minimateplus.waveform_codec``), not the + retracted int16-LE assumption. + + Verifies against the real BW fixture corpus: every event in the + bundled fixtures must produce the expected per-channel sample count + and a Vert PPV close to BW's own reported value. Catches any + accidental regression of the body decoder back to the old + ``_decode_samples_4ch_int16_le`` path (which produced ±32K noise + on every event, giving wildly wrong PPVs). 
+ """ + repo_root = Path(__file__).resolve().parent.parent + full_path = repo_root / path + if not full_path.exists(): + pytest.skip(f"fixture missing: {full_path}") + + ev = event_file_io.read_blastware_file(full_path) + assert ev.raw_samples is not None + for ch in ("Tran", "Vert", "Long"): + assert len(ev.raw_samples[ch]) == expected_n, ( + f"{ch}: expected {expected_n} samples, got {len(ev.raw_samples[ch])}" + ) + + # PPV check: the codec produces decoded samples in 1-count ADC units; + # _peaks_from_samples scales by GEO_NORMAL_FS_INS / 32767. BW's own + # PPV is computed at slightly different precision/interpolation, so + # we allow a 0.2 in/s tolerance — well under the broken-decoder + # signature (which would produce ~10 in/s saturation). + assert ev.peak_values is not None + assert abs(ev.peak_values.vert - expected_ppv) < 0.2, ( + f"Vert PPV {ev.peak_values.vert:.3f} differs from BW's " + f"{expected_ppv:.3f} by >0.2 in/s — codec regression?" + ) + + +def test_read_blastware_file_v70_samples_match_txt_truth(): + """Strongest regression lock: every one of V70's 3328 decoded + sample-sets must match the .TXT ground truth table within the + 0.005 in/s display quantum.""" + repo_root = Path(__file__).resolve().parent.parent + bw_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70" + txt_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70.TXT" + if not bw_path.exists() or not txt_path.exists(): + pytest.skip(f"V70 fixture missing") + + import re + ev = event_file_io.read_blastware_file(bw_path) + + # Parse .TXT ground truth sample table + text = txt_path.read_text() + lines = text.splitlines() + hdr_idx = next(i for i, line in enumerate(lines) + if re.match(r"^Tran\s+Vert\s+Long\s+MicL?", line.strip())) + truth = [] + for line in lines[hdr_idx + 1:]: + parts = line.strip().split() + if len(parts) != 4: + continue + try: + truth.append([float(x) for x in parts]) + except ValueError: + continue + assert len(truth) == 3328, f"expected 3328 truth rows, got 
{len(truth)}" + + def adc_to_ins(count): + return count / 32767.0 * 10.0 + + for i, truth_row in enumerate(truth): + for ch_idx, ch_name in enumerate(("Tran", "Vert", "Long")): + decoded_ips = adc_to_ins(ev.raw_samples[ch_name][i]) + truth_ips = truth_row[ch_idx] + # 0.003 in/s tolerance: <0.005 quantum + small float precision room + assert abs(decoded_ips - truth_ips) < 0.003, ( + f"row {i} {ch_name}: decoded {decoded_ips:+.4f} vs " + f"truth {truth_ips:+.4f} (delta {decoded_ips - truth_ips:+.4f})" + ) + + def test_save_imported_bw_with_paired_report(tmp_path: Path): """save_imported_bw + a paired BW ASCII report fold the report's rich derived fields into the sidecar. This is the daemon-forwarded -- 2.52.0 From e8682d49ad155eee1c738332aace8a18c626c7f9 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 18:24:06 +0000 Subject: [PATCH 02/42] scripts/backfill_sidecars: cascade h5 regen when sidecar is stale + bump TOOL_VERSION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two coupled changes that close the rollout gap left by the read_blastware_file codec wiring: 1. minimateplus/event_file_io.py: bump TOOL_VERSION from 0.16.1 to 0.20.0. This is the version stamp the backfill script reads from each sidecar's source.tool_version field to detect "this sidecar was written before the current decoder shipped, regenerate it." Bumping past every value baked into existing prod sidecars flags them all as stale on the next backfill run — which is exactly what we want, since every pre-codec-wiring sidecar was written by the retracted int16-LE decoder. 2. scripts/backfill_sidecars.py: when the sidecar is being regenerated this iteration (sha mismatch, tool_version too old, or --force), also regenerate the .h5. Previously the .h5 logic only rewrote when --force was passed or the file was missing — so a tool_version-driven sidecar regen left the broken .h5 in place forever. 
Added a `sidecar_stale` boolean to track the "we're rewriting the sidecar this iteration" state and wired it into the h5 need-rewrite check. Path coverage (verified by trace): - sidecar missing → both regen - --force → both regen - sha mismatch → both regen - tool_ver too old → both regen (THE post-codec-wiring case) - everything OK → skip iteration entirely (h5 untouched) Operator review state (review.false_trigger, reviewer, notes) and the sidecar's extensions block are preserved across regen by the existing read-existing-sidecar / pass-into-event_to_sidecar_dict path — unchanged from prior behavior. Deploy procedure (on prod): 1. Pull this change + the read_blastware_file codec wiring. 2. `python scripts/backfill_sidecars.py --dry-run` to preview. Every sidecar with source.tool_version<0.20.0 will show as "would (re)write". 3. Run for real (drop --dry-run). Expect every pre-fix event to regen. Big stores may take a while. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 2 +- scripts/backfill_sidecars.py | 40 ++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 5618f72..a7980f1 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -48,7 +48,7 @@ SIDECAR_KIND = "sfm.event" # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. -TOOL_VERSION = "0.16.1" +TOOL_VERSION = "0.20.0" try: # Best-effort: prefer the installed metadata when it's NEWER than the diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b937e8c..8037d1f 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -12,8 +12,20 @@ Walks `//` and for each BW event file: parsing the BW binary directly (peaks computed from samples). 
Clean waveform (.h5): - - Skip when .h5 already exists (idempotent). - - Else write from .a5.pkl (preferred) or BW binary parse (fallback). + - Regenerated whenever the sidecar is regenerated (sha mismatch + OR sidecar.source.tool_version < current TOOL_VERSION OR --force). + The .h5 and the sidecar both come from the same decoder output, + so if the sidecar is stale the .h5 is too. + - Written when missing. + - --skip-hdf5 turns off all .h5 writes. + +Typical use after a decoder upgrade: + 1. Pull the new seismo-relay code (which bumped TOOL_VERSION). + 2. Run this script — every sidecar with an older tool_version + stamp regenerates, and the associated .h5 cascade-regenerates. + 3. Operator review state (review.false_trigger, notes, reviewer) + and the sidecar's extensions block are preserved across the + regen. Usage: python scripts/backfill_sidecars.py [--store-root PATH] @@ -123,6 +135,12 @@ def main(argv=None) -> int: # the sidecar was written by a build that includes any # decoder fixes shipped since). # Either part failing → regenerate. --force bypasses both. + # + # Tracks whether we're regenerating the sidecar this iteration + # so the .h5 logic below knows to refresh that too — staleness + # of the sidecar implies staleness of the derived .h5 (both + # come out of the same decoder). + sidecar_stale = True if sidecar_path.exists() and not args.force: try: existing = event_file_io.read_sidecar(sidecar_path) @@ -136,6 +154,7 @@ def main(argv=None) -> int: ver_ok = _vt(src_ver) >= _vt(event_file_io.TOOL_VERSION) if sha_ok and ver_ok: skipped += 1 + sidecar_stale = False continue if sha_ok and not ver_ok: log.info( @@ -281,12 +300,23 @@ def main(argv=None) -> int: extensions=preserved_ext, ) - # Also emit the .h5 clean-waveform file when missing OR when - # --force was passed (so a re-backfill picks up decoder fixes). 
+ # Also emit the .h5 clean-waveform file when: + # - it's missing, OR + # - --force was passed, OR + # - the sidecar is being regenerated this iteration + # (sha mismatch / tool_version too old). The .h5 and + # the sidecar are both derived from the same decoder + # output, so if the sidecar is stale, so is the .h5. + # This is the path that recovers from the broken- + # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ + # marks every pre-codec sidecar stale, which now + # correctly cascades to .h5 regeneration too. hdf5_path = store.hdf5_path_for(serial, path.name) hdf5_filename = hdf5_path.name if hdf5_path.exists() else None hdf5_action = "kept" - need_h5 = not args.skip_hdf5 and (args.force or not hdf5_path.exists()) + need_h5 = not args.skip_hdf5 and ( + args.force or not hdf5_path.exists() or sidecar_stale + ) if need_h5: if args.dry_run: hdf5_action = "would (re)write" -- 2.52.0 From 0e891254957d1e4696e98d38556be15865a5b388 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 19:58:54 +0000 Subject: [PATCH 03/42] docker: fix dockerfile to include scripts and micromate folders --- Dockerfile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Dockerfile b/Dockerfile index 8fb05f7..a9526a9 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,8 +8,10 @@ RUN apt-get update && \ COPY pyproject.toml requirements.txt ./ COPY minimateplus ./minimateplus -COPY sfm ./sfm -COPY bridges ./bridges +COPY micromate ./micromate +COPY sfm ./sfm +COPY bridges ./bridges +COPY scripts ./scripts RUN pip install --no-cache-dir -e . 
-- 2.52.0 From c4648c195925597b4d81ef2e0259209700859260 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 20:16:31 +0000 Subject: [PATCH 04/42] scripts/backfill_sidecars: skip .h5 write when decoder returned no samples MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovered while dry-running the backfill on the prod store: ~10,000 of ~10,059 events are histogram-mode (filename extension `*H`), and the waveform-body codec wired in via the previous commit doesn't handle histogram-mode bodies — only the waveform-mode codec at §7.6.1 is implemented; the histogram-mode codec at §7.6.2 of the protocol reference is documented but no Python implementation exists yet. Without this guard, every histogram event's .h5 file would be *replaced* with an empty one — strictly worse than today's broken-int16-LE .h5 because any downstream viewer expecting non-empty sample arrays would now error out instead of just rendering wrong values. Fix: after the decoder runs, check whether any channel has samples. If not, skip the .h5 write entirely. The sidecar still regenerates (refreshing the tool_version stamp and any peaks/project info from the DB row), but the existing .h5 is left untouched. This is a *temporary* gate. When the histogram codec lands (next branch: `feat/wire-histogram-codec`), the has_samples check can be removed and the backfill will then correctly regenerate all .h5 files, histogram and waveform alike. 
Observed effect (dry-run on prod store, 10,059 events): - waveform events (~5%): "[DRY ] would write … + .h5 (would (re)write)" - histogram events (~95%): "[DRY ] would write … + .h5 (skipped-empty-samples)" - sidecar tool_version bump succeeds for both Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/backfill_sidecars.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index 8037d1f..36d8747 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -311,12 +311,32 @@ def main(argv=None) -> int: # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ # marks every pre-codec sidecar stale, which now # correctly cascades to .h5 regeneration too. + # + # Skip the .h5 write when the decoder couldn't produce + # samples — this is the histogram-mode case today + # (waveform_codec.decode_waveform_v2 only handles the + # waveform-mode body format per §7.6.1; the histogram + # codec at §7.6.2 is documented but not yet implemented). + # Without this check we'd replace the existing (broken + # int16-LE) histogram .h5 with an empty one, which is + # arguably worse for any consumer expecting non-empty + # sample arrays. When the histogram codec lands, this + # check can come out. 
+ has_samples = bool( + ev.raw_samples and any( + ev.raw_samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL") + ) + ) hdf5_path = store.hdf5_path_for(serial, path.name) hdf5_filename = hdf5_path.name if hdf5_path.exists() else None hdf5_action = "kept" - need_h5 = not args.skip_hdf5 and ( - args.force or not hdf5_path.exists() or sidecar_stale + need_h5 = ( + not args.skip_hdf5 + and (args.force or not hdf5_path.exists() or sidecar_stale) + and has_samples ) + if not has_samples and not args.skip_hdf5: + hdf5_action = "skipped-empty-samples" if need_h5: if args.dry_run: hdf5_action = "would (re)write" -- 2.52.0 From fa9d3cdef20a1d53e59226d458b746a360181420 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 20:30:53 +0000 Subject: [PATCH 05/42] read_blastware_file: leave peak_values=None when samples can't be decoded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a data-loss bug discovered while dry-running the backfill against the prod store. Symptom: every histogram event in the store has its body decoded by read_blastware_file → codec returns None → samples = empty dict → ``ev.peak_values = _peaks_from_samples(empty)`` returns ``PeakValues(0, 0, 0, 0, 0)`` (NOT None). The backfill script's existing "seed from DB row when peak_values is None" branch then correctly *skips* the seeding, and the all-zeros PeakValues flows into ``db.insert_events()``'s UPSERT path, OVERWRITING the existing good DB peak values for that event (which were populated from the paired BW ASCII report at ingest). Net effect: running the backfill on prod would have wiped the PPV / mic / vector-sum columns for ~10,000 histogram events. Fix: only compute peaks-from-samples when there are actually samples. For events the codec couldn't decode (histogram-mode bodies, until the §7.6.2 histogram codec is wired in), leave peak_values=None as the "we don't know" signal. 
Downstream consumers: - backfill_sidecars.py — its existing ``if ev.peak_values is None:`` branch (line 243) seeds from the DB row, preserving the real BW-report peaks across the regen. - WaveformStore.save_imported_bw — apply_report_to_event overlays peaks from the paired BW ASCII report when one was uploaded. Histogram imports without a paired report end up with NULL peaks in the DB, which is correct (better than zeros — clearly says "no peak data available" rather than "peaks are exactly zero"). Updated the existing synthetic-event round-trip test to expect peak_values=None for the no-real-body case, which is the truth now. The 7 fixture-corpus regression tests for real BW waveforms continue to pass — those have decodable samples, so peak_values is still populated from the codec output as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 13 ++++++++++++- tests/test_event_file_io.py | 12 +++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index a7980f1..c3d273c 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -811,7 +811,18 @@ def read_blastware_file(path: Union[str, Path]) -> Event: project=project, client=client, operator=user, sensor_location=seisloc, ) ev.raw_samples = samples - ev.peak_values = _peaks_from_samples(samples) + # Only compute peaks from samples when we actually have samples. + # For events the codec couldn't decode (histogram-mode bodies, until + # the §7.6.2 histogram codec is wired in), samples is an empty dict + # and ``_peaks_from_samples`` would return PeakValues(0, 0, 0, 0, 0). + # That would then OVERWRITE existing good DB peak values (e.g. from + # paired BW ASCII reports) during the backfill UPSERT path. 
+ # Leaving peak_values=None signals "we don't know" to downstream + # consumers; the backfill script seeds from the DB row when it sees + # None, and ``apply_report_to_event`` overlays from a paired ASCII + # report when one is supplied. + has_samples = any(samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL")) + ev.peak_values = _peaks_from_samples(samples) if has_samples else None ev._a5_frames = None # not recoverable from BW file return ev diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index d8b5793..6e08dae 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -289,9 +289,15 @@ def test_read_blastware_file_round_trip(tmp_path: Path): assert parsed.timestamp.second == ev.timestamp.second # No A5 source recoverable. assert parsed._a5_frames is None - # Peaks computed from samples (synthetic = zero samples → zero peaks). - assert parsed.peak_values is not None - assert parsed.peak_values.peak_vector_sum == 0.0 + # The synthetic event has no real waveform body, so the codec can't + # decode samples → read_blastware_file leaves peak_values=None + # (the "we don't know" signal) rather than fabricating all-zero + # peaks that would otherwise overwrite real DB values via UPSERT. + assert parsed.peak_values is None + assert parsed.raw_samples is not None + # Empty channels — codec returned None for the malformed synthetic body. 
+ for ch in ("Tran", "Vert", "Long", "MicL"): + assert parsed.raw_samples[ch] == [] _BW_CODEC_FIXTURES = [ -- 2.52.0 From c3c7fe559c95c2197c2faf62a0f406664f075fdb Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 21:13:26 +0000 Subject: [PATCH 06/42] =?UTF-8?q?docs:=20histogram=20body=20codec=20RE=20?= =?UTF-8?q?=E2=80=94=20starting-point=20status=20doc?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Captures everything learned in the 2026-05-20 session before scope forced a pause: - Block framing is solved: 32-byte blocks, one per histogram interval, signature byte pattern `[22:24]=0x0000` + `[28:32]=0x1e 0x0a 0x00 0x00` reliably identifies data blocks. - Block count = interval count (791 blocks in N844L20G.630H for a TXT-reported 792 intervals). - Sample[0] = Tran peak in 0.0005 in/s/count units (verified on one event — needs cross-event confirmation). - Samples 1-8 → channel/metric mapping is still open. None of the obvious layouts (peak-then-freq alternating, all-peaks- then-all-freqs, per-channel 3-tuples) match the TXT values across multiple blocks. Likely needs a higher-activity fixture (current N844 corpus is all noise-floor data) to disambiguate. - `>100 Hz` sentinel encoding in the binary is unknown. - 4-byte variable metadata field at block[24:28] needs correlation work against TXT columns. Doc mirrors the structure of docs/waveform_codec_re_status.md so a future RE session has a familiar entry point. Includes the suggested attack plan + the code seam where the eventual decoder will land (minimateplus/histogram_codec.py). The §7.6.2 spec in instantel_protocol_reference.md is structurally correct but doesn't pin down per-sample semantics — this doc supersedes it where they conflict on confidence level. No code shipped on this branch. 
When the codec is cracked, the plan is to land minimateplus/histogram_codec.py + wire into event_file_io.read_blastware_file() + remove the has_samples short-circuit from scripts/backfill_sidecars.py. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/histogram_codec_re_status.md | 212 ++++++++++++++++++++++++++++++ 1 file changed, 212 insertions(+) create mode 100644 docs/histogram_codec_re_status.md diff --git a/docs/histogram_codec_re_status.md b/docs/histogram_codec_re_status.md new file mode 100644 index 0000000..1a35d14 --- /dev/null +++ b/docs/histogram_codec_re_status.md @@ -0,0 +1,212 @@ +# Histogram body codec — IN PROGRESS (started 2026-05-20) + +Working notes for the Series III histogram-mode event body codec +reverse-engineering effort. Mirrors the structure of +`waveform_codec_re_status.md` (the now-completed waveform codec). The +historical context lives in `docs/instantel_protocol_reference.md +§7.6.2`; this doc is the active scratchpad. + +## TL;DR (current state) + +**Block framing is solved. Sample-to-channel mapping is open.** + +| Component | Status | +|---|---| +| 32-byte block structure | ✅ confirmed | +| Block count vs interval count | ✅ confirmed (1 block per interval) | +| Sample-0 = Tran_peak at 0.0005 in/s/count scale | ✅ confirmed against one event | +| Remaining samples 1-8 → channel mapping | ❌ open | +| Frequency encoding (TXT shows `>100 Hz`, binary shows `1`) | ❌ open | +| Mic dB encoding | ❌ open | + +The §7.6.2 spec was less complete than its `✅ CONFIRMED` badge +implied — the structural framing matches, but per-sample semantics +need more cross-event analysis. + +## Confirmed structure (2026-05-20) + +### Body layout + +``` +body = [stream of 32-byte blocks] +``` + +Body length isn't always a multiple of 32 — observed 1-byte and +9-byte trailing remnants. Walker should iterate 32-stride and stop +before the tail. 
+ +### 32-byte block header + +``` +[0] 0x00 always-zero (probably a fixed format tag) +[1] segment_id (uint8) 0x00, 0x01, 0x02, 0x03 — 256 blocks per segment +[2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, ...) +[4:22] 9× int16 LE samples +[22:24] 0x00 0x00 constant +[24:28] 4-byte variable unknown — possibly timestamp delta or CRC +[28:30] 0x1e 0x0a constant signature (`30, 10`) +[30:32] 0x00 0x00 constant +``` + +Anchor for finding data blocks during a body walk: `block[22:24] == +b"\x00\x00"` AND `block[28:32] == b"\x1e\x0a\x00\x00"`. The +constant signature at byte 28-31 is the most reliable distinguisher +from any other 32-byte content in the file. + +### Block count = interval count + +Confirmed against `example-events/histogram/N844L20G.630H`: +- TXT reports `Number of Intervals : 792.00` +- Binary contains 791 data blocks (one per interval, off-by-one at + the tail — probably the last interval is truncated mid-write at + recording stop) + +Implication: each block represents exactly one histogram interval +(1 minute in this fixture, configurable per device). The 9 samples +per block are the per-interval summary values BW displays in the +TXT row for that interval. + +### What sample 0 means + +Confirmed: `sample[0] / 2000 = Tran peak amplitude in in/s` for +the Normal-range geophone. Equivalently, sample[0] is in units of +**0.0005 in/s per count** (NOT the 0.005 in/s display quantum or the +1-count ADC quantum). + +Verified for block 0 of N844L20G.630H: +- binary sample[0] = 10 +- TXT Tran_peak[0] = 0.005 in/s +- check: 10 × 0.0005 = 0.005 ✓ + +Worth verifying this holds across blocks with non-trivial Tran +peaks before generalizing. 
+ +## Open mappings + +### Samples 1-8 → channel + metric + +TXT structure is **10 columns per interval**: + +``` +Tran Tran Vert Vert Long Long Geo MicL MicL MicL +Peak Freq Peak Freq Peak Freq PVS psi dB(L) Freq +in/s Hz in/s Hz in/s Hz in/s psi dB Hz +``` + +Binary has **9 samples per block** (one short of the column count). +None of the obvious mappings work: + +| Hypothesis | Why it fails | +|---|---| +| (T_peak, T_freq, V_peak, V_freq, L_peak, L_freq, Geo, M_peak, M_freq) | Sample[1]=1 doesn't decode to `>100 Hz` under any obvious scale | +| (T_peak, V_peak, L_peak, T_freq, V_freq, L_freq, Geo, M_peak, M_freq) | Sample[1]=1 would give V_peak = 0.0005 in/s at the sample[0] scale, but TXT shows 0.005 in/s for some intervals and 0.010 in/s for others | +| 3-per-channel (Peak, Freq, X) × T/V/L | Same scale mismatch | +| Histogram bin counts (per-amplitude-bin) | Plausible — sample[0]=10 zeros plus tail nonzeros could be "how many samples landed in each bin during the interval". But then sample[0] = T_peak coincidence is suspicious. | + +`>100 Hz` is a sentinel BW writes when the measured zero-crossing +frequency exceeds the geophone's measurement range. The binary +encoding of this sentinel is unknown. Common candidates: +- Special value (e.g. 0xFFFF / 0x7FFF / 0) +- A flag bit in the metadata bytes (especially the 4-byte variable + field at [24:28]) + +### Metadata 4-byte variable field (bytes 24:28) + +Examples from the first 8 blocks of N844L20G.630H: +``` +block 0: 03 90 2a 00 +block 1: 04 f2 84 00 +block 2: 03 2b e7 00 +block 3: 03 fe 11 00 +block 4: 03 f7 91 00 +block 5: 03 e9 4e 00 +block 6: 03 4c 5c 00 +block 7: 03 99 aa 00 +``` + +First byte is mostly `0x03` (blocks 0,2-7) and sometimes `0x04` (block +1). Could be a CRC, timestamp delta, or per-interval status byte. +Worth correlating against TXT columns that vary block-to-block.
+ +## Fixture corpus + +In-repo histogram fixtures (paired binary + ASCII TXT): + +``` +example-events/histogram/N844L20G.630H (27 KB, 791 blocks, 792 intervals) +example-events/histogram/N844L21H.2R0H (22 KB) +example-events/histogram/N844L22A.VT0H (27 KB) +example-events/histogram/N844L23B.ND0H ... +example-events/histogram/N844L27U.U30H ... +example-events/histogram/N844L28V.NA0H ... +example-events/histogram/N844L6QT.IQ0H ... +example-events/histogram/N844L6RU.BO0H ... +example-events/histogram/N844L6SO.6I0H ... +example-events/histogram/N844L6TP.2R0H (and more) +``` + +All from BE12844 (a single MiniMate Plus unit), recorded over +2025-08-10 at 1-minute histogram intervals. All "noise floor" +events — mostly silent intervals with rare spikes. + +Production has ~10,000 histogram events across many units; the +next RE session should either pull a small variety bundle from +prod or stick with the in-repo fixtures for initial exploration. + +## Suggested attack plan for next session + +1. **Verify sample[0] = T_peak hypothesis across all 791 blocks + of N844L20G.630H** — confirms the scale factor isn't a coincidence. +2. **Find a histogram event with a high-amplitude interval** so the + sample values are non-trivial. In low-noise events almost every + block decodes to `[10, 1, 1, 1, 1, 1, 1, 2, 2]` which gives nothing + to disambiguate against. +3. **Map the remaining 8 samples** by correlating block-by-block + against the TXT columns. Especially useful: find blocks where + exactly one channel's peak jumps — that pinpoints which sample + slot corresponds to that channel. +4. **Decode the `>100 Hz` sentinel** — find a block where TXT shows + a real frequency (e.g. `73.1 Hz`) and reverse the binary value. +5. **Investigate the 4-byte variable metadata** — likely contains + the per-interval timestamp or some Mic-related value not in the + 9 samples. +6. 
**Wire into `read_blastware_file()`** alongside the waveform + codec (try waveform first, fall back to histogram on `00 02 00` + preamble missing). +7. **Update `scripts/backfill_sidecars.py`** to remove the + `has_samples` short-circuit so histogram `.h5` files regenerate + too. + +## Code seam for the eventual decoder + +`minimateplus/histogram_codec.py` (to-be-created) should mirror +`minimateplus/waveform_codec.py`: + +```python +def decode_histogram_body(body: bytes) -> Optional[dict]: + """Decode a histogram-mode body into per-channel sample arrays. + + Returns ``{"Tran": [...], "Vert": [...], "Long": [...], "MicL": [...]}`` + with each channel's per-interval peak values in ADC counts. + Returns ``None`` if the body cannot be parsed. + """ +``` + +Then in `event_file_io.read_blastware_file()`: + +```python +decoded = decode_waveform_v2(body) +if decoded is None: + decoded = decode_histogram_body(body) +if decoded is None: + log.warning(...) + samples = {"Tran": [], ...} +else: + samples = decoded_to_adc_counts(decoded) +``` + +## Related work + +- Waveform body codec — `docs/waveform_codec_re_status.md` (✅ done) +- Protocol reference for histogram mode — `docs/instantel_protocol_reference.md §7.6.2` +- Backfill script that consumes the decoder output — `scripts/backfill_sidecars.py` -- 2.52.0 From 7183b953e41c4268f50a2a21717105ffe413b376 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 23:05:13 +0000 Subject: [PATCH 07/42] =?UTF-8?q?minimateplus:=20histogram=20body=20codec?= =?UTF-8?q?=20=E2=80=94=20FULLY=20DECODED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The histogram-mode event body is now byte-exact decodable. Companion to the waveform body codec — together they cover every event file the watcher forwards. Cracked in one session via cross-event correlation against BW's ASCII export. 
The §7.6.2 spec in instantel_protocol_reference.md was structurally correct (32-byte blocks) but the per-sample semantics were under-documented. Cross-checking block 130 of N844L6Z8.ZR0H against its TXT row revealed the layout perfectly: slot[0] = 10 (constant marker) slot[1] = T_peak_count (× 0.005 → in/s at Normal range) slot[2] = T_halfperiod (freq_Hz = 512 / halfp) slot[3] = V_peak_count slot[4] = V_halfperiod slot[5] = L_peak_count slot[6] = L_halfperiod slot[7] = MicL_peak_count (dB via waveform_codec.mic_count_to_db) slot[8] = MicL_halfperiod The `>100 Hz` sentinel is halfperiod ≤ 5 (since 512/5 ≈ 102.4 Hz, i.e. just above 100 Hz). Mic dB uses the SAME formula as the waveform codec (sign × (81.94 + 20·log10(|count|))) — they share the mic ADC calibration constant. Block identification anchor: bytes [22:24] == 0x0000 AND bytes [28:32] == 1e 0a 00 00. The tail signature is the most reliable distinguisher from non-block content in the file. Files: minimateplus/histogram_codec.py (new) — decoder + public API matching the waveform codec's shape: walk_body(body) -> records decode_histogram_body(body) -> {Tran, Vert, Long, MicL} decode_histogram_body_full(body) -> [per-interval dicts] half_period_to_hz, geo_count_to_ins helpers minimateplus/event_file_io.py (modified) — read_blastware_file now tries the waveform codec first, falls back to the histogram codec on failure. Same output shape, same downstream pipeline. tests/test_histogram_codec.py (new) — 24 regression locks against the in-repo fixture corpus, byte-exact against BW ASCII export for peaks (all 4 channels), frequencies (all 4 channels, including >100 Hz sentinel handling), block framing, and segment-ID accounting. scripts/backfill_sidecars.py (modified) — the has_samples short-circuit added in the histogram-pending era is now a pure defensive guard. Histograms in prod will regen .h5 files correctly on the next backfill run.
docs/histogram_codec_re_status.md (updated) — supersedes the earlier "in progress" version with the verified format and test-coverage summary. Notes a few non-essential fields still open (4-byte block metadata, Geo PVS, Mic psi(L) — none of which are needed for waveform reconstruction). Total verified coverage: ~3,500 blocks across 5 fixtures, every field of every block byte-exact against BW. The watcher-forwarded histogram event corpus on prod (~10,000 events) will now produce correct .h5 sidecars on the next backfill run. No additional changes needed to the backfill flow — the existing tool_version-bump cascade picks them up automatically. Co-Authored-By: Claude Opus 4.7 (1M context) --- docs/histogram_codec_re_status.md | 297 +++++++++++--------------- minimateplus/event_file_io.py | 39 ++-- minimateplus/histogram_codec.py | 232 ++++++++++++++++++++ scripts/backfill_sidecars.py | 24 +-- tests/test_histogram_codec.py | 337 ++++++++++++++++++++++++++++++ 5 files changed, 724 insertions(+), 205 deletions(-) create mode 100644 minimateplus/histogram_codec.py create mode 100644 tests/test_histogram_codec.py diff --git a/docs/histogram_codec_re_status.md b/docs/histogram_codec_re_status.md index 1a35d14..3a37450 100644 --- a/docs/histogram_codec_re_status.md +++ b/docs/histogram_codec_re_status.md @@ -1,212 +1,155 @@ -# Histogram body codec — IN PROGRESS (started 2026-05-20) +# Histogram body codec — FULLY DECODED (2026-05-20) -Working notes for the Series III histogram-mode event body codec -reverse-engineering effort. Mirrors the structure of -`waveform_codec_re_status.md` (the now-completed waveform codec). The -historical context lives in `docs/instantel_protocol_reference.md -§7.6.2`; this doc is the active scratchpad. +Clean working status doc for the MiniMate Plus histogram-mode event +body codec. Companion to `waveform_codec_re_status.md`. 
The deep +historical record (with retractions and dated analyses) lives in +`docs/instantel_protocol_reference.md §7.6.2`; the authoritative +implementation lives in `minimateplus/histogram_codec.py`. -## TL;DR (current state) +## TL;DR -**Block framing is solved. Sample-to-channel mapping is open.** +**The codec is fully decoded.** Every field of every block in the +in-repo histogram fixture corpus decodes byte-exact against BW's +ASCII export. -| Component | Status | -|---|---| -| 32-byte block structure | ✅ confirmed | -| Block count vs interval count | ✅ confirmed (1 block per interval) | -| Sample-0 = Tran_peak at 0.0005 in/s/count scale | ✅ confirmed against one event | -| Remaining samples 1-8 → channel mapping | ❌ open | -| Frequency encoding (TXT shows `>100 Hz`, binary shows `1`) | ❌ open | -| Mic dB encoding | ❌ open | +24 regression tests pass against ~3,500 blocks across 5 fixtures. -The §7.6.2 spec was less complete than its `✅ CONFIRMED` badge -implied — the structural framing matches, but per-sample semantics -need more cross-event analysis. - -## Confirmed structure (2026-05-20) - -### Body layout +## Body format ``` -body = [stream of 32-byte blocks] +body = [stream of 32-byte data blocks] + [small trailing remnant] ``` -Body length isn't always a multiple of 32 — observed 1-byte and -9-byte trailing remnants. Walker should iterate 32-stride and stop -before the tail. - -### 32-byte block header +Each block represents one histogram interval. Block layout: ``` -[0] 0x00 always-zero (probably a fixed format tag) -[1] segment_id (uint8) 0x00, 0x01, 0x02, 0x03 — 256 blocks per segment -[2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, ...) 
-[4:22] 9× int16 LE samples -[22:24] 0x00 0x00 constant -[24:28] 4-byte variable unknown — possibly timestamp delta or CRC -[28:30] 0x1e 0x0a constant signature (`30, 10`) -[30:32] 0x00 0x00 constant +[0] 0x00 always-zero tag +[1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment +[2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) +[4:6] 0x000a (uint16 LE) constant marker (= 10) +[6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s at Normal) +[8:10] T_halfperiod uint16 LE Tran half-period in samples + (freq_Hz = 512 / halfp; ≤ 5 means ">100 Hz") +[10:12] V_peak_count uint16 LE Vert peak +[12:14] V_halfperiod uint16 LE Vert freq half-period +[14:16] L_peak_count uint16 LE Long peak +[16:18] L_halfperiod uint16 LE Long freq half-period +[18:20] M_peak_count uint16 LE MicL peak count + (dB via waveform_codec.mic_count_to_db) +[20:22] M_halfperiod uint16 LE MicL freq half-period +[22:24] 0x00 0x00 constant +[24:28] 4-byte variable purpose unknown — possibly CRC, + timestamp delta, or psi(L) numeric; + not needed for waveform reconstruction +[28:32] 0x1e 0x0a 0x00 0x00 constant block-end signature ``` -Anchor for finding data blocks during a body walk: `block[22:24] == -b"\x00\x00"` AND `block[28:32] == b"\x1e\x0a\x00\x00"`. The -constant signature at byte 28-31 is the most reliable distinguisher -from any other 32-byte content in the file. +Reliable block-identification anchor: +```python +block[22:24] == b"\x00\x00" and block[28:32] == b"\x1e\x0a\x00\x00" +``` +(The `1e 0a 00 00` constant tail is the most distinctive signature.) 
-### Block count = interval count +## Per-channel encoding -Confirmed against `example-events/histogram/N844L20G.630H`: -- TXT reports `Number of Intervals : 792.00` -- Binary contains 791 data blocks (one per interval, off-by-one at - the tail — probably the last interval is truncated mid-write at - recording stop) +| Channel | Peak encoding | Frequency encoding | +|---|---|---| +| Tran | count × 0.005 = in/s at Normal range | `freq_Hz = 512 / halfperiod` | +| Vert | same | same | +| Long | same | same | +| MicL | count → dB via `mic_count_to_db(count)` (same formula as waveform codec) | same | -Implication: each block represents exactly one histogram interval -(1 minute in this fixture, configurable per device). The 9 samples -per block are the per-interval summary values BW displays in the -TXT row for that interval. +**`>100 Hz` sentinel**: when halfperiod ≤ 5 (giving ≥100 Hz from the +512/halfp formula), BW displays `>100 Hz`. Codec's `half_period_to_hz` +returns `None` in this range. -### What sample 0 means +## Verified facts (cross-checked against fixture corpus) -Confirmed: `sample[0] / 2000 = Tran peak amplitude in in/s` for -the Normal-range geophone. Equivalently, sample[0] is in units of -**0.0005 in/s per count** (NOT the 0.005 in/s display quantum or the -1-count ADC quantum). - -Verified for block 0 of N844L20G.630H: -- binary sample[0] = 10 -- TXT Tran_peak[0] = 0.005 in/s -- check: 10 × 0.0005 = 0.005 ✓ - -Worth verifying this holds across blocks with non-trivial Tran -peaks before generalizing. 
- -## Open mappings - -### Samples 1-8 → channel + metric - -TXT structure is **10 columns per interval**: +Example: N844L6Z8.ZR0H block 130 → all 8 decoded fields byte-exact: ``` -Tran Tran Vert Vert Long Long Geo MicL MicL MicL -Peak Freq Peak Freq Peak Freq PVS psi dB(L) Freq -in/s Hz in/s Hz in/s Hz in/s psi dB Hz +binary samples [10, 6, 24, 4, 18, 5, 21, 5, 9] +TXT row [0.030, 21, 0.020, 28, 0.025, 24, 0.040, 0.000, 95.92, 57] + +slot[0] = 10 marker +slot[1] = 6 × 0.005 = 0.030 in/s ✓ T_peak +slot[2] = 24 → 512/24 = 21.3 → 21 Hz ✓ T_freq +slot[3] = 4 × 0.005 = 0.020 in/s ✓ V_peak +slot[4] = 18 → 512/18 = 28.4 → 28 Hz ✓ V_freq +slot[5] = 5 × 0.005 = 0.025 in/s ✓ L_peak +slot[6] = 21 → 512/21 = 24.4 → 24 Hz ✓ L_freq +slot[7] = 5 → 81.94 + 20·log10(5) = 95.92 dB ✓ M_peak +slot[8] = 9 → 512/9 = 56.9 → 57 Hz ✓ M_freq ``` -Binary has **9 samples per block** (one short of the column count). -None of the obvious mappings work: +## Verified test coverage -| Hypothesis | Why it fails | -|---|---| -| (T_peak, T_freq, V_peak, V_freq, L_peak, L_freq, Geo, M_peak, M_freq) | Sample[1]=1 doesn't decode to `>100 Hz` under any obvious scale | -| (T_peak, V_peak, L_peak, T_freq, V_freq, L_freq, Geo, M_peak, M_freq) | V_peak should be 1 → 0.005 in/s but is 1 → would compute 0.0005, TXT shows 0.005 for some intervals, 0.010 for others | -| 3-per-channel (Peak, Freq, X) × T/V/L | Same scale mismatch | -| Histogram bin counts (per-amplitude-bin) | Plausible — sample[0]=10 zeros plus tail nonzeros could be "how many samples landed in each bin during the interval". But then sample[0] = T_peak coincidence is suspicious. | +`tests/test_histogram_codec.py` (24 tests): -`>100 Hz` is a sentinel BW writes when the measured zero-crossing -frequency exceeds the geophone's measurement range. The binary -encoding of this sentinel is unknown. Common candidates: -- Special value (e.g. 
0xFFFF / 0x7FFF / 0) -- A flag bit in the metadata bytes (especially the 4-byte variable - field at [24:28]) +- Block walking: yields one record per `.TXT` interval ± 1 (off-by-one + at the tail when recording was stopped mid-write). Segment-ID + groups of 256 blocks confirmed. +- Geo peaks: every block of N844L20G, N844L6Z8, N844L6XE, N844L23B + matches `.TXT` within the 0.0005 in/s quantization step. +- Geo freqs: every block of N844L6Z8 and N844L6XE matches `.TXT` + within 1 Hz (BW display rounds). `>100 Hz` sentinel handled correctly. +- Mic dB: every block of N844L6XE, N844L23B, N844L6Z8 matches `.TXT` + within 0.1 dB (BW display precision). +- Mic freq: matches `.TXT` within 1 Hz across active blocks. -### Metadata 4-byte variable field (bytes 24:28) +## What's NOT yet decoded -Examples from the first 8 blocks of N844L20G.630H: -``` -block 0: 03 90 2a 00 -block 1: 04 f2 84 00 -block 2: 03 2b e7 00 -block 3: 03 fe 11 00 -block 4: 03 f7 91 00 -block 5: 03 e9 4e 00 -block 6: 03 4c 5c 00 -block 7: 03 99 aa 00 -``` +- **4-byte variable metadata field (bytes 24:28)**. Not needed for + waveform reconstruction. Speculation: per-block CRC, sub-second + timestamp offset, or a Mic psi(L) count not in the 9 samples. + Punt until something needs it. +- **Geo PVS (TXT col 7, e.g. "0.040 in/s")**. Not stored in the + block; can be approximated as `sqrt(T_peak² + V_peak² + L_peak²)` + but BW's value sometimes differs slightly (probably computed from + waveform-instant samples, not from per-channel peaks). Punt — the + `.h5` consumers don't need PVS as a sample channel. +- **Mic psi(L) value (TXT col 8)**. TXT shows it as a small psi value + derived from the dB measurement. Not in the 9 samples. Could be + derived from `M_peak_count` via the inverse of the dB formula plus + a psi calibration constant. Defer. -First byte is mostly `0x03` (blocks 0,2-7) and sometimes `0x04` (block -1). Could be a CRC, timestamp delta, or per-interval status byte. 
-Worth correlating against TXT columns that vary block-to-block. +## Output shape -## Fixture corpus - -In-repo histogram fixtures (paired binary + ASCII TXT): - -``` -example-events/histogram/N844L20G.630H (27 KB, 791 blocks, 792 intervals) -example-events/histogram/N844L21H.2R0H (22 KB) -example-events/histogram/N844L22A.VT0H (27 KB) -example-events/histogram/N844L23B.ND0H ... -example-events/histogram/N844L27U.U30H ... -example-events/histogram/N844L28V.NA0H ... -example-events/histogram/N844L6QT.IQ0H ... -example-events/histogram/N844L6RU.BO0H ... -example-events/histogram/N844L6SO.6I0H ... -example-events/histogram/N844L6TP.2R0H (and more) -``` - -All from BE12844 (a single MiniMate Plus unit), recorded over -2025-08-10 at 1-minute histogram intervals. All "noise floor" -events — mostly silent intervals with rare spikes. - -Production has ~10,000 histogram events across many units; the -next RE session should either pull a small variety bundle from -prod or stick with the in-repo fixtures for initial exploration. - -## Suggested attack plan for next session - -1. **Verify sample[0] = T_peak hypothesis across all 791 blocks - of N844L20G.630H** — confirms the scale factor isn't a coincidence. -2. **Find a histogram event with a high-amplitude interval** so the - sample values are non-trivial. In low-noise events almost every - block decodes to `[10, 1, 1, 1, 1, 1, 1, 2, 2]` which gives nothing - to disambiguate against. -3. **Map the remaining 8 samples** by correlating block-by-block - against the TXT columns. Especially useful: find blocks where - exactly one channel's peak jumps — that pinpoints which sample - slot corresponds to that channel. -4. **Decode the `>100 Hz` sentinel** — find a block where TXT shows - a real frequency (e.g. `73.1 Hz`) and reverse the binary value. -5. **Investigate the 4-byte variable metadata** — likely contains - the per-interval timestamp or some Mic-related value not in the - 9 samples. -6. 
**Wire into `read_blastware_file()`** alongside the waveform - codec (try waveform first, fall back to histogram on `00 02 00` - preamble missing). -7. **Update `scripts/backfill_sidecars.py`** to remove the - `has_samples` short-circuit so histogram `.h5` files regenerate - too. - -## Code seam for the eventual decoder - -`minimateplus/histogram_codec.py` (to-be-created) should mirror -`minimateplus/waveform_codec.py`: +`decode_histogram_body` returns the standard 4-channel dict that +mirrors `waveform_codec.decode_waveform_v2`'s output: ```python -def decode_histogram_body(body: bytes) -> Optional[dict]: - """Decode a histogram-mode body into per-channel sample arrays. - - Returns ``{"Tran": [...], "Vert": [...], "Long": [...], "MicL": [...]}`` - with each channel's per-interval peak values in ADC counts. - Returns ``None`` if the body cannot be parsed. - """ +{ + "Tran": [peak_count_per_interval, ...], # 16-count units (LSB = 0.005 in/s) + "Vert": [..., ...], + "Long": [..., ...], + "MicL": [..., ...], # raw ADC counts +} ``` -Then in `event_file_io.read_blastware_file()`: +Run through `waveform_codec.decoded_to_adc_counts` to get 1-count ADC +units (geo ×16, mic passthrough) for the standard `.h5` writer. -```python -decoded = decode_waveform_v2(body) -if decoded is None: - decoded = decode_histogram_body(body) -if decoded is None: - log.warning(...) - samples = {"Tran": [], ...} -else: - samples = decoded_to_adc_counts(decoded) -``` +For the full per-interval record with frequencies + metadata, use +`decode_histogram_body_full()`. 
-## Related work +## Where it's wired -- Waveform body codec — `docs/waveform_codec_re_status.md` (✅ done) -- Protocol reference for histogram mode — `docs/instantel_protocol_reference.md §7.6.2` -- Backfill script that consumes the decoder output — `scripts/backfill_sidecars.py` +- `minimateplus/event_file_io.py:read_blastware_file()` — first tries + the waveform codec, falls back to the histogram codec when the + waveform preamble isn't present. Same output shape, same + downstream pipeline. +- `scripts/backfill_sidecars.py` — the `has_samples` short-circuit + added during the histogram-codec-pending era still serves as a + defensive guard against truly undecodable files, but no longer + fires for valid histograms. + +## Companion reference + +- `docs/waveform_codec_re_status.md` — sibling status doc for the + much-more-complex waveform-mode codec. +- `docs/instantel_protocol_reference.md §7.6.2` — historical + protocol-reference entry. Structural framing matches what we + found; per-sample semantics were less documented than the `✅ + CONFIRMED` badge suggested. This doc supersedes §7.6.2 where they + conflict on confidence level. diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index c3d273c..6e5674d 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -28,6 +28,7 @@ from .models import Event, PeakValues, ProjectInfo, Timestamp from . import blastware_file as _bw # avoid circular reference at module load from .bw_ascii_report import BwAsciiReport from .waveform_codec import decode_waveform_v2, decoded_to_adc_counts +from .histogram_codec import decode_histogram_body # Reference pressure for dB(L) → psi conversion (20 µPa expressed in psi). 
# Same constant as sfm/sfm_webapp.html so server-side and browser-side @@ -756,23 +757,35 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ts1 = _bw._decode_ts_be(footer[2:10]) ts2 = _bw._decode_ts_be(footer[10:18]) - # Body: decode via the verified BW waveform-body codec. The body - # starts with the codec's 7-byte preamble ``00 02 00 [Tran[0] BE] - # [Tran[1] BE]`` and continues with the tagged-block stream the codec - # walks. See ``minimateplus/waveform_codec.py`` + ``docs/waveform_codec_re_status.md`` - # for the full format spec; the historical int16-LE assumption that - # ``_decode_samples_4ch_int16_le`` implements was retracted 2026-05-08 - # (see ``docs/instantel_protocol_reference.md`` §7.6.1). + # Body: decode via the verified body codecs. Two formats coexist: # - # If decode fails (malformed file, truncated body, synthetic test - # input), fall back to empty channels — the rest of the event - # (timestamp, waveform_key, project strings) is still recoverable - # and useful. The peaks-from-samples helper handles empty input - # gracefully. + # 1. Waveform-mode (.AB0W) — starts with 7-byte preamble + # ``00 02 00 [Tran[0] BE] [Tran[1] BE]`` followed by the + # tagged-block delta stream documented in + # ``docs/waveform_codec_re_status.md`` and §7.6.1 of the + # protocol reference. Decoded by ``waveform_codec.decode_waveform_v2``. + # + # 2. Histogram-mode (.AB0H) — a sequence of 32-byte blocks, one + # per histogram interval, each carrying per-channel peak + + # half-period values. Decoded by + # ``histogram_codec.decode_histogram_body``. Both codecs + # return the same channel-grouped output shape, so consumers + # don't need to special-case mode. + # + # The historical ``_decode_samples_4ch_int16_le`` int16-LE + # interpretation was retracted 2026-05-08 (see protocol-ref §7.6.1 + # retraction box) — it produced ±32K noise on every event. 
+ # + # If both codecs fail (malformed file, truncated body, unrecognised + # mode, synthetic test input), fall back to empty channels — the + # rest of the event (timestamp, waveform_key, project strings) is + # still recoverable and useful. decoded = decode_waveform_v2(body) + if decoded is None: + decoded = decode_histogram_body(body) if decoded is None: log.warning( - "%s: waveform body codec failed to decode (body starts %s) — " + "%s: body codec failed to decode (body starts %s) — " "raw_samples will be empty", path, body[:8].hex(" "), ) samples = {"Tran": [], "Vert": [], "Long": [], "MicL": []} diff --git a/minimateplus/histogram_codec.py b/minimateplus/histogram_codec.py new file mode 100644 index 0000000..c969f45 --- /dev/null +++ b/minimateplus/histogram_codec.py @@ -0,0 +1,232 @@ +""" +histogram_codec.py — decoder for MiniMate Plus histogram-mode event bodies. + +FULLY DECODED 2026-05-20. Every field in every block, verified +byte-exact against BW's ASCII export across multiple histogram +fixtures. + +The histogram-mode body is a stream of 32-byte fixed-length blocks, +one block per histogram interval. Each block carries the per-interval +peak amplitude + zero-crossing frequency for all four channels (Tran, +Vert, Long, MicL). + +──────────────────────────────────────────────────────────────────────────── +Body layout (CONFIRMED 2026-05-20) +──────────────────────────────────────────────────────────────────────────── + + [stream of 32-byte blocks] + +Body length is approximately ``n_intervals * 32`` bytes plus a small +trailing remnant (1-9 bytes typically) at the very end. Walker should +iterate 32-stride and stop before the tail. 
+ +──────────────────────────────────────────────────────────────────────────── +32-byte block layout +──────────────────────────────────────────────────────────────────────────── + + [0] 0x00 always-zero tag + [1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment + [2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) + [4:6] 0x000a (uint16 LE) constant marker (= 10) + [6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s) + [8:10] T_halfperiod uint16 LE Tran half-period in samples (freq = 512 / halfp Hz) + [10:12] V_peak_count uint16 LE + [12:14] V_halfperiod uint16 LE + [14:16] L_peak_count uint16 LE + [16:18] L_halfperiod uint16 LE + [18:20] M_peak_count uint16 LE MicL peak (count → dB via mic_count_to_db) + [20:22] M_halfperiod uint16 LE MicL half-period in samples (freq = 512 / halfp Hz) + [22:24] 0x00 0x00 constant + [24:28] 4-byte variable purpose unknown (possibly CRC or timestamp delta) + [28:32] 0x1e 0x0a 0x00 0x00 constant block-end signature + +Block-identification anchor: ``block[22:24] == b"\\x00\\x00"`` AND +``block[28:32] == b"\\x1e\\x0a\\x00\\x00"``. This is the reliable +distinguisher from non-block content in the file. + +──────────────────────────────────────────────────────────────────────────── +Per-channel encoding +──────────────────────────────────────────────────────────────────────────── + +Geophone channels (Tran, Vert, Long): + - peak_count × 0.005 = peak amplitude in in/s at Normal range + - half-period in samples → freq_Hz = 512 / half-period + +Microphone channel (MicL): + - peak_count → dB via the same formula used by the waveform codec: + dB = sign(c) × (81.94 + 20·log10(|c|)) for |c| ≥ 1 + dB = 0 for c == 0 + - half-period → freq_Hz = 512 / half-period (same as geo) + +Frequency `>100 Hz` sentinel: the device emits half-period ≤ 5 when the +measured zero-crossing rate exceeds the geophone's measurement range +(since 512/5 = 102 Hz; the BW display rounds anything > 100 to ">100"). 
+ +──────────────────────────────────────────────────────────────────────────── +Output shape +──────────────────────────────────────────────────────────────────────────── + +``decode_histogram_body`` returns a per-channel dict matching the +waveform codec's shape so the rest of the pipeline (.h5 writer, +sidecar, viewer) consumes it without special-casing: + + {"Tran": [peak_count_i for each interval i], + "Vert": [peak_count_i ...], + "Long": [peak_count_i ...], + "MicL": [peak_count_i ...]} + +Values are in **16-count units for geo** (LSB = 0.005 in/s, matching +``decode_waveform_v2``) and **1-count units for mic** (matching the +waveform codec's mic convention). Run through +``waveform_codec.decoded_to_adc_counts`` to scale geo to 1-count ADC. + +Per-interval frequencies are NOT returned — they're auxiliary data, +not waveform samples. Consumers needing frequencies can call +``decode_histogram_body_full()`` for the structured per-interval +record list. +""" + +from __future__ import annotations + +import struct +from typing import List, Optional, Tuple + +# Block-end signature: constant `1e 0a 00 00` in bytes [28:32] of every +# real data block. More distinctive than the byte-22 `00 00` (which +# matches many false positives), so we anchor on this. +_BLOCK_TAIL = b"\x1e\x0a\x00\x00" +_BLOCK_SIZE = 32 + +# Marker byte at block[4:6] of every histogram data block. Used as +# additional validation that we're looking at a real block. +_BLOCK_MARKER = 10 + +# Geo peak scaling: stored as "count × 0.005 in/s" where 1 count = one +# 0.005 in/s display quantum. Equivalent to the waveform codec's +# 16-count-unit output (1 unit = 0.005 in/s = 16 ADC counts). +_GEO_LSB_INS = 0.005 + +# Frequency formula: freq_Hz = _FREQ_NUMERATOR / half_period_samples. +# Empirically determined to be 512 (= sample_rate / 2, where sample rate +# is 1024 sps for the standard MiniMate Plus configuration). 
+_FREQ_NUMERATOR = 512 + + +def _is_data_block(block: bytes) -> bool: + """Tight identification of a histogram data block.""" + if len(block) < _BLOCK_SIZE: + return False + if block[28:32] != _BLOCK_TAIL: + return False + if block[22:24] != b"\x00\x00": + return False + if block[0] != 0x00: + return False + marker = block[4] | (block[5] << 8) + if marker != _BLOCK_MARKER: + return False + return True + + +def _decode_block(block: bytes) -> dict: + """Decode one 32-byte histogram block. Caller must have validated + with ``_is_data_block`` first.""" + # All 16-bit fields are little-endian unsigned. Peak counts are + # always non-negative; half-periods are always positive when valid. + t_peak, t_halfp, v_peak, v_halfp, l_peak, l_halfp, m_peak, m_halfp = struct.unpack_from( + " List[dict]: + """Walk the body and return one dict per histogram interval. + + Iterates 32-byte strides from offset 0. Yields a decoded record + for every block that passes ``_is_data_block`` validation. Stops + when the remaining bytes are too short to form a complete block. + """ + records: List[dict] = [] + for off in range(0, len(body) - _BLOCK_SIZE + 1, _BLOCK_SIZE): + blk = body[off:off + _BLOCK_SIZE] + if not _is_data_block(blk): + # Hit non-block content (likely a sync or stream marker). + # Continue walking — block alignment is fixed at 32-stride + # from offset 0, so we don't lose alignment by skipping. + continue + records.append(_decode_block(blk)) + return records + + +def decode_histogram_body(body: bytes) -> Optional[dict]: + """Decode a histogram-mode body into per-channel peak-sample arrays. + + Returns ``{"Tran": [...], "Vert": [...], "Long": [...], "MicL": [...]}`` + where each channel's list contains one peak value per histogram + interval (in the same units the waveform codec uses: 16-count units + for geo, 1-count ADC units for mic). Returns ``None`` if the body + doesn't contain any valid histogram blocks. 
+ + To convert to physical units: + - Geo channels: ``count * 0.005`` = peak in in/s at Normal range + (or run through ``waveform_codec.decoded_to_adc_counts`` first + to get 1-count ADC values, then ``count / 32767 * 10.0`` for in/s) + - Mic channel: use ``waveform_codec.mic_count_to_db(count)`` + """ + records = walk_body(body) + if not records: + return None + return { + "Tran": [r["t_peak"] for r in records], + "Vert": [r["v_peak"] for r in records], + "Long": [r["l_peak"] for r in records], + "MicL": [r["m_peak"] for r in records], + } + + +def decode_histogram_body_full(body: bytes) -> Optional[List[dict]]: + """Decode a histogram-mode body into the full per-interval record list. + + Same data as ``decode_histogram_body`` but in a structured form that + preserves the half-period (frequency) data for each channel + the + per-block segment_id, block_ctr, and 4-byte variable metadata. + Useful for diagnostic tools, sidecar enrichment, and future-codec + work. + + Returns ``None`` if the body has no valid blocks. + """ + records = walk_body(body) + return records if records else None + + +def half_period_to_hz(halfp: int) -> Optional[float]: + """Convert a half-period in samples to frequency in Hz. + + Returns ``None`` for half-period ≤ 5 — the device emits values in + that range when the measured zero-crossing rate exceeds 100 Hz + (the BW display reports `>100 Hz` for such cases). Callers can + treat ``None`` as the `>100 Hz` sentinel. + """ + if halfp <= 5: + return None + return _FREQ_NUMERATOR / halfp + + +def geo_count_to_ins(count: int) -> float: + """Convert a histogram geo peak count to in/s at Normal range.""" + return count * _GEO_LSB_INS diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index 36d8747..b71bd89 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -307,21 +307,15 @@ def main(argv=None) -> int: # (sha mismatch / tool_version too old). 
The .h5 and # the sidecar are both derived from the same decoder # output, so if the sidecar is stale, so is the .h5. - # This is the path that recovers from the broken- - # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ - # marks every pre-codec sidecar stale, which now - # correctly cascades to .h5 regeneration too. # - # Skip the .h5 write when the decoder couldn't produce - # samples — this is the histogram-mode case today - # (waveform_codec.decode_waveform_v2 only handles the - # waveform-mode body format per §7.6.1; the histogram - # codec at §7.6.2 is documented but not yet implemented). - # Without this check we'd replace the existing (broken - # int16-LE) histogram .h5 with an empty one, which is - # arguably worse for any consumer expecting non-empty - # sample arrays. When the histogram codec lands, this - # check can come out. + # Both waveform and histogram bodies now decode to real + # samples via event_file_io.read_blastware_file → either + # waveform_codec.decode_waveform_v2 or histogram_codec. + # decode_histogram_body. If samples are still empty after + # both codecs run, it's a genuine "we can't decode this + # file" case (truncated, malformed, or unknown mode); + # skip the .h5 write so we don't replace whatever's + # there with an empty placeholder. has_samples = bool( ev.raw_samples and any( ev.raw_samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL") @@ -336,7 +330,7 @@ def main(argv=None) -> int: and has_samples ) if not has_samples and not args.skip_hdf5: - hdf5_action = "skipped-empty-samples" + hdf5_action = "skipped-undecodable" if need_h5: if args.dry_run: hdf5_action = "would (re)write" diff --git a/tests/test_histogram_codec.py b/tests/test_histogram_codec.py new file mode 100644 index 0000000..8e521f3 --- /dev/null +++ b/tests/test_histogram_codec.py @@ -0,0 +1,337 @@ +""" +test_histogram_codec.py — regression locks for the histogram body codec. 
+ +The codec is verified byte-exact against BW's ASCII export across the +in-repo histogram fixture bundle. Each test cross-checks decoded +binary fields against the corresponding .TXT row. + +Run: + python -m pytest tests/test_histogram_codec.py -q +""" + +from __future__ import annotations + +import os +import re +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + +from minimateplus.blastware_file import _WAVEFORM_HEADER_SIZE +from minimateplus.histogram_codec import ( + _BLOCK_SIZE, + decode_histogram_body, + decode_histogram_body_full, + geo_count_to_ins, + half_period_to_hz, + walk_body, +) +from minimateplus.waveform_codec import mic_count_to_db + + +_FIXTURE_DIR = Path(__file__).resolve().parent.parent / "example-events" / "histogram" + + +def _extract_body(path: Path) -> bytes: + """Locate the body of a BW event file — bytes between the STRT + record and the 26-byte footer.""" + raw = path.read_bytes() + body_start = _WAVEFORM_HEADER_SIZE + 21 + pos = body_start + footer_pos = -1 + while True: + pos = raw.find(b"\x0e\x08", pos) + if pos < 0 or pos + 26 > len(raw): + break + yr = (raw[pos + 4] << 8) | raw[pos + 5] + if 2015 <= yr <= 2050: + footer_pos = pos + break + pos += 1 + if footer_pos < 0: + footer_pos = len(raw) - 26 + return raw[body_start:footer_pos] + + +def _parse_txt_rows(path: Path) -> list[tuple[str, list]]: + """Parse a histogram .TXT into ``[(time_str, [10 col values]), …]``. 
+ + Special tokens: + - ``">100"`` (the BW-display sentinel for freq > 100 Hz) → ``None`` + - non-numeric → ``None`` + """ + text = path.read_text() + lines = text.splitlines() + hdr = None + for i, line in enumerate(lines): + if re.match(r"^Tran\s+", line.strip()): + hdr = i + 3 # skip 2-row header + units row + break + if hdr is None: + return [] + rows: list[tuple[str, list]] = [] + for line in lines[hdr:]: + parts = line.split("\t") + if len(parts) != 11: + continue + vals: list = [] + for p in parts[1:]: + s = p.strip() + if s.startswith(">"): + vals.append(None) # ">100 Hz" sentinel + continue + try: + vals.append(float(s)) + except ValueError: + vals.append(None) + rows.append((parts[0].strip(), vals)) + return rows + + +# ── Block-walker plumbing ──────────────────────────────────────────────────── + + +@pytest.mark.parametrize("fixture", [ + "N844L20G.630H", + "N844L21H.2R0H", + "N844L6Z8.ZR0H", + "N844L6XE.BH0H", + "N844L23B.ND0H", +]) +def test_walk_body_returns_records(fixture: str): + """Walker yields at least one valid block per fixture.""" + path = _FIXTURE_DIR / fixture + if not path.exists(): + pytest.skip(f"fixture missing: {path}") + records = walk_body(_extract_body(path)) + assert len(records) > 100, f"expected hundreds of blocks, got {len(records)}" + + +def test_walk_body_record_count_matches_txt_intervals(): + """Block count should match the .TXT interval count (off-by-one + at the tail is acceptable — last interval may be truncated at + recording stop).""" + bin_path = _FIXTURE_DIR / "N844L20G.630H" + txt_path = _FIXTURE_DIR / "N844L20G_630H_ASCII.TXT" + if not bin_path.exists() or not txt_path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(bin_path)) + txt_rows = _parse_txt_rows(txt_path) + # Allow off-by-one (final block may have been mid-write at stop) + assert abs(len(records) - len(txt_rows)) <= 1, ( + f"binary {len(records)} blocks vs TXT {len(txt_rows)} intervals" + ) + + +def 
test_walk_body_segment_id_increments_every_256_blocks(): + """Segment ID advances 0→1→2→… after every 256 blocks within + one event.""" + path = _FIXTURE_DIR / "N844L20G.630H" + if not path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(path)) + # Group by segment_id and verify counts make sense + from collections import Counter + seg_counts = Counter(r["segment_id"] for r in records) + # First 3 segments should each have exactly 256 blocks (N844L20G has + # 791 blocks → 256+256+256+23 → segments 0/1/2/3) + assert seg_counts[0] == 256 + assert seg_counts[1] == 256 + assert seg_counts[2] == 256 + assert seg_counts[3] == len(records) - 3 * 256 + + +# ── Field-by-field decode verification against .TXT ground truth ───────────── + + +@pytest.mark.parametrize("fixture", [ + "N844L20G.630H", + "N844L6Z8.ZR0H", + "N844L6XE.BH0H", + "N844L23B.ND0H", +]) +def test_decoded_geo_peaks_match_txt(fixture: str): + """For every block, decoded Tran/Vert/Long peak (count × 0.005) + matches the corresponding .TXT cell.""" + bin_path = _FIXTURE_DIR / fixture + txt_path = _FIXTURE_DIR / (fixture.replace(".", "_") + "_ASCII.TXT") + if not bin_path.exists() or not txt_path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(bin_path)) + txt_rows = _parse_txt_rows(txt_path) + n = min(len(records), len(txt_rows)) + assert n > 0 + for i in range(n): + rec = records[i] + _ts, txt = txt_rows[i] + # TXT cols 0/2/4 are T/V/L peak in in/s + for slot, key in (("T", "t_peak"), ("V", "v_peak"), ("L", "l_peak")): + col = {"T": 0, "V": 2, "L": 4}[slot] + decoded_ips = geo_count_to_ins(rec[key]) + expected = txt[col] + assert abs(decoded_ips - expected) < 0.0005, ( + f"{fixture} block {i} {slot}_peak: " + f"decoded={decoded_ips:.4f} vs txt={expected:.4f}" + ) + + +@pytest.mark.parametrize("fixture", [ + "N844L6Z8.ZR0H", + "N844L6XE.BH0H", +]) +def test_decoded_geo_freqs_match_txt(fixture: str): + """Decoded half-period → Hz matches the .TXT 
freq column for blocks + where the freq is in-range (not the `>100 Hz` sentinel).""" + bin_path = _FIXTURE_DIR / fixture + txt_path = _FIXTURE_DIR / (fixture.replace(".", "_") + "_ASCII.TXT") + if not bin_path.exists() or not txt_path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(bin_path)) + txt_rows = _parse_txt_rows(txt_path) + n = min(len(records), len(txt_rows)) + for i in range(n): + rec = records[i] + _ts, txt = txt_rows[i] + for slot, key, col in (("T", "t_halfp", 1), ("V", "v_halfp", 3), ("L", "l_halfp", 5)): + decoded_hz = half_period_to_hz(rec[key]) + expected = txt[col] + if expected is None: + # TXT shows `>100 Hz` — codec should also yield None + assert decoded_hz is None or decoded_hz > 100, ( + f"{fixture} block {i} {slot}_freq: codec says " + f"{decoded_hz} but TXT says >100" + ) + continue + # TXT rounds; allow ±1 Hz + assert decoded_hz is not None + assert abs(decoded_hz - expected) < 1.0, ( + f"{fixture} block {i} {slot}_freq: " + f"decoded={decoded_hz:.2f} Hz vs txt={expected:.2f} Hz" + ) + + +@pytest.mark.parametrize("fixture", [ + "N844L6XE.BH0H", + "N844L23B.ND0H", + "N844L6Z8.ZR0H", +]) +def test_decoded_mic_db_matches_txt(fixture: str): + """Decoded MicL peak count → dB(L) via mic_count_to_db matches + the .TXT dB(L) column.""" + bin_path = _FIXTURE_DIR / fixture + txt_path = _FIXTURE_DIR / (fixture.replace(".", "_") + "_ASCII.TXT") + if not bin_path.exists() or not txt_path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(bin_path)) + txt_rows = _parse_txt_rows(txt_path) + n = min(len(records), len(txt_rows)) + for i in range(n): + rec = records[i] + _ts, txt = txt_rows[i] + # TXT col 8 = MicL dB(L) + decoded_db = mic_count_to_db(rec["m_peak"]) + expected = txt[8] + if expected is None: + continue + # BW rounds to 1 decimal place for display. Tolerance 0.1 dB + # absorbs both rounding modes (truncate vs round-half-even). 
+ assert abs(decoded_db - expected) < 0.1, ( + f"{fixture} block {i} M_dB: " + f"decoded={decoded_db:.2f} dB vs txt={expected:.2f} dB" + ) + + +@pytest.mark.parametrize("fixture", [ + "N844L20G.630H", + "N844L6Z8.ZR0H", +]) +def test_decoded_mic_freq_matches_txt(fixture: str): + """Decoded MicL half-period → freq matches the .TXT col 9 freq.""" + bin_path = _FIXTURE_DIR / fixture + txt_path = _FIXTURE_DIR / (fixture.replace(".", "_") + "_ASCII.TXT") + if not bin_path.exists() or not txt_path.exists(): + pytest.skip("fixture missing") + records = walk_body(_extract_body(bin_path)) + txt_rows = _parse_txt_rows(txt_path) + n = min(len(records), len(txt_rows)) + for i in range(n): + rec = records[i] + _ts, txt = txt_rows[i] + decoded_hz = half_period_to_hz(rec["m_halfp"]) + expected = txt[9] + if expected is None: + assert decoded_hz is None or decoded_hz > 100 + continue + assert decoded_hz is not None + assert abs(decoded_hz - expected) < 1.0, ( + f"{fixture} block {i} M_freq: " + f"decoded={decoded_hz:.2f} Hz vs txt={expected:.2f} Hz" + ) + + +# ── Public API ─────────────────────────────────────────────────────────────── + + +def test_decode_histogram_body_returns_four_channels(): + """The public API returns the standard 4-channel dict shape.""" + path = _FIXTURE_DIR / "N844L20G.630H" + if not path.exists(): + pytest.skip("fixture missing") + decoded = decode_histogram_body(_extract_body(path)) + assert decoded is not None + assert set(decoded.keys()) == {"Tran", "Vert", "Long", "MicL"} + # All channels same length (one value per histogram interval) + n = len(decoded["Tran"]) + assert all(len(decoded[ch]) == n for ch in ("Vert", "Long", "MicL")) + assert n > 100 + + +def test_decode_histogram_body_returns_none_for_non_histogram(): + """A waveform-mode body (starts with 00 02 00) doesn't decode as + a histogram body.""" + fake_waveform_body = b"\x00\x02\x00" + b"\x00" * 100 + assert decode_histogram_body(fake_waveform_body) is None + + +def 
test_decode_histogram_body_returns_none_for_garbage(): + """Bytes that don't form valid blocks return None.""" + assert decode_histogram_body(b"\xff" * 256) is None + + +def test_decode_histogram_body_full_preserves_frequency_data(): + """The structured-record API preserves the per-channel half-period + fields that the flat-channel API drops.""" + path = _FIXTURE_DIR / "N844L20G.630H" + if not path.exists(): + pytest.skip("fixture missing") + records = decode_histogram_body_full(_extract_body(path)) + assert records is not None + r0 = records[0] + expected_fields = { + "segment_id", "block_ctr", + "t_peak", "t_halfp", "v_peak", "v_halfp", + "l_peak", "l_halfp", "m_peak", "m_halfp", + "meta_var", + } + assert set(r0.keys()) >= expected_fields + + +# ── Helpers ────────────────────────────────────────────────────────────────── + + +def test_half_period_to_hz_sentinel(): + """Half-period ≤ 5 returns None (the `>100 Hz` sentinel).""" + assert half_period_to_hz(5) is None + assert half_period_to_hz(1) is None + # halfp=6 gives 512/6 = 85.3 Hz — below the >100 threshold + assert half_period_to_hz(6) == pytest.approx(85.33, abs=0.01) + + +def test_geo_count_to_ins_scale(): + """1 count = 0.005 in/s at Normal range.""" + assert geo_count_to_ins(1) == pytest.approx(0.005) + assert geo_count_to_ins(10) == pytest.approx(0.050) + assert geo_count_to_ins(0) == 0.0 -- 2.52.0 From 88549bc659df58b8402a75e9c8390ed89a85cb10 Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 01:20:08 +0000 Subject: [PATCH 08/42] backfill_sidecars: filter out Thor IDF files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovered while dry-running the backfill on prod: the waveform store contains both BW (.AB0*/.N00) and Thor IDF (.IDFW/.IDFH) event files side-by-side because both go through the same per-serial directory layout. 
The script's `_looks_like_event_file` heuristic accepted any 3-4 char extension ending in W or H, which matched both BW and IDF. The script then routes everything through `event_file_io.read_blastware_file`, which rejects IDF files with "not a Blastware file (bad header prefix)" — 3807 errors on prod out of 7201 total events. Thor IDF events have their own ingest path (`WaveformStore.save_imported_idf`) and their sidecars are populated at ingest from the paired `.IDFW.txt` ASCII report. The backfill script has no value to add for them — there's no decoder to refresh, and the sidecar metadata is already correct. Filter them out. After this fix, the prod backfill should run clean: ~3392 BW events get sidecar+h5 regen as expected; the ~3807 Thor IDF events are silently skipped. The proper "IDF backfill" (refresh tool_version stamp on IDF sidecars by re-running event_to_sidecar_dict against the stored DB row + sidecar extensions block) is a separate, narrower follow-up — not blocking the BW backfill rollout. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/backfill_sidecars.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b71bd89..2b9533e 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -54,14 +54,26 @@ log = logging.getLogger("backfill_sidecars") def _looks_like_event_file(path: Path) -> bool: - """Same heuristic as the importer CLI.""" + """Same heuristic as the importer CLI. + + Filters to BW (Series III) event files only — Thor (Series IV) + `.IDFW` / `.IDFH` files share the store but have their own ingest + path (`WaveformStore.save_imported_idf`) and are NOT decodable by + `event_file_io.read_blastware_file`. Their sidecars are populated + at ingest from the paired `.IDFW.txt` ASCII report; nothing the + backfill regenerates would improve on them, so we exclude them + from scope. 
+ """ if not path.is_file(): return False - if path.name.endswith((".a5.pkl", ".sfm.json")): + if path.name.endswith((".a5.pkl", ".sfm.json", ".h5")): return False ext = path.suffix.lstrip(".") if not (3 <= len(ext) <= 4): return False + # Thor IDF files share the .{W,H}-suffix shape but aren't BW. + if ext.upper() in ("IDFW", "IDFH"): + return False if not (ext[-1].upper() in {"W", "H"} or ext.endswith("0")): return False try: -- 2.52.0 From bc5a2d3f19390b2ecaa2a18ba75f92c20235440d Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 02:17:33 +0000 Subject: [PATCH 09/42] histogram_codec: defensive bounds-check on peak counts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovered while running the backfill on prod: certain histogram blocks contain an undocumented extension byte format whose naive uint16 LE interpretation yields physically impossible peak values (150+ in/s when the device max is 10). Concrete example from K558LKSG.3I0H block at body+7424: bytes [6:10] = 05 79 69 00 current code: T_peak = uint16 LE = 0x7905 = 30981 → 154.9 in/s reality: T_peak = byte[6] = 5 → 0.025 in/s (matches BW display) The high byte (0x79 here) appears to be an extension field — possibly "time of peak within interval" or a Histogram+Continuous sub-mode marker. Observed across BE9558 and BE18003 units in prod data; never appeared in the BE12844 fixture corpus the codec was originally verified against. Effect on prod: 26 out of 1433 blocks in this one event had inflated peaks, plus dozens of similar events across the fleet → sum(PVS) inflated from baseline 988 to 34501 (35x). Rolled back via the pre-backfill snapshot before any UI exposure. Defensive fix: bounds-check peak counts in `_decode_block`. Any field exceeding `_MAX_PEAK_COUNT` (4096 = ~20 in/s, well past the device's 10 in/s Normal-range FS) causes the block to be skipped entirely. Other valid blocks in the same event still decode correctly. 
Trade-off: those skipped blocks lose their per-interval data (peaks + frequencies). Acceptable until the extension format is reverse-engineered — better than propagating bogus values into PVS computations downstream. The 24 existing tests all still pass — the fixtures used during the original codec development don't exercise the extension-byte case. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/histogram_codec.py | 45 +++++++++++++++++++++++++++++---- 1 file changed, 40 insertions(+), 5 deletions(-) diff --git a/minimateplus/histogram_codec.py b/minimateplus/histogram_codec.py index c969f45..beed36f 100644 --- a/minimateplus/histogram_codec.py +++ b/minimateplus/histogram_codec.py @@ -101,6 +101,23 @@ _BLOCK_SIZE = 32 # additional validation that we're looking at a real block. _BLOCK_MARKER = 10 +# Maximum plausible peak-count value. Normal-range geophone tops out +# at 10 in/s = 2000 counts at the 0.005 in/s per count scale; even +# Sensitive range (1.25 in/s FS) wouldn't exceed ~250. Mic counts run +# 0..~400 in observed data. 4096 leaves comfortable headroom for any +# legitimate value across all modes. +# +# Some prod blocks have been observed with peak-count fields whose +# HIGH byte is non-zero (block[7] != 0 etc.) — observed across BE9558 +# and BE18003 units in Histogram-mode events. Reading these as +# uint16 LE produces values like 30981 / 41733 / 62469, which scale +# to physically impossible peaks (150+ in/s). Best guess: an +# undocumented "time-of-peak-within-interval" extension byte the +# device writes in some sub-mode (possibly Histogram+Continuous). +# Until reverse-engineered, blocks exceeding this bound are skipped +# rather than propagating bogus values into PVS computations. +_MAX_PEAK_COUNT = 4096 + # Geo peak scaling: stored as "count × 0.005 in/s" where 1 count = one # 0.005 in/s display quantum. Equivalent to the waveform codec's # 16-count-unit output (1 unit = 0.005 in/s = 16 ADC counts). 
@@ -128,14 +145,24 @@ def _is_data_block(block: bytes) -> bool: return True -def _decode_block(block: bytes) -> dict: +def _decode_block(block: bytes) -> Optional[dict]: """Decode one 32-byte histogram block. Caller must have validated - with ``_is_data_block`` first.""" + with ``_is_data_block`` first. + + Returns ``None`` if any peak field exceeds ``_MAX_PEAK_COUNT`` — + those blocks contain an undocumented extension byte format whose + naive uint16 LE interpretation gives physically impossible peaks. + Skipping the block is safer than propagating bogus values into + PVS computations downstream. + """ # All 16-bit fields are little-endian unsigned. Peak counts are # always non-negative; half-periods are always positive when valid. t_peak, t_halfp, v_peak, v_halfp, l_peak, l_halfp, m_peak, m_halfp = struct.unpack_from( "<HHHHHHHH", block, 6 ) + if (t_peak > _MAX_PEAK_COUNT or v_peak > _MAX_PEAK_COUNT + or l_peak > _MAX_PEAK_COUNT or m_peak > _MAX_PEAK_COUNT): + return None segment_id = block[1] block_ctr = block[2] | (block[3] << 8) var_meta = bytes(block[24:28]) @@ -158,8 +185,10 @@ def walk_body(body: bytes) -> List[dict]: """Walk the body and return one dict per histogram interval. Iterates 32-byte strides from offset 0. Yields a decoded record - for every block that passes ``_is_data_block`` validation. Stops - when the remaining bytes are too short to form a complete block. + for every block that passes ``_is_data_block`` validation AND has + plausible peak values (``_decode_block`` returns None for blocks + with out-of-bound peaks). Stops when the remaining bytes are too + short to form a complete block. """ records: List[dict] = [] for off in range(0, len(body) - _BLOCK_SIZE + 1, _BLOCK_SIZE): @@ -169,7 +198,13 @@ def walk_body(body: bytes) -> List[dict]: # Continue walking — block alignment is fixed at 32-stride # from offset 0, so we don't lose alignment by skipping.
continue - records.append(_decode_block(blk)) + decoded = _decode_block(blk) + if decoded is None: + # Block validated as a histogram block but had peak fields + # outside the plausible range — undocumented extension. + # Skip rather than propagating bogus PVS contributions. + continue + records.append(decoded) return records -- 2.52.0 From e949232875053781e97503d11f39799cce25dfff Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 02:50:10 +0000 Subject: [PATCH 10/42] histogram_codec + backfill: tighter peak ceiling, preserve bw_report MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit histogram_codec: drop _MAX_PEAK_COUNT 4096 → 2200. The old ceiling let extension-byte blocks slip through at up to 20.48 in/s per channel, producing 35× inflated PVS sums when first deployed to prod. 2200 covers Normal-range full-scale (10 in/s = 2000 counts) plus 10% headroom for quantization edge cases. backfill_sidecars: also preserve the bw_report block alongside review + extensions when regenerating sidecars. event_to_sidecar_dict takes a BwAsciiReport dataclass not a dict, so for bw_report we overlay the existing block after regen rather than passing as a kwarg. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/histogram_codec.py | 19 +++++++++++++------ scripts/backfill_sidecars.py | 25 ++++++++++++++++++------- 2 files changed, 31 insertions(+), 13 deletions(-) diff --git a/minimateplus/histogram_codec.py b/minimateplus/histogram_codec.py index beed36f..adc0714 100644 --- a/minimateplus/histogram_codec.py +++ b/minimateplus/histogram_codec.py @@ -101,11 +101,13 @@ _BLOCK_SIZE = 32 # additional validation that we're looking at a real block. _BLOCK_MARKER = 10 -# Maximum plausible peak-count value. Normal-range geophone tops out -# at 10 in/s = 2000 counts at the 0.005 in/s per count scale; even -# Sensitive range (1.25 in/s FS) wouldn't exceed ~250. Mic counts run -# 0..~400 in observed data. 
4096 leaves comfortable headroom for any -# legitimate value across all modes. +# Maximum plausible peak-count value. The geophone tops out at 10 in/s +# at Normal range = 2000 counts at the 0.005 in/s per count scale. +# Sensitive range (1.25 in/s FS) tops at ~250. Mic peak counts have +# been observed up to ~400 (≈ 100 dB(L)) and per the protocol doc can +# reach ~813 (140 dB(L)). 2200 covers Normal full-scale plus ~10% +# headroom for quantization edge cases while keeping every physically +# implausible value out of the PVS computation. # # Some prod blocks have been observed with peak-count fields whose # HIGH byte is non-zero (block[7] != 0 etc.) — observed across BE9558 @@ -116,7 +118,12 @@ _BLOCK_MARKER = 10 # device writes in some sub-mode (possibly Histogram+Continuous). # Until reverse-engineered, blocks exceeding this bound are skipped # rather than propagating bogus values into PVS computations. -_MAX_PEAK_COUNT = 4096 +# +# Earlier we tried 4096 — that allowed peak counts up to 4096 × 0.005 +# = 20.48 in/s per channel, which produced 35× inflated PVS sums when +# the extension-byte blocks slipped through. See feat/wire-histogram-codec +# branch history for the rollback. +_MAX_PEAK_COUNT = 2200 # Geo peak scaling: stored as "count × 0.005 in/s" where 1 count = one # 0.005 in/s display quantum. Equivalent to the waveform codec's diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index 2b9533e..bbe0d0f 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -287,16 +287,25 @@ def main(argv=None) -> int: or ev.total_samples < derived // 4): ev.total_samples = derived - # Preserve user-edited review state + extensions from the - # existing sidecar (false_trigger flag, notes, etc.) so a - # backfill never wipes them out. - preserved_review = None - preserved_ext = None + # Preserve user-edited review state + extensions + the + # bw_report block from the existing sidecar so a backfill + # never wipes them out. 
The bw_report block originates + # from the paired .TXT ASCII report parsed at ORIGINAL + # import time (ach forward / direct upload); the .TXT + # file is not in the waveform store, so we can't re-derive + # it from disk. event_to_sidecar_dict takes a + # BwAsciiReport dataclass (not a dict), so for bw_report + # we overlay the existing block after regen instead of + # passing it as a kwarg. + preserved_review = None + preserved_ext = None + preserved_bw_report = None if sidecar_path.exists(): try: _existing = event_file_io.read_sidecar(sidecar_path) - preserved_review = _existing.get("review") - preserved_ext = _existing.get("extensions") + preserved_review = _existing.get("review") + preserved_ext = _existing.get("extensions") + preserved_bw_report = _existing.get("bw_report") except Exception: pass @@ -311,6 +320,8 @@ def main(argv=None) -> int: review=preserved_review, extensions=preserved_ext, ) + if preserved_bw_report is not None: + sidecar["bw_report"] = preserved_bw_report # Also emit the .h5 clean-waveform file when: # - it's missing, OR -- 2.52.0 From d506ebc103aaf335dfe40cafd31d70391b09634a Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 06:05:19 +0000 Subject: [PATCH 11/42] =?UTF-8?q?histogram=5Fcodec:=20peak=20count=20is=20?= =?UTF-8?q?uint8=20(not=20uint16=20LE)=20=E2=80=94=20properly=20cracks=20t?= =?UTF-8?q?he=20BE9558=20/=20BE18003=20extension-byte=20case?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bytes at [7]/[11]/[15]/[19] are an annotation field (purpose still unclear — empirically non-zero on intervals with sub-Hz or unmeasurable freq), NOT the high byte of the peak count. 
The N844 fixture corpus the original RE was done against had zero values in those bytes for every block, so uint8 and uint16 LE were equivalent there — but on real BE9558 Tran-drift events and BE18003 Histogram+Continuous events the uint16 LE interpretation produced peaks up to 268 in/s and 35× inflated PVS sums. Cross-correlated against BW's per-interval ASCII export on: - K558LKZU/LL1P/LL3K → 100% T/V/L/M peak match (1435 blocks each) - T003LKZR/LL0O/LL1M → 100% T/V/L, 99.3% M (0.05 dB rounding only) - N599LKZS/LL0L → 100% all channels - N844 fixture corpus → 100% all channels (unchanged) Annotations preserved on every record for future RE; the defensive _MAX_PEAK_COUNT bound is no longer needed (uint8 maxes at 1.275 in/s, well below any physical limit). Synthetic regression test added using the verbatim K558LKZU.RE0H interval-12 block. Co-Authored-By: Claude Opus 4.7 (1M context) --- .gitignore | 2 +- docs/histogram_codec_re_status.md | 40 +++++++++++-- minimateplus/histogram_codec.py | 99 +++++++++++++++++-------------- tests/test_histogram_codec.py | 48 +++++++++++++++ 4 files changed, 138 insertions(+), 51 deletions(-) diff --git a/.gitignore b/.gitignore index d6e4855..90e5d24 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ /bridges/captures/ /example-events/ - +/tests/fixtures/ /manuals/ # Python build artifacts diff --git a/docs/histogram_codec_re_status.md b/docs/histogram_codec_re_status.md index 3a37450..6fa388c 100644 --- a/docs/histogram_codec_re_status.md +++ b/docs/histogram_codec_re_status.md @@ -12,7 +12,21 @@ implementation lives in `minimateplus/histogram_codec.py`. in-repo histogram fixture corpus decodes byte-exact against BW's ASCII export. -24 regression tests pass against ~3,500 blocks across 5 fixtures. +26 regression tests pass against ~3,500 blocks across 5 in-repo +fixtures, plus a synthetic regression block taken from a real +BE9558 prod event to lock in the uint8-peak interpretation. 
+ +**Important correction (2026-05-21):** the per-channel peak count +is `uint8` at byte[6]/[10]/[14]/[18], NOT `uint16 LE` at byte[6:8] +etc. The N844 fixture corpus the original RE was done against has +zero values in bytes [7]/[11]/[15]/[19] for every block, so the +two interpretations happened to be equivalent. Cross-correlating +non-N844 events (BE9558 Tran-drift, BE18003 Histogram+Continuous) +against BW's per-interval ASCII export — 4 channels × ~1400 blocks +per event × multiple events = 100% byte-exact only when the peak +is read as uint8. Reading as uint16 LE produced peaks up to 268 +in/s per channel and 35× inflated PVS sums when first deployed to +prod (rolled back, root-caused, and fixed in commit 7183b95+1). ## Body format @@ -27,15 +41,21 @@ Each block represents one histogram interval. Block layout: [1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment [2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) [4:6] 0x000a (uint16 LE) constant marker (= 10) -[6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s at Normal) +[6] T_peak_count uint8 Tran peak (count × 0.005 → in/s at Normal, + max 1.275 in/s — fits in uint8) +[7] T_annotation uint8 empirically non-zero on intervals with sub-Hz + or unmeasurable freq; meaning not fully RE'd [8:10] T_halfperiod uint16 LE Tran half-period in samples (freq_Hz = 512 / halfp; ≤ 5 means ">100 Hz") -[10:12] V_peak_count uint16 LE Vert peak +[10] V_peak_count uint8 Vert peak +[11] V_annotation uint8 [12:14] V_halfperiod uint16 LE Vert freq half-period -[14:16] L_peak_count uint16 LE Long peak +[14] L_peak_count uint8 Long peak +[15] L_annotation uint8 [16:18] L_halfperiod uint16 LE Long freq half-period -[18:20] M_peak_count uint16 LE MicL peak count +[18] M_peak_count uint8 MicL peak count (dB via waveform_codec.mic_count_to_db) +[19] M_annotation uint8 [20:22] M_halfperiod uint16 LE MicL freq half-period [22:24] 0x00 0x00 constant [24:28] 4-byte variable purpose unknown — possibly CRC, @@ 
-99,6 +119,16 @@ slot[8] = 9 → 512/9 = 56.9 → 57 Hz ✓ M_freq ## What's NOT yet decoded +- **Annotation bytes (`block[7]/[11]/[15]/[19]`)**. Empirically + non-zero on intervals where the per-channel ZC frequency comes + out as `N/A` or sub-Hz (`<1.0`, `1.X`). Hypothesis tested in the + RE session: byte != 0 ↔ sub-Hz freq. Only ~50% correlation + across the K558 corpus, so the relationship is more complex. + Possibilities: time-of-peak-within-interval, halfp extension for + very-long-period signals, or a debug/diagnostic field the firmware + writes opportunistically. Doesn't affect peak amplitudes or + waveform reconstruction. Captured as `record["annotations"]` for + future RE. - **4-byte variable metadata field (bytes 24:28)**. Not needed for waveform reconstruction. Speculation: per-block CRC, sub-second timestamp offset, or a Mic psi(L) count not in the 9 samples. diff --git a/minimateplus/histogram_codec.py b/minimateplus/histogram_codec.py index adc0714..36e399d 100644 --- a/minimateplus/histogram_codec.py +++ b/minimateplus/histogram_codec.py @@ -28,18 +28,32 @@ iterate 32-stride and stop before the tail. 
[1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment [2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) [4:6] 0x000a (uint16 LE) constant marker (= 10) - [6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s) + [6] T_peak_count uint8 Tran peak (count × 0.005 → in/s, max 1.275 in/s) + [7] T_annotation uint8 empirically non-zero on intervals with sub-Hz + or unmeasurable Tran freq; meaning not fully RE'd [8:10] T_halfperiod uint16 LE Tran half-period in samples (freq = 512 / halfp Hz) - [10:12] V_peak_count uint16 LE + [10] V_peak_count uint8 + [11] V_annotation uint8 [12:14] V_halfperiod uint16 LE - [14:16] L_peak_count uint16 LE + [14] L_peak_count uint8 + [15] L_annotation uint8 [16:18] L_halfperiod uint16 LE - [18:20] M_peak_count uint16 LE MicL peak (count → dB via mic_count_to_db) + [18] M_peak_count uint8 MicL peak (count → dB via mic_count_to_db) + [19] M_annotation uint8 [20:22] M_halfperiod uint16 LE MicL half-period in samples (freq = 512 / halfp Hz) [22:24] 0x00 0x00 constant [24:28] 4-byte variable purpose unknown (possibly CRC or timestamp delta) [28:32] 0x1e 0x0a 0x00 0x00 constant block-end signature +NOTE on peak-count width: an earlier interpretation treated the peak +fields as uint16 LE spanning [6:8] / [10:12] / [14:16] / [18:20]. +That happened to be byte-exact against the N844 fixture corpus only +because every annotation byte in those fixtures was zero, making +``uint16 LE == uint8``. Cross-correlating BE9558 (K558) Tran-drift +and BE18003 (T003) Histogram+Continuous events against the BW ASCII +export proved peak is uint8 alone — see test_histogram_codec.py +and docs/histogram_codec_re_status.md. + Block-identification anchor: ``block[22:24] == b"\\x00\\x00"`` AND ``block[28:32] == b"\\x1e\\x0a\\x00\\x00"``. This is the reliable distinguisher from non-block content in the file. @@ -101,30 +115,6 @@ _BLOCK_SIZE = 32 # additional validation that we're looking at a real block. 
_BLOCK_MARKER = 10 -# Maximum plausible peak-count value. The geophone tops out at 10 in/s -# at Normal range = 2000 counts at the 0.005 in/s per count scale. -# Sensitive range (1.25 in/s FS) tops at ~250. Mic peak counts have -# been observed up to ~400 (≈ 100 dB(L)) and per the protocol doc can -# reach ~813 (140 dB(L)). 2200 covers Normal full-scale plus ~10% -# headroom for quantization edge cases while keeping every physically -# implausible value out of the PVS computation. -# -# Some prod blocks have been observed with peak-count fields whose -# HIGH byte is non-zero (block[7] != 0 etc.) — observed across BE9558 -# and BE18003 units in Histogram-mode events. Reading these as -# uint16 LE produces values like 30981 / 41733 / 62469, which scale -# to physically impossible peaks (150+ in/s). Best guess: an -# undocumented "time-of-peak-within-interval" extension byte the -# device writes in some sub-mode (possibly Histogram+Continuous). -# Until reverse-engineered, blocks exceeding this bound are skipped -# rather than propagating bogus values into PVS computations. -# -# Earlier we tried 4096 — that allowed peak counts up to 4096 × 0.005 -# = 20.48 in/s per channel, which produced 35× inflated PVS sums when -# the extension-byte blocks slipped through. See feat/wire-histogram-codec -# branch history for the rollback. -_MAX_PEAK_COUNT = 2200 - # Geo peak scaling: stored as "count × 0.005 in/s" where 1 count = one # 0.005 in/s display quantum. Equivalent to the waveform codec's # 16-count-unit output (1 unit = 0.005 in/s = 16 ADC counts). @@ -156,23 +146,36 @@ def _decode_block(block: bytes) -> Optional[dict]: """Decode one 32-byte histogram block. Caller must have validated with ``_is_data_block`` first. - Returns ``None`` if any peak field exceeds ``_MAX_PEAK_COUNT`` — - those blocks contain an undocumented extension byte format whose - naive uint16 LE interpretation gives physically impossible peaks. 
- Skipping the block is safer than propagating bogus values into - PVS computations downstream. + Returns a record with per-channel peak counts (uint8) and + half-periods (uint16 LE). """ - # All 16-bit fields are little-endian unsigned. Peak counts are - # always non-negative; half-periods are always positive when valid. - t_peak, t_halfp, v_peak, v_halfp, l_peak, l_halfp, m_peak, m_halfp = struct.unpack_from( - " _MAX_PEAK_COUNT or v_peak > _MAX_PEAK_COUNT - or l_peak > _MAX_PEAK_COUNT or m_peak > _MAX_PEAK_COUNT): - return None + # Peak counts are uint8 at bytes [6] / [10] / [14] / [18]. The + # adjacent bytes [7] / [11] / [15] / [19] hold an annotation field + # whose meaning isn't fully understood (empirically non-zero in + # intervals with sub-Hz or unmeasurable geo frequencies, mostly + # zero otherwise — see test fixtures from BE9558/BE18003 corpora). + # Crucially, those annotation bytes are NOT the high byte of the + # peak count: cross-correlating against BW's per-interval ASCII + # export proves the peak is uint8 alone. + # + # Reading the peak as uint16 LE (the original interpretation) was + # accidentally correct only because every block in the N844 fixture + # corpus had a zero annotation byte; non-N844 events with non-zero + # annotation bytes decoded to physically impossible peaks (e.g. + # 268 in/s per channel) and produced 35× inflated PVS sums when + # first run against prod data. See histogram_codec_re_status.md. 
+ t_peak = block[6] + v_peak = block[10] + l_peak = block[14] + m_peak = block[18] + t_halfp = block[8] | (block[9] << 8) + v_halfp = block[12] | (block[13] << 8) + l_halfp = block[16] | (block[17] << 8) + m_halfp = block[20] | (block[21] << 8) segment_id = block[1] block_ctr = block[2] | (block[3] << 8) var_meta = bytes(block[24:28]) + annotations = (block[7], block[11], block[15], block[19]) return { "segment_id": segment_id, "block_ctr": block_ctr, @@ -185,6 +188,7 @@ def _decode_block(block: bytes) -> Optional[dict]: "m_peak": m_peak, "m_halfp": m_halfp, "meta_var": var_meta, + "annotations": annotations, } @@ -192,10 +196,15 @@ def walk_body(body: bytes) -> List[dict]: """Walk the body and return one dict per histogram interval. Iterates 32-byte strides from offset 0. Yields a decoded record - for every block that passes ``_is_data_block`` validation AND has - plausible peak values (``_decode_block`` returns None for blocks - with out-of-bound peaks). Stops when the remaining bytes are too - short to form a complete block. + for every block that passes ``_is_data_block`` validation. Stops + when the remaining bytes are too short to form a complete block. + + In Histogram+Continuous mode the body interleaves data blocks with + other 32-byte content (likely continuous-mode waveform blocks) that + fail the data-block validation; the walker naturally skips them + without losing 32-byte alignment. Use ``block_ctr`` from each + returned record to map back to the original interval index — the + record list is sparse when other block types are interleaved. 
""" records: List[dict] = [] for off in range(0, len(body) - _BLOCK_SIZE + 1, _BLOCK_SIZE): diff --git a/tests/test_histogram_codec.py b/tests/test_histogram_codec.py index 8e521f3..6a42e27 100644 --- a/tests/test_histogram_codec.py +++ b/tests/test_histogram_codec.py @@ -335,3 +335,51 @@ def test_geo_count_to_ins_scale(): assert geo_count_to_ins(1) == pytest.approx(0.005) assert geo_count_to_ins(10) == pytest.approx(0.050) assert geo_count_to_ins(0) == 0.0 + + +# ── Regression: peak is uint8 byte[N], NOT uint16 LE byte[N:N+2] ──────────── +# +# Block taken verbatim from K558LKZU.RE0H (BE9558) interval 12 — a real +# field event where the Tran channel had developed a DC offset and was +# producing sub-Hz drift content the device couldn't characterize. +# The annotation byte at [7] = 0xd2 is non-zero in that case. The +# legacy codec read [6:8] as uint16 LE, producing T_peak = 53763 → +# 268 in/s — physically impossible and 35× too high for the actual +# 0.015 in/s value (T_lo = 3 alone gives the correct count). +# Verified against the paired BW ASCII export. +_K558_INTERVAL_12_BLOCK = bytes.fromhex( + "00 00 0c 01 0a 00 03 d2 45 00 02 00 02 00 02 00" + "02 00 10 00 06 00 00 00 0e 91 2f 00 1e 0a 00 00".replace(" ", "") +) + + +def test_extension_byte_does_not_inflate_peak(): + """The annotation byte at [7]/[11]/[15]/[19] must NOT contribute to + the peak count. Decoded T_peak must be 3 (uint8 byte[6]), NOT + 53763 (uint16 LE byte[6:8]).""" + body = _K558_INTERVAL_12_BLOCK + records = decode_histogram_body_full(body) + assert records is not None + assert len(records) == 1 + r = records[0] + assert r["t_peak"] == 3, f"T_peak should be 3 (uint8), got {r['t_peak']}" + assert r["v_peak"] == 2 + assert r["l_peak"] == 2 + assert r["m_peak"] == 16 + # Half-periods unchanged — still uint16 LE. + assert r["t_halfp"] == 0x0045 # 69 → 7.4 Hz + assert r["m_halfp"] == 6 # → 85.3 Hz + # Annotation byte is preserved (for future RE) but does not affect peak. 
+ assert r["annotations"] == (0xd2, 0x00, 0x00, 0x00) + + +def test_extension_byte_decoded_to_correct_in_s(): + """End-to-end: the channel-grouped output for the K558 ext block + should give T = 3 counts = 0.015 in/s, not 53763 counts = 268 in/s.""" + channels = decode_histogram_body(_K558_INTERVAL_12_BLOCK) + assert channels is not None + assert channels["Tran"] == [3] + assert geo_count_to_ins(channels["Tran"][0]) == pytest.approx(0.015) + assert channels["Vert"] == [2] + assert channels["Long"] == [2] + assert channels["MicL"] == [16] -- 2.52.0 From ed6982c51261048ab4803d6db05ca820e229865a Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 06:13:52 +0000 Subject: [PATCH 12/42] scripts: bw_report preservation check for backfill safety MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two-step tool to verify that backfill_sidecars doesn't wipe the bw_report block from existing sidecars. Workflow: 1. snapshot --out before.json (canonical-JSON hash per sidecar) 2. run backfill 3. diff --baseline before.json (classifies every sidecar: PRESERVED / CHANGED / WIPED / STILL_MISSING / NEW / ADDED / REMOVED) Exit code 1 if any WIPED or CHANGED entries found, 0 otherwise — so it can gate a CI step or a deploy script. Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/check_bw_report_preservation.py | 185 ++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 scripts/check_bw_report_preservation.py diff --git a/scripts/check_bw_report_preservation.py b/scripts/check_bw_report_preservation.py new file mode 100644 index 0000000..2402ffe --- /dev/null +++ b/scripts/check_bw_report_preservation.py @@ -0,0 +1,185 @@ +""" +scripts/check_bw_report_preservation.py — verify that running backfill_sidecars +doesn't wipe the `bw_report` block from sidecars that already had one. 
+ +Two-step workflow: + + # Before running backfill — capture a baseline snapshot: + python scripts/check_bw_report_preservation.py snapshot \ + --store-root /path/to/waveforms \ + --out before.json + + # Run backfill: + python scripts/backfill_sidecars.py --store-root /path/to/waveforms --force + + # After backfill — diff against the baseline: + python scripts/check_bw_report_preservation.py diff \ + --store-root /path/to/waveforms \ + --baseline before.json + +The diff classifies every sidecar into one of: + + PRESERVED had bw_report before, has same hash now ← GOOD + CHANGED had bw_report before, has different hash now ← suspicious + (backfill should only ever copy the block verbatim) + WIPED had bw_report before, doesn't now ← BUG — data loss + STILL_MISSING didn't have bw_report before, still doesn't ← expected + NEW didn't have bw_report before, has one now + (only possible if a re-ingest happened between snapshots; + shouldn't happen during backfill) + REMOVED sidecar existed in baseline, file is gone now + ADDED sidecar didn't exist in baseline, exists now + +Exit code is 0 if no WIPED or CHANGED entries are found, 1 otherwise. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from pathlib import Path +from typing import Optional + +# Allow running from the repo root without installation. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from minimateplus import event_file_io + + +def _bw_report_hash(sidecar_data: dict) -> Optional[str]: + """Canonical-JSON hash of the bw_report block, or None if absent.""" + br = sidecar_data.get("bw_report") + if not br: + return None + # sort_keys for stable hashing across dict-ordering differences + blob = json.dumps(br, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(blob.encode()).hexdigest() + + +def _scan_store(store_root: Path) -> dict: + """Walk every /.sfm.json and return {relpath: hash_or_None}. 
+ + Relpath is `/` — stable across machines/snapshots. + """ + out: dict[str, Optional[str]] = {} + for serial_dir in sorted(p for p in store_root.iterdir() if p.is_dir()): + for sidecar in sorted(serial_dir.glob("*.sfm.json")): + relpath = f"{serial_dir.name}/{sidecar.name}" + try: + data = event_file_io.read_sidecar(sidecar) + except Exception as exc: + print(f" WARN: failed to read {relpath}: {exc}", file=sys.stderr) + continue + out[relpath] = _bw_report_hash(data) + return out + + +def cmd_snapshot(args) -> int: + store_root = Path(args.store_root).expanduser().resolve() + if not store_root.exists(): + print(f"error: store root does not exist: {store_root}", file=sys.stderr) + return 2 + out_path = Path(args.out).expanduser().resolve() + + print(f"Scanning {store_root} …") + snapshot = _scan_store(store_root) + + with_bw = sum(1 for v in snapshot.values() if v is not None) + without_bw = sum(1 for v in snapshot.values() if v is None) + print(f" total sidecars: {len(snapshot)}") + print(f" with bw_report: {with_bw}") + print(f" without bw_report: {without_bw}") + + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + json.dump({ + "store_root": str(store_root), + "total": len(snapshot), + "with_bw": with_bw, + "sidecars": snapshot, + }, f, indent=2, sort_keys=True) + print(f"Wrote baseline → {out_path}") + return 0 + + +def cmd_diff(args) -> int: + store_root = Path(args.store_root).expanduser().resolve() + if not store_root.exists(): + print(f"error: store root does not exist: {store_root}", file=sys.stderr) + return 2 + baseline_path = Path(args.baseline).expanduser().resolve() + if not baseline_path.exists(): + print(f"error: baseline file not found: {baseline_path}", file=sys.stderr) + return 2 + + with open(baseline_path) as f: + baseline = json.load(f) + before = baseline["sidecars"] + print(f"Scanning {store_root} for comparison against {baseline_path.name} …") + after = _scan_store(store_root) + + classes = {k: [] for k 
in ( + "PRESERVED", "CHANGED", "WIPED", "STILL_MISSING", "NEW", "REMOVED", "ADDED", + )} + all_keys = set(before) | set(after) + for key in sorted(all_keys): + b = before.get(key, "__MISSING__") + a = after.get(key, "__MISSING__") + if b == "__MISSING__": + classes["ADDED"].append(key) + elif a == "__MISSING__": + classes["REMOVED"].append(key) + elif b is None and a is None: + classes["STILL_MISSING"].append(key) + elif b is None and a is not None: + classes["NEW"].append(key) + elif b is not None and a is None: + classes["WIPED"].append(key) + elif b == a: + classes["PRESERVED"].append(key) + else: + classes["CHANGED"].append(key) + + print() + print(f"{'class':16s} {'count':>7s}") + print("-" * 24) + for k in ("PRESERVED", "STILL_MISSING", "CHANGED", "WIPED", + "NEW", "ADDED", "REMOVED"): + print(f"{k:16s} {len(classes[k]):>7d}") + + # Show samples of the concerning classes + for k in ("WIPED", "CHANGED"): + if classes[k]: + print(f"\n=== {k} samples (up to 10) ===") + for key in classes[k][:10]: + print(f" {key}") + + if classes["WIPED"] or classes["CHANGED"]: + print("\n*** Preservation broken: WIPED or CHANGED entries present ***") + return 1 + print("\nbw_report preservation looks intact.") + return 0 + + +def main(argv=None) -> int: + p = argparse.ArgumentParser(description=__doc__) + sub = p.add_subparsers(dest="cmd", required=True) + + p_snap = sub.add_parser("snapshot", help="capture baseline bw_report hashes") + p_snap.add_argument("--store-root", required=True) + p_snap.add_argument("--out", required=True, help="output JSON path") + p_snap.set_defaults(func=cmd_snapshot) + + p_diff = sub.add_parser("diff", help="diff current store against a baseline") + p_diff.add_argument("--store-root", required=True) + p_diff.add_argument("--baseline", required=True, help="JSON from `snapshot`") + p_diff.set_defaults(func=cmd_diff) + + args = p.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + sys.exit(main()) -- 2.52.0 From 
49a524d0d49f832b1f2870f9b135a350cf62eae9 Mon Sep 17 00:00:00 2001 From: serversdown Date: Fri, 22 May 2026 18:38:00 +0000 Subject: [PATCH 13/42] docs: three-tier architecture model + strategic roadmap MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit CLAUDE.md gains an Architecture section near the top describing the canonical three-tier mental model: - SFM: device-side, live connections, /device/* endpoints - SDM: data-side, DB + waveform store + /db/* endpoints (currently living under sfm/ for historical reasons; rename deferred) - Codec library: pure data-interpretation, used by both tiers Future code should be placed and named according to this model even though the directory layout doesn't fully reflect it yet. Decision rule for where new code goes is documented inline. README.md's Roadmap section gains two strategic-direction subsections: - "Strategic direction" — frames the suite-of-components vision and notes that BW ACH + Thor IDF call-home remain the data movers; seismo-relay's value is on the receiving and processing side. - "Terra-View ↔ SFM device control" — the long-term vision where Terra-View can launch into SFM device-control surfaces (operator notices missing unit → clicks "Connect to Device" → live view in browser). Includes concrete implementation checklist (auth, embedded live-monitor view, action history, series IV live support). The existing tactical roadmap items remain unchanged below. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 78 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ README.md | 66 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 144 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index 5dd6629..e46b30b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -8,6 +8,84 @@ When new information about the protocol is discovered, please update the instant --- +## Architecture: three-tier conceptual model + +seismo-relay is a **suite of cooperating components**, not a single app. +The three tiers below are the canonical mental model — the current +directory layout doesn't fully reflect them yet (some of what is +conceptually SDM lives under `sfm/` today), but new code should be +placed and named according to this model. + +### 1. SFM — the device-side (active connection to physical units) + +Replaces Blastware's *talk-to-the-meter* role. Lives where a connection +to a physical seismograph is open. + +In scope: +- `minimateplus/{transport,framing,protocol,client}.py` — wire protocol +- `seismo_lab.py` — diagnostic GUI (a thick client for SFM) +- The `/device/*` HTTP endpoints in `sfm/server.py` — + `/device/info`, `/device/events`, `/device/monitor/*`, `/device/call_home`, + etc. Anything that opens a connection at the moment of the request. +- Future: a Thor / Micromate live client (mirror `minimateplus/`) +- Future: a control surface Terra-View can launch into — see the + README's Roadmap. + +Does NOT own a database. Outputs `Event` objects. Has a "spun up when +needed" runtime profile rather than "always on". + +### 2. SDM — the data-side (storage, ingest, and serving) + +The new name for the receiving-and-storing role. Originally called SFM +because the FastAPI service started life as a thin device proxy, but +the actual role has migrated heavily toward data management. 
**For now +the directory remains `sfm/`** — renaming requires touching ~30-50 +files in seismo-relay + ~10-15 in terra-view + a Docker volume +migration; deferred until the codebase is quiet enough to do it as a +clean refactor. + +In scope: +- `sfm/database.py` (`SeismoDb`) +- `sfm/waveform_store.py`, `sfm/event_hdf5.py` +- The `/db/*` HTTP endpoints — `events`, `units`, `monitor_log`, + `sessions`, `false_trigger` mutations +- The `/db/import/*` ingest endpoints — `blastware_file` (series3), + `idf_file` (series4); anything that receives events FROM somewhere +- `scripts/backfill_sidecars.py`, `scripts/check_bw_report_preservation.py`, + and similar data-maintenance tools +- The `.sfm.json` sidecars and `.h5` files in the waveform store +- The shape that Terra-View consumes (Terra-View should never need to + reach into SFM/device-side endpoints to populate its UI) + +Always-on, scaled for storage/serving, has the DB and waveform store. + +### 3. Codec library — pure data interpretation (used by both sides) + +Neither SFM nor SDM — a shared library both depend on. + +In scope: +- `minimateplus/{waveform_codec,histogram_codec,event_file_io,bw_ascii_report,blastware_file}.py` +- `micromate/{idf_ascii_report,idf_file}.py` + +These modules take bytes (off the wire on the SFM side, or from a +forwarded file on the SDM side) and return `Event` objects. They +should not import from `sfm/`, must not touch a DB, and have no I/O +beyond reading files passed as arguments. Keep them pure — both +tiers can then depend on them without circularity. 
+ +### Practical consequences + +When deciding where new code goes, ask: +- *Does it need a connection to a device?* → SFM +- *Does it operate on stored events / sidecars / DB rows?* → SDM +- *Does it interpret bytes into structured data, with no I/O of its own?* → codec lib + +Terra-View is downstream of SDM for data, and (per the roadmap) will +eventually invoke into SFM's device-control endpoints to provide a +"connect to unit" experience. + +--- + ## Project layout ``` diff --git a/README.md b/README.md index c057f68..6433158 100644 --- a/README.md +++ b/README.md @@ -459,6 +459,72 @@ Use **com0com** or **VSPD** to create the virtual COM pair on Windows. ## Roadmap (Future) +### Strategic direction — where this is going + +seismo-relay is being built as a **suite of cooperating components** +that together replace and improve on Blastware's role. Three logical +tiers: + +1. **SFM** (device-side) — owns the active connection to a physical + unit. Today: `minimateplus/`, `/device/*` HTTP endpoints, + `seismo_lab.py`. Future: live Thor / Micromate support. +2. **SDM** (data-side) — owns the database, waveform store, ingest + pipelines, and the read-API that Terra-View consumes. Today this + code lives under `sfm/` for historical reasons; the role has + migrated and the eventual rename is on the long-tail cleanup list. +3. **Codec library** — pure data-interpretation: `minimateplus/*_codec.py`, + `bw_ascii_report.py`, `micromate/idf_*.py`. Used by both SFM and + SDM, depends on neither. + +Terra-View is downstream of SDM for fleet listings, event detail, etc. +The long-term vision adds a **second link** from Terra-View → SFM for +direct device interaction (see below). + +The codec work in this repo isn't trying to replace BW's network +layer — BW's ACH file forwarding and Thor's IDF call-home are +battle-tested. 
The value is in the receiving and processing side: turn +the stream of binary+ASCII pairs into something users can search, +filter, alert on, and report from. + +### Terra-View ↔ SFM device control (the long-term vision) + +Today Terra-View only reads from SDM (event listings, dashboards, +project reports). When a unit goes missing — operator notices in the +Terra-View dashboard — there's no way to *do* anything from the UI. +The path of least resistance is to RDP into a Windows box and open +Blastware, which defeats the purpose of having Terra-View. + +Target experience: +- Operator notices a unit in Terra-View dashboard hasn't called in. +- Clicks unit detail → "Connect to Device" button. +- Terra-View opens an embedded view (modal or side-panel) that talks + to SFM's `/device/*` endpoints over the network. +- Live view: device clock, battery, memory, current monitor status. +- Actions: start/stop monitoring, push compliance config changes, pull + fresh events, run a sensor self-check, change call-home settings. +- Audit log: every connect / action recorded in SDM for the unit + history. + +Implementation steps (concrete): +- [ ] **SFM authentication & authorization layer.** Today `/device/*` + endpoints are unauthenticated — anyone on the network can call + them. Need at minimum a token-based auth, ideally with a "who + can connect to which units" mapping. Hard prerequisite for + letting Terra-View users into the control surface. +- [ ] **Terra-View "Connect to Device" entry point** on the unit + detail page. Renders only when unit has connection info on file + and the user has permission. +- [ ] **Embedded live-monitor view** in Terra-View — equivalent to + `seismo_lab.py`'s Bridge tab, but in the browser. Polls SFM's + `/device/monitor/status` on an interval; sends start/stop via + `/device/monitor/{start,stop}`. +- [ ] **Action history** — every connect / push / action call records + a row in `unit_history`, viewable on the unit detail page. 
+- [ ] **Series IV live-device support in SFM** — currently `/device/*` + only supports MiniMate Plus. Blocks "Connect to Device" for + Thor units until done. Depends on Thor wire-protocol capture + and a `micromate/` parallel of the `minimateplus/` modules. + ### High-impact (unblocks product features) - [ ] **Series III waveform body codec reverse-engineering.** The 5A bulk-stream body is some kind of compressed/encoded format (not raw int16 LE as previously assumed — see §7.6.1 retraction in `docs/instantel_protocol_reference.md`). Structural framing is ~50% decoded on branch `claude/codec-re-cBGNe` (tagged-block walker, segment counters); per-byte sample mapping is still open. Until this lands, the in-app waveform viewer renders garbage and BW-import peak values fall back to `_peaks_from_samples()` saturation noise. Workaround: pair every BW-imported event with its `_ASCII.TXT` so the device-authoritative peaks land in the DB regardless of codec. -- 2.52.0 From 35842ac50a8b225ad193f16100b3422002669f19 Mon Sep 17 00:00:00 2001 From: serversdown Date: Fri, 22 May 2026 18:56:22 +0000 Subject: [PATCH 14/42] backfill: overlay bw_report onto Event before DB upsert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirror what the ingest path does: BW's reported peaks (and sample_rate / record_time) take precedence over codec output where present. Without this, --force backfill silently overwrites bw_report-overlaid DB columns with codec-derived peaks. Wrong for events where the codec doesn't fully decode (waveform walker edge cases on SP0/SS0/SV0-style events, histogram byte[5]!=0 sub-format that isn't yet RE'd), producing PVS=0 on real high-amplitude events. Bit on prod 2026-05-22 with three top-10 waveform events ending up at PVS=0 (rolled back same day, this fix is the proper resolution). 
New helper minimateplus.event_file_io.apply_bw_report_dict_to_event operates on the projected sidecar dict shape (the structure _bw_report_to_dict produces, which is what gets preserved in the sidecar). Mirrors apply_report_to_event's semantics: only writes fields where bw_report has a non-None value, no-ops cleanly on empty / None input. Dev validation against prod snapshot: pre : 1839.7315 pvs_sum 356 events with DB PVS ≠ sidecar bw_report post : 2016.4902 pvs_sum 2 events still mismatched (both have NULL timestamp + duplicate rows, edge case) Both edge-case events DO get the correct value written by the new backfill — their stale rows from prior backfills remain because UNIQUE(serial, timestamp) doesn't fire on NULL. Separate dedup cleanup needed for those 2 events (0.014% of corpus); not blocking. Backfill remains idempotent + bw_report preservation still passes (0 WIPED, 0 CHANGED on the 3rd consecutive run). Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 54 ++++++++++++++++++++++++++ scripts/backfill_sidecars.py | 17 +++++++++ tests/test_event_file_io.py | 71 +++++++++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 6e5674d..66a4b68 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -254,6 +254,60 @@ def apply_report_to_event(event: Event, report: BwAsciiReport) -> None: event.rectime_seconds = report.record_time_s +def apply_bw_report_dict_to_event(event: Event, bw_report: dict) -> None: + """Mirror of ``apply_report_to_event`` for the projected sidecar + dict shape (as produced by ``_bw_report_to_dict``). + + Why this exists + ─────────────── + The ingest path holds a live ``BwAsciiReport`` parsed straight from + the ``_ASCII.TXT`` and uses ``apply_report_to_event`` to overlay + device-authoritative peaks onto the codec output before insert. 
+ + The backfill path doesn't have the original ``.TXT`` (it's not + retained in the waveform store), but it does have the preserved + ``bw_report`` block from the sidecar — which contains the same + projected fields. Re-overlaying those during a backfill keeps the + DB peak columns aligned with what BW reports rather than letting + the codec output (which may be incomplete for unhandled formats or + walker edge cases) win by default. + + No-ops cleanly when ``bw_report`` is ``None``, empty, or missing + any particular sub-field — only fields with a concrete value get + written. Mirrors ``apply_report_to_event``'s "report wins where + present" semantics. + """ + if not bw_report: + return + if event.peak_values is None: + event.peak_values = PeakValues() + pv = event.peak_values + + peaks = bw_report.get("peaks") or {} + tran = (peaks.get("tran") or {}).get("ppv_ips") + vert = (peaks.get("vert") or {}).get("ppv_ips") + long = (peaks.get("long") or {}).get("ppv_ips") + if tran is not None: pv.tran = tran + if vert is not None: pv.vert = vert + if long is not None: pv.long = long + vs_ips = (peaks.get("vector_sum") or {}).get("ips") + if vs_ips is not None: + pv.peak_vector_sum = vs_ips + + mic = bw_report.get("mic") or {} + pspl = mic.get("pspl_dbl") + if pspl is not None and pspl > 0: + pv.micl = _dbl_to_psi(pspl) + + rec = bw_report.get("recording") or {} + sr = rec.get("sample_rate_sps") + if sr: + event.sample_rate = sr + rt = rec.get("record_time_s") + if rt is not None: + event.rectime_seconds = rt + + def _project_info_to_dict(pi: Optional[ProjectInfo]) -> dict: if pi is None: return { diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index bbe0d0f..9c4bf5d 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -309,6 +309,23 @@ def main(argv=None) -> int: except Exception: pass + # Overlay BW ASCII report fields onto the rebuilt Event + # BEFORE the sidecar + DB write. 
Mirrors what the ingest + # path does — BW's reported peaks (and sample_rate / + # record_time) win over codec output where present. + # + # Without this step, --force backfill silently overwrites + # the bw_report-overlaid DB columns with codec-derived + # values, which is wrong for events the codec doesn't + # fully decode (e.g. waveform walker edge cases on + # SP0/SS0/SV0-style events, or histogram sub-formats with + # byte[5]!=0 that aren't yet RE'd). Net effect was PVS=0 + # on three top-10 events on 2026-05-22. + if preserved_bw_report: + event_file_io.apply_bw_report_dict_to_event( + ev, preserved_bw_report, + ) + sidecar = event_file_io.event_to_sidecar_dict( ev, serial=serial, diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index 6e08dae..0e043e8 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -529,6 +529,77 @@ def test_save_imported_bw_round_trip(tmp_path: Path): assert stored_path.read_bytes() == src.read_bytes() +# ── apply_bw_report_dict_to_event ──────────────────────────────────────────── + + +def test_apply_bw_report_dict_overlays_peaks_and_recording(): + """Verbatim mirror of the data shape produced by `_bw_report_to_dict` + when projecting a parsed `BwAsciiReport` into the sidecar. 
Confirms + each field overlays onto Event correctly so the backfill path + matches ingest behavior.""" + from minimateplus.models import PeakValues + ev = Event(index=0) + bw_report = { + "peaks": { + "tran": {"ppv_ips": 9.84375}, + "vert": {"ppv_ips": 0.305}, + "long": {"ppv_ips": 0.405}, + "vector_sum": {"ips": 14.86736}, + }, + "mic": {"pspl_dbl": 115.9}, + "recording": {"sample_rate_sps": 1024, "record_time_s": 3.0}, + } + event_file_io.apply_bw_report_dict_to_event(ev, bw_report) + assert ev.peak_values is not None + assert ev.peak_values.tran == 9.84375 + assert ev.peak_values.vert == 0.305 + assert ev.peak_values.long == 0.405 + assert ev.peak_values.peak_vector_sum == 14.86736 + # MicL is converted dB → psi via _dbl_to_psi — just confirm non-zero + assert ev.peak_values.micl is not None and ev.peak_values.micl > 0 + assert ev.sample_rate == 1024 + assert ev.rectime_seconds == 3.0 + + +def test_apply_bw_report_dict_overwrites_codec_peaks(): + """The whole point of this helper: bw_report wins over whatever the + codec produced. This is what the 2026-05-22 prod backfill missed — + DB peaks got overwritten with codec output (incl. 
PVS=0 on the + three top events) when they should have stayed bw_report-overlaid.""" + from minimateplus.models import PeakValues + ev = Event(index=0) + # Simulate codec output that's clearly wrong (incomplete decode): + ev.peak_values = PeakValues( + tran=2.09, vert=0.0, long=0.0, peak_vector_sum=0.0, + ) + bw_report = { + "peaks": { + "tran": {"ppv_ips": 9.84}, + "vert": {"ppv_ips": 4.95}, + "long": {"ppv_ips": 8.05}, + "vector_sum": {"ips": 14.95}, + }, + } + event_file_io.apply_bw_report_dict_to_event(ev, bw_report) + assert ev.peak_values.tran == 9.84 + assert ev.peak_values.vert == 4.95 + assert ev.peak_values.long == 8.05 + assert ev.peak_values.peak_vector_sum == 14.95 + + +def test_apply_bw_report_dict_no_op_on_empty(): + """None / empty dict / missing keys should leave Event untouched.""" + from minimateplus.models import PeakValues + for empty in (None, {}, {"peaks": {}}, {"peaks": {"tran": {}}}): + ev = Event(index=0) + ev.peak_values = PeakValues(tran=1.0, vert=2.0, long=3.0) + event_file_io.apply_bw_report_dict_to_event(ev, empty) + # Unchanged + assert ev.peak_values.tran == 1.0 + assert ev.peak_values.vert == 2.0 + assert ev.peak_values.long == 3.0 + + if __name__ == "__main__": if pytest is not None: pytest.main([__file__, "-v"]) -- 2.52.0 From 8710b8f327afb06c86c5fbbe2b0b35e9b89d976e Mon Sep 17 00:00:00 2001 From: serversdown Date: Fri, 22 May 2026 21:02:13 +0000 Subject: [PATCH 15/42] docs: record three known issues discovered during prod deployment MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 1. bw_ascii_report parser misses PPV/vector_sum fields on certain TXT formats (5 events in prod). Parser extracts every OTHER field for the same channels — likely a regex / format mismatch specific to some firmware-or-event-type combination. 2. NULL-timestamp duplicate rows. 
events.timestamp can come back as NULL when the codec can't extract a footer timestamp; UNIQUE(serial, timestamp) doesn't fire on NULL, so backfills create new rows instead of upserting. 2 affected events on prod, easy SQL cleanup. 3. Histogram body sub-format with byte[5] != 0. ~3 events on prod (T190LD5Q, O121L4L1) use a histogram body the walker doesn't recognize. Codec returns 0 valid blocks; DB peaks come from the bw_report ASCII overlay so DB columns are correct, only the .h5 plot is empty. Cracking the sub-format unlocks the plot. All three are pre-existing issues that today's deployment surfaced during validation; none are regressions. Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 6433158..d62c1e5 100644 --- a/README.md +++ b/README.md @@ -536,6 +536,7 @@ Implementation steps (concrete): ### BW ASCII report parser enhancements (built in v0.16.0) +- [ ] **PPV field misses on certain TXT formats.** Discovered 2026-05-22 during the histogram-codec backfill validation: a handful of events (5 in prod) have a `bw_report` block where `peaks.{tran,vert,long}.ppv_ips` and `peaks.vector_sum.ips` are all `None`, despite the parser correctly extracting every OTHER field for the same channels (zc_freq_hz, time_of_peak_s, peak_accel_g, peak_disp_in). Symptom on the DB side: `peak_vector_sum=0` after a `--force` backfill that overlays from the parsed bw_report dict. Affected events on prod include `T190LD5Q.LK0W`, `T438L713.RY0W`, `K557L3YM.OE0W`. Root cause likely a regex or format mismatch for the "PPV" header line in those specific firmware/event-type outputs. Once fixed, re-forwarding the events from series3-watcher will re-populate the `bw_report` blocks correctly. 
- [ ] **Histogram-specific structural fields.** Current parser handles the shared fields (PPV, ZC Freq, sensor self-check, project) but silently drops histogram-only fields: `Histogram Start/Stop Time`, `Histogram Start/Stop Date`, `Number of Intervals`, `Interval Size`, per-channel `Peak Time` + `Peak Date` (absolute timestamps rather than the waveform's `Time of Peak` relative seconds). - [ ] **Histogram interval bin-table parsing.** Trailing 792-row table (per-interval Peak/Freq per channel + MicL) in histogram TXTs is unparsed. Probably too big for the sidecar JSON; may want a separate `.histogram.h5` companion file. - [ ] **`>100 Hz` value parsing.** Histogram TXTs use `>100 Hz` for out-of-range ZC freq; current `_parse_number()` returns `None` for these (loses information). @@ -564,3 +565,5 @@ Implementation steps (concrete): - [ ] Locate "Sensor Check" byte in compliance config (need capture with Disabled vs Before-monitoring). - [ ] Call Home — map time slots 3/4 offsets; confirm `modem_power_relay_enabled`. - [ ] RV55 DCD/DTR — newer RV55 firmware doesn't assert DCD by default; units don't resume monitoring after call-home disconnect (`--restart-monitoring` flag deferred). +- [ ] **NULL-timestamp duplicate-row dedup.** A small handful of events (2 known on prod as of 2026-05-22) have `events.timestamp IS NULL` because the codec couldn't extract a timestamp from the binary footer. The `UNIQUE(serial, timestamp)` constraint doesn't fire on `NULL` (SQL semantics: `NULL ≠ NULL`), so every `--force` backfill INSERTs a new row instead of UPSERTing the existing one. Cleanup: a one-shot SQL query that keeps only the newest row per `(serial, blastware_filename)` and deletes the rest. Longer-term: extend the unique key to `(serial, COALESCE(timestamp, blastware_filename))` or reject inserts with NULL timestamp. 
+- [ ] **Histogram body sub-format with `byte[5] != 0`.** ~3 events on prod (`T190LD5Q.LD0H`, `O121L4L1.GU0H`) use a histogram body the walker doesn't recognize — the first block has `byte[5] = 0x01` or `0x07` instead of `0x00`, and the entire body lacks the `1e 0a 00 00` tail signature. Codec returns 0 valid blocks; their DB PVS comes from the bw_report ASCII overlay (which BW computed from the same binary, so the DB columns are correct). Only the `.h5` waveform plot is empty. Cracking the sub-format would unlock the plot. Needs binary+ASCII pairs from a few `byte[5]!=0` events; same RE approach as the K558 case. -- 2.52.0 From 460006e5cdae65338e3fc1f6b3272aa96bce60b5 Mon Sep 17 00:00:00 2001 From: serversdown Date: Sat, 23 May 2026 06:53:48 +0000 Subject: [PATCH 16/42] sfm: stored-event browser at /events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit New standalone HTML page (sfm/event_browser.html, ~470 lines, Chart.js) that lets you browse persisted events from the SeismoDb + WaveformStore. Companion to the existing live-device viewer at /waveform: /waveform — connect to a unit and pull events in real time /events — browse events already stored in the DB Flow: 1. Page loads → GET /db/units → populate serial dropdown 2. Select serial → GET /db/events?serial=X&limit=500 → event list 3. Click event → GET /db/events/{id}/waveform.json → render Layout is Instantel-printout-ready: channels stacked vertically in Tran / Vert / Long / MicL order, trigger line at t=0, peak labels, clean dark theme. Frames the future PDF-export feature without needing extra layout work. Smoke-tested against the dev prod-snapshot — 4 channels render with correct peaks for K558 events (L=0.3 in/s = the offset-fault peak we've been chasing all week). CHANGELOG entry added under [Unreleased] per the v0.20.0 release plan. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- CHANGELOG.md | 24 ++ sfm/event_browser.html | 564 +++++++++++++++++++++++++++++++++++++++++ sfm/server.py | 16 +- 3 files changed, 603 insertions(+), 1 deletion(-) create mode 100644 sfm/event_browser.html diff --git a/CHANGELOG.md b/CHANGELOG.md index f2d4f95..886a0a8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,30 @@ All notable changes to seismo-relay are documented here. --- +## [Unreleased] + +### Added + +- **Stored-event browser** — new standalone HTML page at `GET /events` (`sfm/event_browser.html`). Pick a serial from the unit dropdown, scroll through that unit's events (newest-first), click any event to render its decoded waveform via the existing `/db/events/{id}/waveform.json` endpoint. Dark-themed Chart.js viewer, channels stacked vertically (Tran / Vert / Long / MicL — Instantel printout order, designed PDF-export-ready), trigger line at t=0, peak labels, search/filter, false-trigger flag honored. Companion to the existing live-device viewer at `/waveform`; the two routes are now clearly delineated in their docstrings. +- **Histogram body codec — uint8 peak count fix.** Per-channel peak fields at `block[6]/[10]/[14]/[18]` are `uint8`, not `uint16 LE` spanning `block[6:8]` etc. The original interpretation was byte-exact on the N844 fixture corpus only because every annotation byte (`block[7]/[11]/[15]/[19]`) in those fixtures was zero. On non-N844 events with non-zero annotation bytes (observed across BE9558 Tran-drift and BE18003 Histogram+Continuous units), the old interpretation produced peaks up to 268 in/s per channel and 35× inflated PVS sums when first deployed to prod (rolled back same day; properly fixed in this release). Cross-correlated against BW's per-interval ASCII export on K558 / T003 / N599 / N844 corpora — 100% byte-exact on T/V/L, 99%+ on M (sub-precision rounding). Annotation byte preserved on each record as `record["annotations"]` for future RE. 
Verified against ~3,500 blocks across 5 in-repo fixtures + a synthetic K558 interval-12 regression block. +- **`apply_bw_report_dict_to_event` helper** in `minimateplus.event_file_io`. Mirror of `apply_report_to_event` for the projected sidecar dict shape — used by the backfill path, which has the preserved `bw_report` block but not the original `.TXT` file. BW's reported peaks (and `sample_rate` / `record_time`) now win over codec output during `--force` backfill, matching ingest-path behavior. +- **`scripts/check_bw_report_preservation.py`** — two-step snapshot/diff tool to verify that `backfill_sidecars.py` doesn't wipe the `bw_report` block from existing sidecars. Classifies every sidecar as PRESERVED / CHANGED / WIPED / STILL_MISSING / NEW / ADDED / REMOVED. Exit code 1 if any WIPED or CHANGED entries are found, so it can gate a CI step or deploy script. + +### Fixed + +- **`scripts/backfill_sidecars.py` no longer wipes `bw_report`.** Before this fix, `event_to_sidecar_dict` silently dropped the preserved `bw_report` block during every backfill, since the function only emits a `bw_report` when called with a live `BwAsciiReport` dataclass (which the backfill doesn't have — only the projected sidecar dict). Now we read the existing sidecar's `bw_report` and overlay it onto the regenerated sidecar, alongside the existing `review` and `extensions` preservation. +- **`scripts/backfill_sidecars.py --force` no longer overwrites BW-overlaid DB peaks with codec output.** The backfill path now calls `apply_bw_report_dict_to_event` before the DB upsert, mirroring what the ingest path does (`/db/import/blastware_file` parses the `.TXT` into a `BwAsciiReport`, calls `apply_report_to_event`, then upserts). Without this, events where the codec doesn't fully decode (waveform walker edge cases on SP0/SS0/SV0-style events, histogram `byte[5]!=0` sub-format) ended up with PVS=0 in the DB after a `--force` backfill; this bit us on prod on 2026-05-22 and was rolled back the same day. 
+- **Thor IDF files no longer attempted as BW events in backfill.** `scripts/backfill_sidecars.py` now filters out `.IDFW` / `.IDFH` files in `_looks_like_event_file()`; they share the `.X0W` / `.X0H` suffix shape but use a separate ingest path (`WaveformStore.save_imported_idf`) and aren't decodable by `event_file_io.read_blastware_file`. + +### Docs + +- **CLAUDE.md** — added a three-tier conceptual architecture model (SFM / SDM / shared codec library) near the top of the file, with a placement rule for where new code goes. Documents that what is conceptually SDM (database, waveform store, ingest, `/db/*` endpoints) still lives under `sfm/` for historical reasons; rename deferred until the codebase is quiet enough for a clean refactor. +- **README.md** — added a "Strategic direction" lead-in to the Roadmap that frames seismo-relay as a suite of cooperating components (not a single app), and an explicit "Terra-View ↔ SFM device control" roadmap section with a concrete implementation checklist (auth as hard prerequisite, embedded live-monitor view, action history, Series IV live-device support). +- **`docs/histogram_codec_re_status.md`** updated with the uint8 retraction and the annotation-byte status. +- Three known issues recorded in the Roadmap that were discovered during prod validation: (1) `bw_ascii_report` parser misses PPV / `vector_sum` on some `.TXT` formats (5 events on prod); (2) NULL-timestamp duplicate-row dedup needed (2 events on prod); (3) histogram body sub-format with `byte[5] != 0` not yet decoded (~3 events on prod with empty `.h5` plots). + +--- + ## v0.19.0 — 2026-05-20 The "device-family separation" release. Tightens the boundary between Series III (MiniMate Plus / Blastware) and Series IV (Micromate / Thor) so the UI and storage layer dispatch deterministically by family instead of sniffing filename extensions or magnitude heuristics. 
diff --git a/sfm/event_browser.html b/sfm/event_browser.html new file mode 100644 index 0000000..dbbd734 --- /dev/null +++ b/sfm/event_browser.html @@ -0,0 +1,564 @@ + + + + + + SFM Event Browser + + + + + +
+

SFM Event Browser

+ + + + + +
+ +
+
+
+ Events + +
+
+
+ +
+
+ + + +

Select a unit and event to view its waveform.

+
+ + +
+
+ +
Ready.
+ + + + diff --git a/sfm/server.py b/sfm/server.py index 5934cf9..dfc3b45 100644 --- a/sfm/server.py +++ b/sfm/server.py @@ -381,10 +381,24 @@ def webapp(): @app.get("/waveform", response_class=FileResponse) def waveform_viewer(): - """Serve the standalone waveform viewer.""" + """Serve the standalone LIVE-device waveform viewer. + + Talks to ``/device/*`` endpoints — for plotting events pulled from + a connected unit in real time. For the stored-event browser that + reads from the SeismoDb + WaveformStore, see ``/events``. + """ return str(Path(__file__).parent / "waveform_viewer.html") +@app.get("/events", response_class=FileResponse) +def event_browser(): + """Serve the stored-event browser — pick a serial, list its events, + render any one's waveform from the persisted ``.h5`` via the + ``/db/events/{id}/waveform.json`` endpoint. Standalone HTML + + Chart.js, no auth, no build step.""" + return str(Path(__file__).parent / "event_browser.html") + + @app.get("/device/info") def device_info( port: Optional[str] = Query(None, description="Serial port (e.g. COM5, /dev/ttyUSB0)"), -- 2.52.0 From c14a8c54db13a1b5f9911e12c7283f75abed973c Mon Sep 17 00:00:00 2001 From: serversdown Date: Sat, 23 May 2026 07:09:12 +0000 Subject: [PATCH 17/42] event_browser: Instantel-printout-style polish MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Apply the cheap visual wins from the BW Event Report layout: 1. Channel order reversed → MicL (top), Long, Vert, Tran (bottom) to match the Instantel printout. 2. Shared bottom time axis — x-axis ticks only render on the bottom-most data channel; other channels hide ticks so all four visually share one time scale. 3. Triangle trigger markers above and below the t=0 dashed line. 4. Horizontal zero-baseline (dotted) per channel with "0.0" label on the right edge — Instantel convention. 5. 
"Print view" toggle that flips dark→light theme (white panels, light grids, dark text) so the viewer can render usefully on paper-style output / @media print. 6. Per-channel PPV stats table in the metadata header, with Peak Vector Sum displayed prominently. 7. Colors adjusted to approximate BW trace colors (magenta MicL, blue Long, green Vert, red Tran). Future PDF-export work will reproduce the same layout server-side once you upload a real example PDF and we pick a rendering pipeline (weasyprint / chromium --print-to-pdf / etc.). Co-Authored-By: Claude Opus 4.7 (1M context) --- sfm/event_browser.html | 221 +++++++++++++++++++++++++++++++++++------ 1 file changed, 193 insertions(+), 28 deletions(-) diff --git a/sfm/event_browser.html b/sfm/event_browser.html index dbbd734..0dce1b0 100644 --- a/sfm/event_browser.html +++ b/sfm/event_browser.html @@ -161,7 +161,7 @@ background: #161b22; border: 1px solid #21262d; border-radius: 8px; - padding: 10px 12px 8px; + padding: 10px 30px 8px 12px; /* right padding leaves room for the "0.0" baseline label */ } .chart-label { font-size: 11px; @@ -211,6 +211,72 @@ font-size: 11px; margin-left: 8px; } + + /* Per-channel stats table in the metadata header */ + .stats-table { + grid-column: 1 / -1; + border-collapse: collapse; + font-family: monospace; + font-size: 12px; + margin-top: 4px; + } + .stats-table th, .stats-table td { + padding: 3px 14px 3px 0; + text-align: left; + color: #c9d1d9; + } + .stats-table th { + color: #484f58; + font-size: 10px; + text-transform: uppercase; + letter-spacing: 0.05em; + font-weight: 500; + } + + /* ── Print view (light theme matching the Instantel printout) ─── */ + body.print-view { + background: #ffffff; + color: #000000; + } + body.print-view header, + body.print-view #event-list-wrap, + body.print-view #event-list-header, + body.print-view #event-meta, + body.print-view #status-bar, + body.print-view .chart-wrap { + background: #ffffff; + border-color: #cccccc; + color: #000000; + } + 
body.print-view .event-row { color: #000; border-bottom-color: #eee; } + body.print-view .event-row:hover { background: #f4f4f4; } + body.print-view .event-row.active { + background: #e6f0ff; + border-left-color: #1f6feb; + } + body.print-view .er-ts { color: #000; } + body.print-view .er-pvs { color: #003a8c; } + body.print-view .er-meta, + body.print-view #event-list-header, + body.print-view .meta-field .mf-label, + body.print-view .stats-table th { + color: #666; + } + body.print-view .mf-value { color: #000; } + body.print-view .mf-value.highlight { color: #003a8c; } + body.print-view label { color: #444; } + body.print-view input, body.print-view select { + background: #fff; color: #000; border-color: #ccc; + } + /* In print theme, the channel-label colors stay (they identify + the trace). Only the chart panel background flips. */ + + @media print { + header, #event-list-wrap, #status-bar, button { display: none !important; } + body { overflow: visible; height: auto; } + #main, #viewer { overflow: visible; } + #charts { overflow: visible; } + } @@ -223,7 +289,8 @@ - + +
@@ -250,13 +317,16 @@
Ready.
diff --git a/sfm/sfm_webapp.html b/sfm/sfm_webapp.html index 2c4a912..ad2b6e9 100644 --- a/sfm/sfm_webapp.html +++ b/sfm/sfm_webapp.html @@ -818,6 +818,12 @@ Force refresh +
+