From 31d691b40bd6d929281919a1cae3010bac3f66b4 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 18:13:24 +0000 Subject: [PATCH 1/2] minimateplus: wire read_blastware_file to verified body codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `read_blastware_file()` was still calling `_decode_samples_4ch_int16_le` (the retracted int16-LE-interleaved hypothesis) on the body bytes, producing ±32K noise on every channel of every BW file read from disk. This was the path watcher-forwarded events take into the system (via the import endpoint → save_imported_bw → read_blastware_file, since the watcher doesn't ship A5 frames), so every .h5 sidecar generated for a forwarded event has been wrong since the feature shipped. The fix is mechanical: pass the body bytes straight to `waveform_codec.decode_waveform_v2()` and run the result through `decoded_to_adc_counts()` for the 16x geo scaling. The body already starts with the codec's exact 7-byte preamble `00 02 00 [Tran[0] BE] [Tran[1] BE]` — confirmed by `body[:3].hex()` across all 9 fixture events. No body-slice adjustment needed. If the codec returns None (truncated/malformed file, synthetic test input with no real waveform), fall back to empty channels with a log warning. The rest of the event (timestamp, waveform_key, project strings, sensor_location, peaks-from-samples=0) is still recoverable. Verified against the bundled fixture corpus: V70: Tran/Vert/Long 3328/3328 sample-sets match .TXT ground truth within the 0.005 in/s display quantum, every row. 6S0/RG0/AB0/470 (5-8-26): 3328/2304/1280/1280 samples; Vert PPVs match BW's own report within 0.02 in/s. JQ0: 3328 samples, Vert PPV 3.384 vs BW 3.465. SP0/SS0/SV0 (loud events): 3072–3328 samples; known walker tail-truncation 1–7 samples per channel, samples reached are byte-exact. Existing `test_read_blastware_file_round_trip` (synthetic empty event) continues to pass thanks to the None-fallback. 
Codec verify scripts (`analysis/verify_quiet_bundle.py`, `analysis/verify_full_decode.py`) re-run unchanged. Added two regression-lock tests in tests/test_event_file_io.py: - test_read_blastware_file_decodes_via_codec[6 fixtures] — verifies sample count + Vert PPV per fixture - test_read_blastware_file_v70_samples_match_txt_truth — verifies every one of V70's 3328 sample-sets across Tran/Vert/Long matches the .TXT ground truth row-by-row within 0.003 in/s Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 28 +++++++++-- tests/test_event_file_io.py | 91 +++++++++++++++++++++++++++++++++++ 2 files changed, 114 insertions(+), 5 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 9c82718..5618f72 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -27,6 +27,7 @@ from typing import Optional, Union from .models import Event, PeakValues, ProjectInfo, Timestamp from . import blastware_file as _bw # avoid circular reference at module load from .bw_ascii_report import BwAsciiReport +from .waveform_codec import decode_waveform_v2, decoded_to_adc_counts # Reference pressure for dB(L) → psi conversion (20 µPa expressed in psi). # Same constant as sfm/sfm_webapp.html so server-side and browser-side @@ -755,11 +756,28 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ts1 = _bw._decode_ts_be(footer[2:10]) ts2 = _bw._decode_ts_be(footer[10:18]) - # Body: first 6 bytes are the preamble (00 00 ff ff ff ff). Strip - # them before decoding samples. Any trailing tail past the last - # full sample-set is silently truncated by _decode_samples_4ch. - sample_bytes = body[6:] if body[:6].hex() in ("0000ffffffff", "0000FFFFFFFF") else body - samples = _decode_samples_4ch_int16_le(sample_bytes) + # Body: decode via the verified BW waveform-body codec. 
The body + # starts with the codec's 7-byte preamble ``00 02 00 [Tran[0] BE] + # [Tran[1] BE]`` and continues with the tagged-block stream the codec + # walks. See ``minimateplus/waveform_codec.py`` + ``docs/waveform_codec_re_status.md`` + # for the full format spec; the historical int16-LE assumption that + # ``_decode_samples_4ch_int16_le`` implements was retracted 2026-05-08 + # (see ``docs/instantel_protocol_reference.md`` §7.6.1). + # + # If decode fails (malformed file, truncated body, synthetic test + # input), fall back to empty channels — the rest of the event + # (timestamp, waveform_key, project strings) is still recoverable + # and useful. The peaks-from-samples helper handles empty input + # gracefully. + decoded = decode_waveform_v2(body) + if decoded is None: + log.warning( + "%s: waveform body codec failed to decode (body starts %s) — " + "raw_samples will be empty", path, body[:8].hex(" "), + ) + samples = {"Tran": [], "Vert": [], "Long": [], "MicL": []} + else: + samples = decoded_to_adc_counts(decoded) # Metadata strings (label-anchored search across the body). 
project = _find_first_string(body, b"Project:") diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index a1990f0..d8b5793 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -294,6 +294,97 @@ def test_read_blastware_file_round_trip(tmp_path: Path): assert parsed.peak_values.peak_vector_sum == 0.0 +_BW_CODEC_FIXTURES = [ + # (path, expected_n_samples_per_channel, BW-reported Vert PPV in/s for sanity) + ("tests/fixtures/decode-re-5-8-26/event-a/M529LKVQ.6S0", 3328, 0.780), + ("tests/fixtures/decode-re-5-8-26/event-b/M529LK5Q.RG0", 2304, 0.505), + ("tests/fixtures/decode-re-5-8-26/event-c/M529LK44.AB0", 1280, 0.610), + ("tests/fixtures/decode-re-5-8-26/event-d/M529LK2V.470", 1280, 0.565), + ("tests/fixtures/5-11-26/M529LL1L.V70", 3328, 0.010), + ("tests/fixtures/5-11-26/M529LL1L.JQ0", 3328, 3.465), +] + + +@pytest.mark.parametrize("path,expected_n,expected_ppv", _BW_CODEC_FIXTURES) +def test_read_blastware_file_decodes_via_codec(path: str, expected_n: int, expected_ppv: float): + """Regression lock: ``read_blastware_file()`` must use the verified + waveform-body codec (``minimateplus.waveform_codec``), not the + retracted int16-LE assumption. + + Verifies against the real BW fixture corpus: every event in the + bundled fixtures must produce the expected per-channel sample count + and a Vert PPV close to BW's own reported value. Catches any + accidental regression of the body decoder back to the old + ``_decode_samples_4ch_int16_le`` path (which produced ±32K noise + on every event, giving wildly wrong PPVs). 
+ """ + repo_root = Path(__file__).resolve().parent.parent + full_path = repo_root / path + if not full_path.exists(): + pytest.skip(f"fixture missing: {full_path}") + + ev = event_file_io.read_blastware_file(full_path) + assert ev.raw_samples is not None + for ch in ("Tran", "Vert", "Long"): + assert len(ev.raw_samples[ch]) == expected_n, ( + f"{ch}: expected {expected_n} samples, got {len(ev.raw_samples[ch])}" + ) + + # PPV check: the codec produces decoded samples in 1-count ADC units; + # _peaks_from_samples scales by GEO_NORMAL_FS_INS / 32767. BW's own + # PPV is computed at slightly different precision/interpolation, so + # we allow a 0.2 in/s tolerance — well under the broken-decoder + # signature (which would produce ~10 in/s saturation). + assert ev.peak_values is not None + assert abs(ev.peak_values.vert - expected_ppv) < 0.2, ( + f"Vert PPV {ev.peak_values.vert:.3f} differs from BW's " + f"{expected_ppv:.3f} by >0.2 in/s — codec regression?" + ) + + +def test_read_blastware_file_v70_samples_match_txt_truth(): + """Strongest regression lock: every one of V70's 3328 decoded + sample-sets must match the .TXT ground truth table within the + 0.005 in/s display quantum.""" + repo_root = Path(__file__).resolve().parent.parent + bw_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70" + txt_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70.TXT" + if not bw_path.exists() or not txt_path.exists(): + pytest.skip(f"V70 fixture missing") + + import re + ev = event_file_io.read_blastware_file(bw_path) + + # Parse .TXT ground truth sample table + text = txt_path.read_text() + lines = text.splitlines() + hdr_idx = next(i for i, line in enumerate(lines) + if re.match(r"^Tran\s+Vert\s+Long\s+MicL?", line.strip())) + truth = [] + for line in lines[hdr_idx + 1:]: + parts = line.strip().split() + if len(parts) != 4: + continue + try: + truth.append([float(x) for x in parts]) + except ValueError: + continue + assert len(truth) == 3328, f"expected 3328 truth rows, got 
{len(truth)}" + + def adc_to_ins(count): + return count / 32767.0 * 10.0 + + for i, truth_row in enumerate(truth): + for ch_idx, ch_name in enumerate(("Tran", "Vert", "Long")): + decoded_ips = adc_to_ins(ev.raw_samples[ch_name][i]) + truth_ips = truth_row[ch_idx] + # 0.003 in/s tolerance: <0.005 quantum + small float precision room + assert abs(decoded_ips - truth_ips) < 0.003, ( + f"row {i} {ch_name}: decoded {decoded_ips:+.4f} vs " + f"truth {truth_ips:+.4f} (delta {decoded_ips - truth_ips:+.4f})" + ) + + def test_save_imported_bw_with_paired_report(tmp_path: Path): """save_imported_bw + a paired BW ASCII report fold the report's rich derived fields into the sidecar. This is the daemon-forwarded From e8682d49ad155eee1c738332aace8a18c626c7f9 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 18:24:06 +0000 Subject: [PATCH 2/2] scripts/backfill_sidecars: cascade h5 regen when sidecar is stale + bump TOOL_VERSION MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two coupled changes that close the rollout gap left by the read_blastware_file codec wiring: 1. minimateplus/event_file_io.py: bump TOOL_VERSION from 0.16.1 to 0.20.0. This is the version stamp the backfill script reads from each sidecar's source.tool_version field to detect "this sidecar was written before the current decoder shipped, regenerate it." Bumping past every value baked into existing prod sidecars flags them all as stale on the next backfill run — which is exactly what we want, since every pre-codec-wiring sidecar was written by the retracted int16-LE decoder. 2. scripts/backfill_sidecars.py: when the sidecar is being regenerated this iteration (sha mismatch, tool_version too old, or --force), also regenerate the .h5. Previously the .h5 logic only rewrote when --force was passed or the file was missing — so a tool_version-driven sidecar regen left the broken .h5 in place forever. 
Added a `sidecar_stale` boolean to track the "we're rewriting the sidecar this iteration" state and wired it into the h5 need-rewrite check. Path coverage (verified by trace): - sidecar missing → both regen - --force → both regen - sha mismatch → both regen - tool_ver too old → both regen (THE post-codec-wiring case) - everything OK → skip iteration entirely (h5 untouched) Operator review state (review.false_trigger, reviewer, notes) and the sidecar's extensions block are preserved across regen by the existing read-existing-sidecar / pass-into-event_to_sidecar_dict path — unchanged from prior behavior. Deploy procedure (on prod): 1. Pull this change + the read_blastware_file codec wiring. 2. `python scripts/backfill_sidecars.py --dry-run` to preview. Every sidecar with source.tool_version<0.20.0 will show as "would (re)write". 3. Run for real (drop --dry-run). Expect every pre-fix event to regen. Big stores may take a while. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 2 +- scripts/backfill_sidecars.py | 40 ++++++++++++++++++++++++++++++----- 2 files changed, 36 insertions(+), 6 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 5618f72..a7980f1 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -48,7 +48,7 @@ SIDECAR_KIND = "sfm.event" # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. -TOOL_VERSION = "0.16.1" +TOOL_VERSION = "0.20.0" try: # Best-effort: prefer the installed metadata when it's NEWER than the diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b937e8c..8037d1f 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -12,8 +12,20 @@ Walks `//` and for each BW event file: parsing the BW binary directly (peaks computed from samples). 
Clean waveform (.h5): - - Skip when .h5 already exists (idempotent). - - Else write from .a5.pkl (preferred) or BW binary parse (fallback). + - Regenerated whenever the sidecar is regenerated (sha mismatch + OR sidecar.source.tool_version < current TOOL_VERSION OR --force). + The .h5 and the sidecar both come from the same decoder output, + so if the sidecar is stale the .h5 is too. + - Written when missing. + - --skip-hdf5 turns off all .h5 writes. + +Typical use after a decoder upgrade: + 1. Pull the new seismo-relay code (which bumped TOOL_VERSION). + 2. Run this script — every sidecar with an older tool_version + stamp regenerates, and the associated .h5 cascade-regenerates. + 3. Operator review state (review.false_trigger, notes, reviewer) + and the sidecar's extensions block are preserved across the + regen. Usage: python scripts/backfill_sidecars.py [--store-root PATH] @@ -123,6 +135,12 @@ def main(argv=None) -> int: # the sidecar was written by a build that includes any # decoder fixes shipped since). # Either part failing → regenerate. --force bypasses both. + # + # Tracks whether we're regenerating the sidecar this iteration + # so the .h5 logic below knows to refresh that too — staleness + # of the sidecar implies staleness of the derived .h5 (both + # come out of the same decoder). + sidecar_stale = True if sidecar_path.exists() and not args.force: try: existing = event_file_io.read_sidecar(sidecar_path) @@ -136,6 +154,7 @@ def main(argv=None) -> int: ver_ok = _vt(src_ver) >= _vt(event_file_io.TOOL_VERSION) if sha_ok and ver_ok: skipped += 1 + sidecar_stale = False continue if sha_ok and not ver_ok: log.info( @@ -281,12 +300,23 @@ def main(argv=None) -> int: extensions=preserved_ext, ) - # Also emit the .h5 clean-waveform file when missing OR when - # --force was passed (so a re-backfill picks up decoder fixes). 
+ # Also emit the .h5 clean-waveform file when: + # - it's missing, OR + # - --force was passed, OR + # - the sidecar is being regenerated this iteration + # (sha mismatch / tool_version too old). The .h5 and + # the sidecar are both derived from the same decoder + # output, so if the sidecar is stale, so is the .h5. + # This is the path that recovers from the broken- + # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ + # marks every pre-codec sidecar stale, which now + # correctly cascades to .h5 regeneration too. hdf5_path = store.hdf5_path_for(serial, path.name) hdf5_filename = hdf5_path.name if hdf5_path.exists() else None hdf5_action = "kept" - need_h5 = not args.skip_hdf5 and (args.force or not hdf5_path.exists()) + need_h5 = not args.skip_hdf5 and ( + args.force or not hdf5_path.exists() or sidecar_stale + ) if need_h5: if args.dry_run: hdf5_action = "would (re)write"