diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index 9c82718..a7980f1 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -27,6 +27,7 @@ from typing import Optional, Union from .models import Event, PeakValues, ProjectInfo, Timestamp from . import blastware_file as _bw # avoid circular reference at module load from .bw_ascii_report import BwAsciiReport +from .waveform_codec import decode_waveform_v2, decoded_to_adc_counts # Reference pressure for dB(L) → psi conversion (20 µPa expressed in psi). # Same constant as sfm/sfm_webapp.html so server-side and browser-side @@ -47,7 +48,7 @@ SIDECAR_KIND = "sfm.event" # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. -TOOL_VERSION = "0.16.1" +TOOL_VERSION = "0.20.0" try: # Best-effort: prefer the installed metadata when it's NEWER than the @@ -755,11 +756,28 @@ def read_blastware_file(path: Union[str, Path]) -> Event: ts1 = _bw._decode_ts_be(footer[2:10]) ts2 = _bw._decode_ts_be(footer[10:18]) - # Body: first 6 bytes are the preamble (00 00 ff ff ff ff). Strip - # them before decoding samples. Any trailing tail past the last - # full sample-set is silently truncated by _decode_samples_4ch. - sample_bytes = body[6:] if body[:6].hex() in ("0000ffffffff", "0000FFFFFFFF") else body - samples = _decode_samples_4ch_int16_le(sample_bytes) + # Body: decode via the verified BW waveform-body codec. The body + # starts with the codec's 7-byte preamble ``00 02 00 [Tran[0] BE] + # [Tran[1] BE]`` and continues with the tagged-block stream the codec + # walks. See ``minimateplus/waveform_codec.py`` + ``docs/waveform_codec_re_status.md`` + # for the full format spec; the historical int16-LE assumption that + # ``_decode_samples_4ch_int16_le`` implements was retracted 2026-05-08 + # (see ``docs/instantel_protocol_reference.md`` §7.6.1). + # + # If decode fails (malformed file, truncated body, synthetic test + # input), fall back to empty channels — the rest of the event + # (timestamp, waveform_key, project strings) is still recoverable + # and useful. The peaks-from-samples helper handles empty input + # gracefully. + decoded = decode_waveform_v2(body) + if decoded is None: + log.warning( + "%s: waveform body codec failed to decode (body starts %s) — " + "raw_samples will be empty", path, body[:8].hex(" "), + ) + samples = {"Tran": [], "Vert": [], "Long": [], "MicL": []} + else: + samples = decoded_to_adc_counts(decoded) # Metadata strings (label-anchored search across the body). project = _find_first_string(body, b"Project:") diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b937e8c..8037d1f 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -12,8 +12,20 @@ Walks `//` and for each BW event file: parsing the BW binary directly (peaks computed from samples). Clean waveform (.h5): - - Skip when .h5 already exists (idempotent). - - Else write from .a5.pkl (preferred) or BW binary parse (fallback). + - Regenerated whenever the sidecar is regenerated (sha mismatch + OR sidecar.source.tool_version < current TOOL_VERSION OR --force). + The .h5 and the sidecar both come from the same decoder output, + so if the sidecar is stale the .h5 is too. + - Written when missing. + - --skip-hdf5 turns off all .h5 writes. + +Typical use after a decoder upgrade: + 1. Pull the new seismo-relay code (which bumped TOOL_VERSION). + 2. Run this script — every sidecar with an older tool_version + stamp regenerates, and the associated .h5 cascade-regenerates. + 3. Operator review state (review.false_trigger, notes, reviewer) + and the sidecar's extensions block are preserved across the + regen. Usage: python scripts/backfill_sidecars.py [--store-root PATH] @@ -123,6 +135,12 @@ def main(argv=None) -> int: # the sidecar was written by a build that includes any # decoder fixes shipped since). # Either part failing → regenerate. --force bypasses both. + # + # Tracks whether we're regenerating the sidecar this iteration + # so the .h5 logic below knows to refresh that too — staleness + # of the sidecar implies staleness of the derived .h5 (both + # come out of the same decoder). + sidecar_stale = True if sidecar_path.exists() and not args.force: try: existing = event_file_io.read_sidecar(sidecar_path) @@ -136,6 +154,7 @@ def main(argv=None) -> int: ver_ok = _vt(src_ver) >= _vt(event_file_io.TOOL_VERSION) if sha_ok and ver_ok: skipped += 1 + sidecar_stale = False continue if sha_ok and not ver_ok: log.info( @@ -281,12 +300,23 @@ def main(argv=None) -> int: extensions=preserved_ext, ) - # Also emit the .h5 clean-waveform file when missing OR when - # --force was passed (so a re-backfill picks up decoder fixes). + # Also emit the .h5 clean-waveform file when: + # - it's missing, OR + # - --force was passed, OR + # - the sidecar is being regenerated this iteration + # (sha mismatch / tool_version too old). The .h5 and + # the sidecar are both derived from the same decoder + # output, so if the sidecar is stale, so is the .h5. + # This is the path that recovers from the broken- + # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ + # marks every pre-codec sidecar stale, which now + # correctly cascades to .h5 regeneration too. hdf5_path = store.hdf5_path_for(serial, path.name) hdf5_filename = hdf5_path.name if hdf5_path.exists() else None hdf5_action = "kept" - need_h5 = not args.skip_hdf5 and (args.force or not hdf5_path.exists()) + need_h5 = not args.skip_hdf5 and ( + args.force or not hdf5_path.exists() or sidecar_stale + ) if need_h5: if args.dry_run: hdf5_action = "would (re)write" diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index a1990f0..d8b5793 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -294,6 +294,97 @@ def test_read_blastware_file_round_trip(tmp_path: Path): assert parsed.peak_values.peak_vector_sum == 0.0 +_BW_CODEC_FIXTURES = [ + # (path, expected_n_samples_per_channel, BW-reported Vert PPV in/s for sanity) + ("tests/fixtures/decode-re-5-8-26/event-a/M529LKVQ.6S0", 3328, 0.780), + ("tests/fixtures/decode-re-5-8-26/event-b/M529LK5Q.RG0", 2304, 0.505), + ("tests/fixtures/decode-re-5-8-26/event-c/M529LK44.AB0", 1280, 0.610), + ("tests/fixtures/decode-re-5-8-26/event-d/M529LK2V.470", 1280, 0.565), + ("tests/fixtures/5-11-26/M529LL1L.V70", 3328, 0.010), + ("tests/fixtures/5-11-26/M529LL1L.JQ0", 3328, 3.465), +] + + +@pytest.mark.parametrize("path,expected_n,expected_ppv", _BW_CODEC_FIXTURES) +def test_read_blastware_file_decodes_via_codec(path: str, expected_n: int, expected_ppv: float): + """Regression lock: ``read_blastware_file()`` must use the verified + waveform-body codec (``minimateplus.waveform_codec``), not the + retracted int16-LE assumption. + + Verifies against the real BW fixture corpus: every event in the + bundled fixtures must produce the expected per-channel sample count + and a Vert PPV close to BW's own reported value. Catches any + accidental regression of the body decoder back to the old + ``_decode_samples_4ch_int16_le`` path (which produced ±32K noise + on every event, giving wildly wrong PPVs). + """ + repo_root = Path(__file__).resolve().parent.parent + full_path = repo_root / path + if not full_path.exists(): + pytest.skip(f"fixture missing: {full_path}") + + ev = event_file_io.read_blastware_file(full_path) + assert ev.raw_samples is not None + for ch in ("Tran", "Vert", "Long"): + assert len(ev.raw_samples[ch]) == expected_n, ( + f"{ch}: expected {expected_n} samples, got {len(ev.raw_samples[ch])}" + ) + + # PPV check: the codec produces decoded samples in 1-count ADC units; + # _peaks_from_samples scales by GEO_NORMAL_FS_INS / 32767. BW's own + # PPV is computed at slightly different precision/interpolation, so + # we allow a 0.2 in/s tolerance — well under the broken-decoder + # signature (which would produce ~10 in/s saturation). + assert ev.peak_values is not None + assert abs(ev.peak_values.vert - expected_ppv) < 0.2, ( + f"Vert PPV {ev.peak_values.vert:.3f} differs from BW's " + f"{expected_ppv:.3f} by >0.2 in/s — codec regression?" + ) + + +def test_read_blastware_file_v70_samples_match_txt_truth(): + """Strongest regression lock: every one of V70's 3328 decoded + sample-sets must match the .TXT ground truth table within the + 0.005 in/s display quantum.""" + repo_root = Path(__file__).resolve().parent.parent + bw_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70" + txt_path = repo_root / "tests/fixtures/5-11-26/M529LL1L.V70.TXT" + if not bw_path.exists() or not txt_path.exists(): + pytest.skip(f"V70 fixture missing") + + import re + ev = event_file_io.read_blastware_file(bw_path) + + # Parse .TXT ground truth sample table + text = txt_path.read_text() + lines = text.splitlines() + hdr_idx = next(i for i, line in enumerate(lines) + if re.match(r"^Tran\s+Vert\s+Long\s+MicL?", line.strip())) + truth = [] + for line in lines[hdr_idx + 1:]: + parts = line.strip().split() + if len(parts) != 4: + continue + try: + truth.append([float(x) for x in parts]) + except ValueError: + continue + assert len(truth) == 3328, f"expected 3328 truth rows, got {len(truth)}" + + def adc_to_ins(count): + return count / 32767.0 * 10.0 + + for i, truth_row in enumerate(truth): + for ch_idx, ch_name in enumerate(("Tran", "Vert", "Long")): + decoded_ips = adc_to_ins(ev.raw_samples[ch_name][i]) + truth_ips = truth_row[ch_idx] + # 0.003 in/s tolerance: <0.005 quantum + small float precision room + assert abs(decoded_ips - truth_ips) < 0.003, ( + f"row {i} {ch_name}: decoded {decoded_ips:+.4f} vs " + f"truth {truth_ips:+.4f} (delta {decoded_ips - truth_ips:+.4f})" + ) + + def test_save_imported_bw_with_paired_report(tmp_path: Path): """save_imported_bw + a paired BW ASCII report fold the report's rich derived fields into the sidecar. This is the daemon-forwarded