diff --git a/.gitignore b/.gitignore index d6e4855..90e5d24 100644 --- a/.gitignore +++ b/.gitignore @@ -1,6 +1,6 @@ /bridges/captures/ /example-events/ - +/tests/fixtures/ /manuals/ # Python build artifacts diff --git a/docs/histogram_codec_re_status.md b/docs/histogram_codec_re_status.md index 3a37450..6fa388c 100644 --- a/docs/histogram_codec_re_status.md +++ b/docs/histogram_codec_re_status.md @@ -12,7 +12,21 @@ implementation lives in `minimateplus/histogram_codec.py`. in-repo histogram fixture corpus decodes byte-exact against BW's ASCII export. -24 regression tests pass against ~3,500 blocks across 5 fixtures. +26 regression tests pass against ~3,500 blocks across 5 in-repo +fixtures, plus a synthetic regression block taken from a real +BE9558 prod event to lock in the uint8-peak interpretation. + +**Important correction (2026-05-21):** the per-channel peak count +is `uint8` at byte[6]/[10]/[14]/[18], NOT `uint16 LE` at byte[6:8] +etc. The N844 fixture corpus the original RE was done against has +zero values in bytes [7]/[11]/[15]/[19] for every block, so the +two interpretations happened to be equivalent. Cross-correlating +non-N844 events (BE9558 Tran-drift, BE18003 Histogram+Continuous) +against BW's per-interval ASCII export — 4 channels × ~1400 blocks +per event × multiple events = 100% byte-exact only when the peak +is read as uint8. Reading as uint16 LE produced peaks up to 268 +in/s per channel and 35× inflated PVS sums when first deployed to +prod (rolled back, root-caused, and fixed in commit 7183b95+1). ## Body format @@ -27,15 +41,21 @@ Each block represents one histogram interval. Block layout: [1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment [2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) [4:6] 0x000a (uint16 LE) constant marker (= 10) -[6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s at Normal) +[6] T_peak_count uint8 Tran peak (count × 0.005 → in/s at Normal, + max 1.275 in/s — fits in uint8) +[7] T_annotation uint8 empirically non-zero on intervals with sub-Hz + or unmeasurable freq; meaning not fully RE'd [8:10] T_halfperiod uint16 LE Tran half-period in samples (freq_Hz = 512 / halfp; ≤ 5 means ">100 Hz") -[10:12] V_peak_count uint16 LE Vert peak +[10] V_peak_count uint8 Vert peak +[11] V_annotation uint8 [12:14] V_halfperiod uint16 LE Vert freq half-period -[14:16] L_peak_count uint16 LE Long peak +[14] L_peak_count uint8 Long peak +[15] L_annotation uint8 [16:18] L_halfperiod uint16 LE Long freq half-period -[18:20] M_peak_count uint16 LE MicL peak count +[18] M_peak_count uint8 MicL peak count (dB via waveform_codec.mic_count_to_db) +[19] M_annotation uint8 [20:22] M_halfperiod uint16 LE MicL freq half-period [22:24] 0x00 0x00 constant [24:28] 4-byte variable purpose unknown — possibly CRC, @@ -99,6 +119,16 @@ slot[8] = 9 → 512/9 = 56.9 → 57 Hz ✓ M_freq ## What's NOT yet decoded +- **Annotation bytes (`block[7]/[11]/[15]/[19]`)**. Empirically + non-zero on intervals where the per-channel ZC frequency comes + out as `N/A` or sub-Hz (`<1.0`, `1.X`). Hypothesis tested in the + RE session: byte != 0 ↔ sub-Hz freq. Only ~50% correlation + across the K558 corpus, so the relationship is more complex. + Possibilities: time-of-peak-within-interval, halfp extension for + very-long-period signals, or a debug/diagnostic field the firmware + writes opportunistically. Doesn't affect peak amplitudes or + waveform reconstruction. Captured as `record["annotations"]` for + future RE. - **4-byte variable metadata field (bytes 24:28)**. Not needed for waveform reconstruction. Speculation: per-block CRC, sub-second timestamp offset, or a Mic psi(L) count not in the 9 samples. diff --git a/minimateplus/histogram_codec.py b/minimateplus/histogram_codec.py index c969f45..36e399d 100644 --- a/minimateplus/histogram_codec.py +++ b/minimateplus/histogram_codec.py @@ -28,18 +28,32 @@ iterate 32-stride and stop before the tail. [1] segment_id (uint8) 0x00..0x03 — 256 blocks per segment [2:4] block_ctr (uint16 LE) resets each segment (0x0100, 0x0101, …) [4:6] 0x000a (uint16 LE) constant marker (= 10) - [6:8] T_peak_count uint16 LE Tran peak (count × 0.005 → in/s) + [6] T_peak_count uint8 Tran peak (count × 0.005 → in/s, max 1.275 in/s) + [7] T_annotation uint8 empirically non-zero on intervals with sub-Hz + or unmeasurable Tran freq; meaning not fully RE'd [8:10] T_halfperiod uint16 LE Tran half-period in samples (freq = 512 / halfp Hz) - [10:12] V_peak_count uint16 LE + [10] V_peak_count uint8 + [11] V_annotation uint8 [12:14] V_halfperiod uint16 LE - [14:16] L_peak_count uint16 LE + [14] L_peak_count uint8 + [15] L_annotation uint8 [16:18] L_halfperiod uint16 LE - [18:20] M_peak_count uint16 LE MicL peak (count → dB via mic_count_to_db) + [18] M_peak_count uint8 MicL peak (count → dB via mic_count_to_db) + [19] M_annotation uint8 [20:22] M_halfperiod uint16 LE MicL half-period in samples (freq = 512 / halfp Hz) [22:24] 0x00 0x00 constant [24:28] 4-byte variable purpose unknown (possibly CRC or timestamp delta) [28:32] 0x1e 0x0a 0x00 0x00 constant block-end signature +NOTE on peak-count width: an earlier interpretation treated the peak +fields as uint16 LE spanning [6:8] / [10:12] / [14:16] / [18:20]. +That happened to be byte-exact against the N844 fixture corpus only +because every annotation byte in those fixtures was zero, making +``uint16 LE == uint8``. Cross-correlating BE9558 (K558) Tran-drift +and BE18003 (T003) Histogram+Continuous events against the BW ASCII +export proved peak is uint8 alone — see test_histogram_codec.py +and docs/histogram_codec_re_status.md. + Block-identification anchor: ``block[22:24] == b"\\x00\\x00"`` AND ``block[28:32] == b"\\x1e\\x0a\\x00\\x00"``. This is the reliable distinguisher from non-block content in the file. @@ -128,17 +142,40 @@ def _is_data_block(block: bytes) -> bool: return True -def _decode_block(block: bytes) -> dict: +def _decode_block(block: bytes) -> Optional[dict]: """Decode one 32-byte histogram block. Caller must have validated - with ``_is_data_block`` first.""" - # All 16-bit fields are little-endian unsigned. Peak counts are - # always non-negative; half-periods are always positive when valid. - t_peak, t_halfp, v_peak, v_halfp, l_peak, l_halfp, m_peak, m_halfp = struct.unpack_from( - " dict: "m_peak": m_peak, "m_halfp": m_halfp, "meta_var": var_meta, + "annotations": annotations, } @@ -160,6 +198,13 @@ def walk_body(body: bytes) -> List[dict]: Iterates 32-byte strides from offset 0. Yields a decoded record for every block that passes ``_is_data_block`` validation. Stops when the remaining bytes are too short to form a complete block. + + In Histogram+Continuous mode the body interleaves data blocks with + other 32-byte content (likely continuous-mode waveform blocks) that + fail the data-block validation; the walker naturally skips them + without losing 32-byte alignment. Use ``block_ctr`` from each + returned record to map back to the original interval index — the + record list is sparse when other block types are interleaved. """ records: List[dict] = [] for off in range(0, len(body) - _BLOCK_SIZE + 1, _BLOCK_SIZE): @@ -169,7 +214,13 @@ def walk_body(body: bytes) -> List[dict]: # Continue walking — block alignment is fixed at 32-stride # from offset 0, so we don't lose alignment by skipping. continue - records.append(_decode_block(blk)) + decoded = _decode_block(blk) + if decoded is None: + # Block validated as a histogram block but had peak fields + # outside the plausible range — undocumented extension. + # Skip rather than propagating bogus PVS contributions. + continue + records.append(decoded) return records diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b71bd89..bbe0d0f 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -54,14 +54,26 @@ log = logging.getLogger("backfill_sidecars") def _looks_like_event_file(path: Path) -> bool: - """Same heuristic as the importer CLI.""" + """Same heuristic as the importer CLI. + + Filters to BW (Series III) event files only — Thor (Series IV) + `.IDFW` / `.IDFH` files share the store but have their own ingest + path (`WaveformStore.save_imported_idf`) and are NOT decodable by + `event_file_io.read_blastware_file`. Their sidecars are populated + at ingest from the paired `.IDFW.txt` ASCII report; nothing the + backfill regenerates would improve on them, so we exclude them + from scope. + """ if not path.is_file(): return False - if path.name.endswith((".a5.pkl", ".sfm.json")): + if path.name.endswith((".a5.pkl", ".sfm.json", ".h5")): return False ext = path.suffix.lstrip(".") if not (3 <= len(ext) <= 4): return False + # Thor IDF files share the .{W,H}-suffix shape but aren't BW. + if ext.upper() in ("IDFW", "IDFH"): + return False if not (ext[-1].upper() in {"W", "H"} or ext.endswith("0")): return False try: @@ -275,16 +287,25 @@ def main(argv=None) -> int: or ev.total_samples < derived // 4): ev.total_samples = derived - # Preserve user-edited review state + extensions from the - # existing sidecar (false_trigger flag, notes, etc.) so a - # backfill never wipes them out. - preserved_review = None - preserved_ext = None + # Preserve user-edited review state + extensions + the + # bw_report block from the existing sidecar so a backfill + # never wipes them out. The bw_report block originates + # from the paired .TXT ASCII report parsed at ORIGINAL + # import time (ach forward / direct upload); the .TXT + # file is not in the waveform store, so we can't re-derive + # it from disk. event_to_sidecar_dict takes a + # BwAsciiReport dataclass (not a dict), so for bw_report + # we overlay the existing block after regen instead of + # passing it as a kwarg. + preserved_review = None + preserved_ext = None + preserved_bw_report = None if sidecar_path.exists(): try: _existing = event_file_io.read_sidecar(sidecar_path) - preserved_review = _existing.get("review") - preserved_ext = _existing.get("extensions") + preserved_review = _existing.get("review") + preserved_ext = _existing.get("extensions") + preserved_bw_report = _existing.get("bw_report") except Exception: pass @@ -299,6 +320,8 @@ def main(argv=None) -> int: review=preserved_review, extensions=preserved_ext, ) + if preserved_bw_report is not None: + sidecar["bw_report"] = preserved_bw_report # Also emit the .h5 clean-waveform file when: # - it's missing, OR diff --git a/scripts/check_bw_report_preservation.py b/scripts/check_bw_report_preservation.py new file mode 100644 index 0000000..2402ffe --- /dev/null +++ b/scripts/check_bw_report_preservation.py @@ -0,0 +1,185 @@ +""" +scripts/check_bw_report_preservation.py — verify that running backfill_sidecars +doesn't wipe the `bw_report` block from sidecars that already had one. + +Two-step workflow: + + # Before running backfill — capture a baseline snapshot: + python scripts/check_bw_report_preservation.py snapshot \ + --store-root /path/to/waveforms \ + --out before.json + + # Run backfill: + python scripts/backfill_sidecars.py --store-root /path/to/waveforms --force + + # After backfill — diff against the baseline: + python scripts/check_bw_report_preservation.py diff \ + --store-root /path/to/waveforms \ + --baseline before.json + +The diff classifies every sidecar into one of: + + PRESERVED had bw_report before, has same hash now ← GOOD + CHANGED had bw_report before, has different hash now ← suspicious + (backfill should only ever copy the block verbatim) + WIPED had bw_report before, doesn't now ← BUG — data loss + STILL_MISSING didn't have bw_report before, still doesn't ← expected + NEW didn't have bw_report before, has one now + (only possible if a re-ingest happened between snapshots; + shouldn't happen during backfill) + REMOVED sidecar existed in baseline, file is gone now + ADDED sidecar didn't exist in baseline, exists now + +Exit code is 0 if no WIPED or CHANGED entries are found, 1 otherwise. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from pathlib import Path +from typing import Optional + +# Allow running from the repo root without installation. +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from minimateplus import event_file_io + + +def _bw_report_hash(sidecar_data: dict) -> Optional[str]: + """Canonical-JSON hash of the bw_report block, or None if absent.""" + br = sidecar_data.get("bw_report") + if not br: + return None + # sort_keys for stable hashing across dict-ordering differences + blob = json.dumps(br, sort_keys=True, separators=(",", ":")) + return hashlib.sha256(blob.encode()).hexdigest() + + +def _scan_store(store_root: Path) -> dict: + """Walk every /.sfm.json and return {relpath: hash_or_None}. + + Relpath is `/` — stable across machines/snapshots. + """ + out: dict[str, Optional[str]] = {} + for serial_dir in sorted(p for p in store_root.iterdir() if p.is_dir()): + for sidecar in sorted(serial_dir.glob("*.sfm.json")): + relpath = f"{serial_dir.name}/{sidecar.name}" + try: + data = event_file_io.read_sidecar(sidecar) + except Exception as exc: + print(f" WARN: failed to read {relpath}: {exc}", file=sys.stderr) + continue + out[relpath] = _bw_report_hash(data) + return out + + +def cmd_snapshot(args) -> int: + store_root = Path(args.store_root).expanduser().resolve() + if not store_root.exists(): + print(f"error: store root does not exist: {store_root}", file=sys.stderr) + return 2 + out_path = Path(args.out).expanduser().resolve() + + print(f"Scanning {store_root} …") + snapshot = _scan_store(store_root) + + with_bw = sum(1 for v in snapshot.values() if v is not None) + without_bw = sum(1 for v in snapshot.values() if v is None) + print(f" total sidecars: {len(snapshot)}") + print(f" with bw_report: {with_bw}") + print(f" without bw_report: {without_bw}") + + out_path.parent.mkdir(parents=True, exist_ok=True) + with open(out_path, "w") as f: + json.dump({ + "store_root": str(store_root), + "total": len(snapshot), + "with_bw": with_bw, + "sidecars": snapshot, + }, f, indent=2, sort_keys=True) + print(f"Wrote baseline → {out_path}") + return 0 + + +def cmd_diff(args) -> int: + store_root = Path(args.store_root).expanduser().resolve() + if not store_root.exists(): + print(f"error: store root does not exist: {store_root}", file=sys.stderr) + return 2 + baseline_path = Path(args.baseline).expanduser().resolve() + if not baseline_path.exists(): + print(f"error: baseline file not found: {baseline_path}", file=sys.stderr) + return 2 + + with open(baseline_path) as f: + baseline = json.load(f) + before = baseline["sidecars"] + print(f"Scanning {store_root} for comparison against {baseline_path.name} …") + after = _scan_store(store_root) + + classes = {k: [] for k in ( + "PRESERVED", "CHANGED", "WIPED", "STILL_MISSING", "NEW", "REMOVED", "ADDED", + )} + all_keys = set(before) | set(after) + for key in sorted(all_keys): + b = before.get(key, "__MISSING__") + a = after.get(key, "__MISSING__") + if b == "__MISSING__": + classes["ADDED"].append(key) + elif a == "__MISSING__": + classes["REMOVED"].append(key) + elif b is None and a is None: + classes["STILL_MISSING"].append(key) + elif b is None and a is not None: + classes["NEW"].append(key) + elif b is not None and a is None: + classes["WIPED"].append(key) + elif b == a: + classes["PRESERVED"].append(key) + else: + classes["CHANGED"].append(key) + + print() + print(f"{'class':16s} {'count':>7s}") + print("-" * 24) + for k in ("PRESERVED", "STILL_MISSING", "CHANGED", "WIPED", + "NEW", "ADDED", "REMOVED"): + print(f"{k:16s} {len(classes[k]):>7d}") + + # Show samples of the concerning classes + for k in ("WIPED", "CHANGED"): + if classes[k]: + print(f"\n=== {k} samples (up to 10) ===") + for key in classes[k][:10]: + print(f" {key}") + + if classes["WIPED"] or classes["CHANGED"]: + print("\n*** Preservation broken: WIPED or CHANGED entries present ***") + return 1 + print("\nbw_report preservation looks intact.") + return 0 + + +def main(argv=None) -> int: + p = argparse.ArgumentParser(description=__doc__) + sub = p.add_subparsers(dest="cmd", required=True) + + p_snap = sub.add_parser("snapshot", help="capture baseline bw_report hashes") + p_snap.add_argument("--store-root", required=True) + p_snap.add_argument("--out", required=True, help="output JSON path") + p_snap.set_defaults(func=cmd_snapshot) + + p_diff = sub.add_parser("diff", help="diff current store against a baseline") + p_diff.add_argument("--store-root", required=True) + p_diff.add_argument("--baseline", required=True, help="JSON from `snapshot`") + p_diff.set_defaults(func=cmd_diff) + + args = p.parse_args(argv) + return args.func(args) + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/test_histogram_codec.py b/tests/test_histogram_codec.py index 8e521f3..6a42e27 100644 --- a/tests/test_histogram_codec.py +++ b/tests/test_histogram_codec.py @@ -335,3 +335,51 @@ def test_geo_count_to_ins_scale(): assert geo_count_to_ins(1) == pytest.approx(0.005) assert geo_count_to_ins(10) == pytest.approx(0.050) assert geo_count_to_ins(0) == 0.0 + + +# ── Regression: peak is uint8 byte[N], NOT uint16 LE byte[N:N+2] ──────────── +# +# Block taken verbatim from K558LKZU.RE0H (BE9558) interval 12 — a real +# field event where the Tran channel had developed a DC offset and was +# producing sub-Hz drift content the device couldn't characterize. +# The annotation byte at [7] = 0xd2 is non-zero in that case. The +# legacy codec read [6:8] as uint16 LE, producing T_peak = 53763 → +# 268 in/s — physically impossible and 35× too high for the actual +# 0.015 in/s value (T_lo = 3 alone gives the correct count). +# Verified against the paired BW ASCII export. +_K558_INTERVAL_12_BLOCK = bytes.fromhex( + "00 00 0c 01 0a 00 03 d2 45 00 02 00 02 00 02 00" + "02 00 10 00 06 00 00 00 0e 91 2f 00 1e 0a 00 00".replace(" ", "") +) + + +def test_extension_byte_does_not_inflate_peak(): + """The annotation byte at [7]/[11]/[15]/[19] must NOT contribute to + the peak count. Decoded T_peak must be 3 (uint8 byte[6]), NOT + 53763 (uint16 LE byte[6:8]).""" + body = _K558_INTERVAL_12_BLOCK + records = decode_histogram_body_full(body) + assert records is not None + assert len(records) == 1 + r = records[0] + assert r["t_peak"] == 3, f"T_peak should be 3 (uint8), got {r['t_peak']}" + assert r["v_peak"] == 2 + assert r["l_peak"] == 2 + assert r["m_peak"] == 16 + # Half-periods unchanged — still uint16 LE. + assert r["t_halfp"] == 0x0045 # 69 → 7.4 Hz + assert r["m_halfp"] == 6 # → 85.3 Hz + # Annotation byte is preserved (for future RE) but does not affect peak. + assert r["annotations"] == (0xd2, 0x00, 0x00, 0x00) + + +def test_extension_byte_decoded_to_correct_in_s(): + """End-to-end: the channel-grouped output for the K558 ext block + should give T = 3 counts = 0.015 in/s, not 53763 counts = 268 in/s.""" + channels = decode_histogram_body(_K558_INTERVAL_12_BLOCK) + assert channels is not None + assert channels["Tran"] == [3] + assert geo_count_to_ins(channels["Tran"][0]) == pytest.approx(0.015) + assert channels["Vert"] == [2] + assert channels["Long"] == [2] + assert channels["MicL"] == [16]