diff --git a/scripts/check_bw_report_preservation.py b/scripts/check_bw_report_preservation.py new file mode 100644 index 0000000..2402ffe --- /dev/null +++ b/scripts/check_bw_report_preservation.py @@ -0,0 +1,185 @@ +""" +scripts/check_bw_report_preservation.py — verify that running backfill_sidecars +doesn't wipe the `bw_report` block from sidecars that already had one. + +Two-step workflow: + + # Before running backfill — capture a baseline snapshot: + python scripts/check_bw_report_preservation.py snapshot \ + --store-root /path/to/waveforms \ + --out before.json + + # Run backfill: + python scripts/backfill_sidecars.py --store-root /path/to/waveforms --force + + # After backfill — diff against the baseline: + python scripts/check_bw_report_preservation.py diff \ + --store-root /path/to/waveforms \ + --baseline before.json + +The diff classifies every sidecar into one of: + + PRESERVED had bw_report before, has same hash now ← GOOD + CHANGED had bw_report before, has different hash now ← suspicious + (backfill should only ever copy the block verbatim) + WIPED had bw_report before, doesn't now ← BUG — data loss + STILL_MISSING didn't have bw_report before, still doesn't ← expected + NEW didn't have bw_report before, has one now + (only possible if a re-ingest happened between snapshots; + shouldn't happen during backfill) + REMOVED sidecar existed in baseline, file is gone now + ADDED sidecar didn't exist in baseline, exists now + +Exit code is 0 if no WIPED or CHANGED entries are found, 1 otherwise. +""" + +from __future__ import annotations + +import argparse +import hashlib +import json +import sys +from pathlib import Path +from typing import Optional + +# Allow running from the repo root without installation. 
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

# Must come after the sys.path tweak so the in-repo package resolves.
from minimateplus import event_file_io  # noqa: E402

# Marks "no entry for this sidecar" in a scan. Distinct from None, which means
# "sidecar present but without a bw_report block".
_ABSENT = object()


def _bw_report_hash(sidecar_data: dict) -> Optional[str]:
    """Return a canonical-JSON SHA-256 hash of the bw_report block.

    Returns None when the "bw_report" key is absent or its value is falsy
    (e.g. an empty dict), i.e. an empty block counts as "no bw_report".
    """
    br = sidecar_data.get("bw_report")
    if not br:
        return None
    # sort_keys + fixed separators give a stable serialization regardless of
    # dict ordering or whitespace differences in the sidecar file.
    blob = json.dumps(br, sort_keys=True, separators=(",", ":"))
    return hashlib.sha256(blob.encode()).hexdigest()


def _scan_store(store_root: Path, unreadable: Optional[set] = None) -> dict:
    """Walk every `<store_root>/<serial>/*.sfm.json` and hash its bw_report.

    Returns {relpath: hash_or_None}, where relpath is `<serial>/<filename>`
    (independent of where the store root lives) and the value is None when
    the sidecar has no bw_report block.

    Sidecars that fail to load are reported on stderr and left out of the
    result. If `unreadable` is given, their relpaths are also added to it so
    the caller can tell "file is gone" apart from "file is there but broken".
    """
    out: dict[str, Optional[str]] = {}
    for serial_dir in sorted(p for p in store_root.iterdir() if p.is_dir()):
        for sidecar in sorted(serial_dir.glob("*.sfm.json")):
            relpath = f"{serial_dir.name}/{sidecar.name}"
            try:
                data = event_file_io.read_sidecar(sidecar)
            except Exception as exc:
                # Best-effort scan: one bad file must not abort the whole walk.
                print(f" WARN: failed to read {relpath}: {exc}", file=sys.stderr)
                if unreadable is not None:
                    unreadable.add(relpath)
                continue
            out[relpath] = _bw_report_hash(data)
    return out


def _resolve_store_root(raw: str) -> Optional[Path]:
    """Resolve the --store-root argument to an existing directory.

    Prints an error to stderr and returns None when the path does not exist
    or is not a directory (scanning would otherwise crash in iterdir()).
    """
    store_root = Path(raw).expanduser().resolve()
    if not store_root.exists():
        print(f"error: store root does not exist: {store_root}", file=sys.stderr)
        return None
    if not store_root.is_dir():
        print(f"error: store root is not a directory: {store_root}", file=sys.stderr)
        return None
    return store_root


def _load_baseline(baseline_path: Path) -> Optional[dict]:
    """Load the {relpath: hash_or_None} mapping from a snapshot file.

    Prints an error to stderr and returns None when the file cannot be read,
    is not valid JSON, or lacks a "sidecars" mapping.
    """
    try:
        with open(baseline_path, encoding="utf-8") as f:
            baseline = json.load(f)
    except (OSError, ValueError) as exc:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
        print(f"error: cannot read baseline {baseline_path}: {exc}", file=sys.stderr)
        return None
    sidecars = baseline.get("sidecars") if isinstance(baseline, dict) else None
    if not isinstance(sidecars, dict):
        print(
            f"error: baseline has no 'sidecars' mapping: {baseline_path}",
            file=sys.stderr,
        )
        return None
    return sidecars


def cmd_snapshot(args) -> int:
    """Scan the store and write a baseline JSON of bw_report hashes.

    Reads `args.store_root` and `args.out`. Returns 0 on success, 2 when the
    store root is missing or not a directory.
    """
    store_root = _resolve_store_root(args.store_root)
    if store_root is None:
        return 2
    out_path = Path(args.out).expanduser().resolve()

    print(f"Scanning {store_root} …")
    snapshot = _scan_store(store_root)

    with_bw = sum(1 for v in snapshot.values() if v is not None)
    without_bw = len(snapshot) - with_bw
    print(f" total sidecars: {len(snapshot)}")
    print(f" with bw_report: {with_bw}")
    print(f" without bw_report: {without_bw}")

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "store_root": str(store_root),
            "total": len(snapshot),
            "with_bw": with_bw,
            "sidecars": snapshot,
        }, f, indent=2, sort_keys=True)
    print(f"Wrote baseline → {out_path}")
    return 0


def cmd_diff(args) -> int:
    """Compare the current store against a baseline and classify each sidecar.

    Reads `args.store_root` and `args.baseline`. Besides the classes listed in
    the module docstring, a sidecar that had a bw_report in the baseline and
    is still on disk but can no longer be read is reported as UNREADABLE
    instead of REMOVED, because its bw_report can no longer be recovered.

    Returns 0 when no WIPED, CHANGED or UNREADABLE entries are found, 1 when
    any are, and 2 on usage errors (bad store root, missing or invalid
    baseline).
    """
    store_root = _resolve_store_root(args.store_root)
    if store_root is None:
        return 2
    baseline_path = Path(args.baseline).expanduser().resolve()
    if not baseline_path.exists():
        print(f"error: baseline file not found: {baseline_path}", file=sys.stderr)
        return 2
    before = _load_baseline(baseline_path)
    if before is None:
        return 2

    print(f"Scanning {store_root} for comparison against {baseline_path.name} …")
    unreadable: set[str] = set()
    after = _scan_store(store_root, unreadable)

    classes = {k: [] for k in (
        "PRESERVED", "CHANGED", "WIPED", "UNREADABLE", "STILL_MISSING",
        "NEW", "REMOVED", "ADDED",
    )}
    for key in sorted(set(before) | set(after)):
        b = before.get(key, _ABSENT)
        a = after.get(key, _ABSENT)
        if b is _ABSENT:
            classes["ADDED"].append(key)
        elif a is _ABSENT:
            # Missing from the scan either because the file is gone or because
            # it failed to load; only the latter loses a known bw_report
            # without the file having been deleted.
            if b is not None and key in unreadable:
                classes["UNREADABLE"].append(key)
            else:
                classes["REMOVED"].append(key)
        elif b is None and a is None:
            classes["STILL_MISSING"].append(key)
        elif b is None:
            classes["NEW"].append(key)
        elif a is None:
            classes["WIPED"].append(key)
        elif b == a:
            classes["PRESERVED"].append(key)
        else:
            classes["CHANGED"].append(key)

    print()
    print(f"{'class':16s} {'count':>7s}")
    print("-" * 24)
    for k in ("PRESERVED", "STILL_MISSING", "CHANGED", "WIPED", "UNREADABLE",
              "NEW", "ADDED", "REMOVED"):
        print(f"{k:16s} {len(classes[k]):>7d}")

    # Show samples of the concerning classes
    for k in ("WIPED", "CHANGED", "UNREADABLE"):
        if classes[k]:
            print(f"\n=== {k} samples (up to 10) ===")
            for key in classes[k][:10]:
                print(f" {key}")

    broken = bool(classes["WIPED"] or classes["CHANGED"])
    if broken:
        print("\n*** Preservation broken: WIPED or CHANGED entries present ***")
    if classes["UNREADABLE"]:
        print("\n*** Preservation unverifiable: UNREADABLE entries present ***")
    if broken or classes["UNREADABLE"]:
        return 1
    print("\nbw_report preservation looks intact.")
    return 0


def main(argv=None) -> int:
    """Parse the command line and dispatch to `snapshot` or `diff`.

    `argv` defaults to sys.argv[1:] (argparse behaviour). Returns the exit
    code of the selected subcommand.
    """
    p = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's usage examples and class table laid out
        # as written instead of letting argparse reflow them.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    sub = p.add_subparsers(dest="cmd", required=True)

    p_snap = sub.add_parser("snapshot", help="capture baseline bw_report hashes")
    p_snap.add_argument("--store-root", required=True)
    p_snap.add_argument("--out", required=True, help="output JSON path")
    p_snap.set_defaults(func=cmd_snapshot)

    p_diff = sub.add_parser("diff", help="diff current store against a baseline")
    p_diff.add_argument("--store-root", required=True)
    p_diff.add_argument("--baseline", required=True, help="JSON from `snapshot`")
    p_diff.set_defaults(func=cmd_diff)

    args = p.parse_args(argv)
    return args.func(args)


if __name__ == "__main__":
    sys.exit(main())