From 88549bc659df58b8402a75e9c8390ed89a85cb10 Mon Sep 17 00:00:00 2001 From: serversdown Date: Thu, 21 May 2026 01:20:08 +0000 Subject: [PATCH] backfill_sidecars: filter out Thor IDF files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Discovered while dry-running the backfill on prod: the waveform store contains both BW (.AB0*/.N00) and Thor IDF (.IDFW/.IDFH) event files side-by-side because both go through the same per-serial directory layout. The script's `_looks_like_event_file` heuristic accepted any 3-4 char extension ending in W, H, or 0; the W/H rule therefore matched Thor `.IDFW`/`.IDFH` files as well as BW ones. The script then routes everything through `event_file_io.read_blastware_file`, which rejects IDF files with "not a Blastware file (bad header prefix)" — 3807 errors on prod out of 7201 total events. Thor IDF events have their own ingest path (`WaveformStore.save_imported_idf`) and their sidecars are populated at ingest from the paired `.IDFW.txt` ASCII report. The backfill script has no value to add for them — there's no decoder to refresh, and the sidecar metadata is already correct. Filter them out. After this fix, the prod backfill should run clean: ~3392 BW events get sidecar+h5 regen as expected; the ~3807 Thor IDF events are silently skipped. The proper "IDF backfill" (refresh tool_version stamp on IDF sidecars by re-running event_to_sidecar_dict against the stored DB row + sidecar extensions block) is a separate, narrower follow-up — not blocking the BW backfill rollout. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/backfill_sidecars.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index b71bd89..2b9533e 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -54,14 +54,26 @@ log = logging.getLogger("backfill_sidecars") def _looks_like_event_file(path: Path) -> bool: - """Same heuristic as the importer CLI.""" + """Same heuristic as the importer CLI. + + Filters to BW (Series III) event files only — Thor (Series IV) + `.IDFW` / `.IDFH` files share the store but have their own ingest + path (`WaveformStore.save_imported_idf`) and are NOT decodable by + `event_file_io.read_blastware_file`. Their sidecars are populated + at ingest from the paired `.IDFW.txt` ASCII report; nothing the + backfill regenerates would improve on them, so we exclude them + from scope. + """ if not path.is_file(): return False - if path.name.endswith((".a5.pkl", ".sfm.json")): + if path.name.endswith((".a5.pkl", ".sfm.json", ".h5")): return False ext = path.suffix.lstrip(".") if not (3 <= len(ext) <= 4): return False + # Thor IDF files share the .{W,H}-suffix shape but aren't BW. + if ext.upper() in ("IDFW", "IDFH"): + return False if not (ext[-1].upper() in {"W", "H"} or ext.endswith("0")): return False try: