From c4648c195925597b4d81ef2e0259209700859260 Mon Sep 17 00:00:00 2001
From: serversdown
Date: Wed, 20 May 2026 20:16:31 +0000
Subject: [PATCH] scripts/backfill_sidecars: skip .h5 write when decoder returned no samples
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Discovered while dry-running the backfill on the prod store: ~10,000 of
10,059 events are histogram-mode (filename extension `*H`), and the
waveform-body codec wired in via the previous commit doesn't handle
histogram-mode bodies — only the waveform-mode codec at §7.6.1 is
implemented; the histogram-mode codec at §7.6.2 of the protocol reference
is documented but no Python implementation exists yet.

Without this guard, every histogram event's .h5 file would be *replaced*
with an empty one — strictly worse than today's broken-int16-LE .h5
because any downstream viewer expecting non-empty sample arrays would now
error out instead of just rendering wrong values.

Fix: after the decoder runs, check whether any channel has samples. If
not, skip the .h5 write entirely. The sidecar still regenerates
(refreshing the tool_version stamp and any peaks/project info from the DB
row), but the existing .h5 is left untouched.

This is a *temporary* gate. When the histogram codec lands (next branch:
`feat/wire-histogram-codec`), the has_samples check can be removed and the
backfill will then correctly regenerate all .h5 files, histogram and
waveform alike.

Observed effect (dry-run on prod store, 10,059 events): - waveform events (~5%): "[DRY ] would write … + .h5 (would (re)write)" - histogram events (~95%): "[DRY ] would write … + .h5 (skipped-empty-samples)" - sidecar tool_version bump succeeds for both Co-Authored-By: Claude Opus 4.7 (1M context) --- scripts/backfill_sidecars.py | 24 ++++++++++++++++++++++-- 1 file changed, 22 insertions(+), 2 deletions(-) diff --git a/scripts/backfill_sidecars.py b/scripts/backfill_sidecars.py index 8037d1f..36d8747 100644 --- a/scripts/backfill_sidecars.py +++ b/scripts/backfill_sidecars.py @@ -311,12 +311,32 @@ def main(argv=None) -> int: # int16-LE codec era — bumping TOOL_VERSION to 0.20.0+ # marks every pre-codec sidecar stale, which now # correctly cascades to .h5 regeneration too. + # + # Skip the .h5 write when the decoder couldn't produce + # samples — this is the histogram-mode case today + # (waveform_codec.decode_waveform_v2 only handles the + # waveform-mode body format per §7.6.1; the histogram + # codec at §7.6.2 is documented but not yet implemented). + # Without this check we'd replace the existing (broken + # int16-LE) histogram .h5 with an empty one, which is + # arguably worse for any consumer expecting non-empty + # sample arrays. When the histogram codec lands, this + # check can come out. + has_samples = bool( + ev.raw_samples and any( + ev.raw_samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL") + ) + ) hdf5_path = store.hdf5_path_for(serial, path.name) hdf5_filename = hdf5_path.name if hdf5_path.exists() else None hdf5_action = "kept" - need_h5 = not args.skip_hdf5 and ( - args.force or not hdf5_path.exists() or sidecar_stale + need_h5 = ( + not args.skip_hdf5 + and (args.force or not hdf5_path.exists() or sidecar_stale) + and has_samples ) + if not has_samples and not args.skip_hdf5: + hdf5_action = "skipped-empty-samples" if need_h5: if args.dry_run: hdf5_action = "would (re)write"