From fa9d3cdef20a1d53e59226d458b746a360181420 Mon Sep 17 00:00:00 2001 From: serversdown Date: Wed, 20 May 2026 20:30:53 +0000 Subject: [PATCH] read_blastware_file: leave peak_values=None when samples can't be decoded MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes a data-loss bug discovered while dry-running the backfill against the prod store. Symptom: every histogram event in the store has its body run through read_blastware_file → codec returns None → samples = dict of empty channel lists → ``ev.peak_values = _peaks_from_samples(empty)`` returns ``PeakValues(0, 0, 0, 0, 0)`` (NOT None). The backfill script's existing "seed from DB row when peak_values is None" branch then *skips* the seeding (as designed, because peak_values is not None), and the all-zeros PeakValues flows into ``db.insert_events()``'s UPSERT path, OVERWRITING the existing good DB peak values for that event (which were populated from the paired BW ASCII report at ingest). Net effect: running the backfill on prod would have wiped the PPV / mic / vector-sum columns for ~10,000 histogram events. Fix: only compute peaks-from-samples when there are actually samples. For events the codec couldn't decode (histogram-mode bodies, until the §7.6.2 histogram codec is wired in), leave peak_values=None as the "we don't know" signal. Downstream consumers: - backfill_sidecars.py — its existing ``if ev.peak_values is None:`` branch (line 243) seeds from the DB row, preserving the real BW-report peaks across the regen. - WaveformStore.save_imported_bw — apply_report_to_event overlays peaks from the paired BW ASCII report when one was uploaded. Histogram imports without a paired report end up with NULL peaks in the DB, which is correct (better than zeros — clearly says "no peak data available" rather than "peaks are exactly zero"). Updated the existing synthetic-event round-trip test to expect peak_values=None for the no-real-body case, which is now the actual behavior. 
The 7 fixture-corpus regression tests for real BW waveforms continue to pass — those have decodable samples, so peak_values is still populated from the codec output as before. Co-Authored-By: Claude Opus 4.7 (1M context) --- minimateplus/event_file_io.py | 13 ++++++++++++- tests/test_event_file_io.py | 12 +++++++++--- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/minimateplus/event_file_io.py b/minimateplus/event_file_io.py index a7980f1..c3d273c 100644 --- a/minimateplus/event_file_io.py +++ b/minimateplus/event_file_io.py @@ -811,7 +811,18 @@ def read_blastware_file(path: Union[str, Path]) -> Event: project=project, client=client, operator=user, sensor_location=seisloc, ) ev.raw_samples = samples - ev.peak_values = _peaks_from_samples(samples) + # Only compute peaks from samples when we actually have samples. + # For events the codec couldn't decode (histogram-mode bodies, until + # the §7.6.2 histogram codec is wired in), samples is an empty dict + # and ``_peaks_from_samples`` would return PeakValues(0, 0, 0, 0, 0). + # That would then OVERWRITE existing good DB peak values (e.g. from + # paired BW ASCII reports) during the backfill UPSERT path. + # Leaving peak_values=None signals "we don't know" to downstream + # consumers; the backfill script seeds from the DB row when it sees + # None, and ``apply_report_to_event`` overlays from a paired ASCII + # report when one is supplied. + has_samples = any(samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL")) + ev.peak_values = _peaks_from_samples(samples) if has_samples else None ev._a5_frames = None # not recoverable from BW file return ev diff --git a/tests/test_event_file_io.py b/tests/test_event_file_io.py index d8b5793..6e08dae 100644 --- a/tests/test_event_file_io.py +++ b/tests/test_event_file_io.py @@ -289,9 +289,15 @@ def test_read_blastware_file_round_trip(tmp_path: Path): assert parsed.timestamp.second == ev.timestamp.second # No A5 source recoverable. 
assert parsed._a5_frames is None - # Peaks computed from samples (synthetic = zero samples → zero peaks). - assert parsed.peak_values is not None - assert parsed.peak_values.peak_vector_sum == 0.0 + # The synthetic event has no real waveform body, so the codec can't + # decode samples → read_blastware_file leaves peak_values=None + # (the "we don't know" signal) rather than fabricating all-zero + # peaks that would otherwise overwrite real DB values via UPSERT. + assert parsed.peak_values is None + assert parsed.raw_samples is not None + # Empty channels — codec returned None for the malformed synthetic body. + for ch in ("Tran", "Vert", "Long", "MicL"): + assert parsed.raw_samples[ch] == [] _BW_CODEC_FIXTURES = [