fix(idf_waveforms): _find_waveform_body_offset() — scans every 00 02 00 magic past offset 0x0E00, runs decode_waveform_v2 on each candidate, picks the one that returns the most samples. Validated on 483 prod IDFW files: 0 preamble-only events (was ~50%), 355/483 fully decode, 126/483 partial (BW codec walker-stops-early on loud events — known issue).

IDFH now synthesises a 1-sample-per-interval array from the binary intervals and writes an .h5 so the existing renderer works unchanged. Each "sample" is the per-interval peak ADC count → h5_value = count × geo_fs/32768 yields the right bar height.
2026-05-31 20:51:09 +00:00
parent 43f440812a
commit b2c565f217
3 changed files with 180 additions and 12 deletions
@@ -62,12 +62,23 @@ _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
 _BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
 _INSTANTEL_TAG = b"Instantel"
-# Constant body offset for sig-A IDFW files (verified across 151/154 corpus
+# Most common body offset for sig-A IDFW files (~50% of prod events;
-# files in tests/fixtures/THORDATA_example).  The body is the segment-rotated
+# 151/154 in the original tests/fixtures/THORDATA_example corpus).  The
-# block stream consumed by decode_waveform_v2; bytes [0:3] are the magic
+# body is the segment-rotated block stream consumed by decode_waveform_v2;
-# ``00 02 00`` preamble.
+# bytes [0:3] are the magic ``00 02 00`` preamble.  Production events
 # routinely use other offsets — see :func:`_find_waveform_body_offset`
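 # for the dynamic scan.  This constant is retained as the canonical
 # reference offset.  NOTE(review): the dynamic scan does not read it as
 # a priority hint; ties simply go to the lowest offset — confirm intent.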
 _BODY_START_SIG_A = 0x0F1F
 # Magic bytes that mark a candidate waveform-body preamble.
 _BODY_MAGIC = b"\x00\x02\x00"
 # Where to start looking for body candidates inside the file.  Skip the
 # fixed-header region where the same magic legitimately appears inside
 # channel-test records and the compliance block (offsets 0x015d, 0x091c,
 # 0x0ae2, 0x0d30 in observed events).
 _BODY_SCAN_FLOOR = 0x0E00
 # Geophone count → in/s, derived from sidecar ground truth: the smallest
 # non-zero sample in the 1,014-file corpus is 0.0003 in/s.
 _GEO_LSB_IPS = 0.0003
@@ -179,17 +190,65 @@ def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
 # ─── Sample decoder + unit conversion ───────────────────────────────────────
 def _find_waveform_body_offset(buf: bytes) -> Optional[int]:
    """Locate the waveform body by trial-decoding each magic candidate.

    Every occurrence of ``_BODY_MAGIC`` at or after ``_BODY_SCAN_FLOOR``
    is treated as a possible body start.  ``decode_waveform_v2()`` is run
    on the buffer tail beginning at that position, and the candidate
    whose decoded channels hold the most samples in total is selected.
    Candidates are visited in ascending offset order and only a strictly
    larger total replaces the current winner, so ties go to the lowest
    offset.

    Returns the winning file offset.  Returns ``None`` when the buffer is
    shorter than ``_BODY_SCAN_FLOOR + 8`` bytes, or when no candidate
    decodes to more than 2 samples in total (i.e. nothing beyond the
    preamble was found).
    """
    if len(buf) < _BODY_SCAN_FLOOR + 8:
        return None

    winner_offset: Optional[int] = None
    # Starting the bar at 2 folds the "more than the 2-sample preamble"
    # requirement and the "beats the current best" test into one compare.
    winner_count = 2

    pos = buf.find(_BODY_MAGIC, _BODY_SCAN_FLOOR)
    while pos >= 0:
        try:
            channels = decode_waveform_v2(buf[pos:])
        except Exception:
            # A candidate the decoder rejects is simply not the body.
            channels = None
        if channels:
            count = sum(len(samples) for samples in channels.values())
            if count > winner_count:
                winner_offset, winner_count = pos, count
        # Advance by one byte so overlapping magic sequences are also tried.
        pos = buf.find(_BODY_MAGIC, pos + 1)

    return winner_offset
 def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
-    """Decode samples from the sig-A body starting at file offset 0x0f1f.
+    """Decode samples from the sig-A waveform body.
    Returns the raw decoder counts dict — geo LSB = 0.0003 in/s, mic in
    its own count unit (see :func:`mic_count_to_psi`).  Returns None if
-    decoding fails.
+    no usable body is found.
    Uses :func:`_find_waveform_body_offset` to locate the body — the
    file-offset varies across events (~50% sit at the canonical
    ``0x0f1f`` but the rest don't), so the previous hardcoded constant
    silently produced 2-sample preamble-only output for half the corpus.
    """
-    if len(buf) < _BODY_START_SIG_A + 8:
+    off = _find_waveform_body_offset(buf)
    if off is None:
        return None
-    body = buf[_BODY_START_SIG_A:]
+    return decode_waveform_v2(buf[off:])
    return decode_waveform_v2(body)
 def geo_count_to_ips(count: int) -> float:
@@ -0,0 +1,91 @@
 """Re-ingest a prod IDFW + IDFH via the patched save_imported_idf and
 render both PDFs to confirm charts have data."""
 from __future__ import annotations
 import sys
 import json
 import datetime
 import tempfile
 from pathlib import Path
 sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
 from sfm.waveform_store import WaveformStore
 from sfm import report_pdf
 import h5py
 class FakeDb:
    """Minimal database stand-in that serves one fixed event row."""

    def __init__(self, event):
        # The row handed back by every get_event() call.
        self.event = event

    def get_event(self, _id):
        """Return the stored row; the requested id is ignored."""
        return self.event
 def to_ts_iso(ts):
    """Format a timestamp-like object as an ISO-8601 string.

    Only the ``year``, ``month``, ``day``, ``hour``, ``minute`` and
    ``second`` attributes of *ts* are read; anything else it carries is
    not part of the output.  Returns ``None`` when *ts* is ``None`` or
    when building the datetime fails for any reason (best-effort).
    """
    if ts is None:
        return None
    try:
        fields = (ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second)
        stamp = datetime.datetime(*fields)
        return stamp.isoformat()
    except Exception:
        return None
 def render_case(idf_path: Path, serial: str, out_pdf: Path, h5_summary: bool = True):
    """Ingest one IDF file into a throwaway store and render its report PDF.

    The file bytes are saved through ``WaveformStore.save_imported_idf``
    (without a report text) inside a temporary directory, a database row
    is faked from the returned event, and the PDF produced by
    ``report_pdf`` is written to *out_pdf*.  Progress is printed along
    the way.

    Parameters:
        idf_path: source file; a ``.IDFH`` suffix (case-insensitive)
            marks the record as a histogram, anything else as a waveform.
        serial: serial placed in the fake row and used as the
            sub-directory when looking for the ``.h5`` file.
        out_pdf: destination path for the rendered PDF bytes.
        h5_summary: when true and the expected ``.h5`` file exists, print
            the length and max absolute value of each channel dataset.
    """
    record_type = "Histogram" if idf_path.suffix.upper() == ".IDFH" else "Waveform"

    with tempfile.TemporaryDirectory() as td:
        root = Path(td)
        store = WaveformStore(root)
        # production worst case: no .txt report text is supplied
        ev, rec = store.save_imported_idf(
            idf_path.read_bytes(),
            idf_path,
            idf_report_text=None,
        )
        print(f"=== {idf_path.name} ===")
        print(f"  h5: {rec['hdf5_filename']}, sidecar: {rec['sidecar_filename']}")

        # Expected location is <tmp>/<serial>/<original name>.h5; if the
        # file is not there the summary is silently skipped.
        h5p = root / serial / f"{idf_path.name}.h5"
        if h5p.exists() and h5_summary:
            with h5py.File(h5p) as h:
                for ch in ("Tran", "Vert", "Long", "MicL"):
                    ds = h.get(f"samples/{ch}")
                    if ds is None:
                        continue
                    n = ds.shape[0]
                    if n:
                        mx = float(abs(ds[...]).max())
                    else:
                        mx = 0
                    print(f"  samples/{ch}: n={n}  max_abs={mx:.5f}")

        info = ev.project_info
        fake_row = {
            "serial":              serial,
            "blastware_filename":  rec["filename"],
            "record_type":         record_type,
            "timestamp":           to_ts_iso(ev.timestamp),
            "sample_rate":         ev.sample_rate,
            "project":             info.project if info else None,
            "client":              info.client if info else None,
            "operator":            info.operator if info else None,
            "sensor_location":     info.sensor_location if info else None,
            "created_at":          None,
        }

        rd = report_pdf.gather_report_data(FakeDb(fake_row), store, event_id="test-1")
        print(f"  ReportData: channels={ {k: len(v) for k,v in rd.channels.items()} }")
        if rd.is_histogram:
            print(f"  histogram n_intervals={rd.histogram_n_intervals} interval_size={rd.histogram_interval_size}")

        pdf = report_pdf.render_event_report_pdf(rd)
        out_pdf.write_bytes(pdf)
        print(f"  PDF: {out_pdf}  ({len(pdf)} bytes)")
 def main():
    """Render the three fixed production-snapshot cases into /tmp.

    One PDF per input is written to ``/tmp/thor_render_test`` and named
    after the source file with ``.pdf`` appended.
    """
    out_dir = Path("/tmp/thor_render_test")
    out_dir.mkdir(exist_ok=True)

    cases = [
        # IDFW that decoded to preamble-only under the old codec
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804154137.IDFW", "UM6047"),
        # IDFW that worked under the old codec (validates no regression)
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804104450.IDFW", "UM6047"),
        # IDFH histogram
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804190047.IDFH", "UM6047"),
    ]

    for path, serial in cases:
        src = Path(path)
        render_case(src, serial, out_dir / f"{src.name}.pdf")
 if __name__ == "__main__":
    main()
@@ -600,10 +600,28 @@ class WaveformStore:
            n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
            ev.total_samples = ev.total_samples or n_samples
-        # 7. Write the .h5 clean-waveform file when we actually have samples.
+        # For IDFH histograms there are no per-sample waveform arrays — the
-        # Histograms (IDFH) don't have waveform samples — skip h5 for those.
+        # device stores one peak ADC count per interval per channel.  Synthesise
        # a 1-sample-per-interval array so the existing h5+renderer pipeline
        # (which groups samples down to ``n_intervals`` bars via max-per-group)
        # produces a non-blank histogram chart.  Each "sample" is the peak ADC
        # count for that interval, so the h5 writer's ``count × geo_fs/32768``
        # conversion yields the right physical value for the bar height.
        if is_histogram and idf_intervals:
            hist_samples = {
                "Tran": [iv.peak_count("Tran") for iv in idf_intervals],
                "Vert": [iv.peak_count("Vert") for iv in idf_intervals],
                "Long": [iv.peak_count("Long") for iv in idf_intervals],
                "MicL": [iv.peak_count("MicL") for iv in idf_intervals],
            }
            ev.raw_samples = hist_samples
            ev.total_samples = ev.total_samples or len(idf_intervals)
        # 7. Write the .h5 clean-waveform file when we have samples to write
        # (either the IDFW per-sample stream, or the IDFH synthesised per-
        # interval peak array).  The renderer treats both shapes the same way.
        hdf5_filename: Optional[str] = None
-        if idf_samples is not None and not is_histogram:
+        if ev.raw_samples:
            hdf5_path = self.hdf5_path_for(serial, filename)
            try:
                event_hdf5.write_event_hdf5(