2026-06-01 16:54:24 -04:00
3 changed files with 180 additions and 12 deletions
@@ -62,12 +62,23 @@ _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
 _BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
 _INSTANTEL_TAG = b"Instantel"

-# Constant body offset for sig-A IDFW files (verified across 151/154 corpus
-# files in tests/fixtures/THORDATA_example).  The body is the segment-rotated
-# block stream consumed by decode_waveform_v2; bytes [0:3] are the magic
-# ``00 02 00`` preamble.
+# Most common body offset for sig-A IDFW files (~50% of prod events;
+# 151/154 in the original tests/fixtures/THORDATA_example corpus).  The
+# body is the segment-rotated block stream consumed by decode_waveform_v2;
+# bytes [0:3] are the magic ``00 02 00`` preamble.  Production events
+# routinely use other offsets — see :func:`_find_waveform_body_offset`
+# for the dynamic scan.  The scan itself never reads this constant.
 _BODY_START_SIG_A = 0x0F1F

+# Magic bytes that mark a candidate waveform-body preamble.
+_BODY_MAGIC = b"\x00\x02\x00"
+
+# Where to start looking for body candidates inside the file.  Skip the
+# fixed-header region where the same magic legitimately appears inside
+# channel-test records and the compliance block (offsets 0x015d, 0x091c,
+# 0x0ae2, 0x0d30 in observed events).
+_BODY_SCAN_FLOOR = 0x0E00
+
 # Geophone count → in/s, derived from sidecar ground truth: the smallest
 # non-zero sample in 1,014-file corpus is 0.0003 in/s.
 _GEO_LSB_IPS = 0.0003
@@ -179,17 +190,65 @@ def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
 # ─── Sample decoder + unit conversion ───────────────────────────────────────


+def _find_waveform_body_offset(buf: bytes) -> Optional[int]:
+    """Pick the file offset of the waveform body by trial-decoding every
+    ``00 02 00`` magic position past the fixed-header region.
+
+    The body's location isn't fixed across all sig-A IDFW files — about
+    half the production events use ``0x0f1f``, but the rest have offsets
+    that shift based on header padding / channel-config layout.  We
+    auto-detect by:
+
+      1. Find every ``00 02 00`` occurrence past ``_BODY_SCAN_FLOOR``.
+      2. Try ``decode_waveform_v2()`` on each candidate.
+      3. Keep the largest decoded sample count (earliest offset wins ties).
+
+    Returns the offset, or ``None`` if no candidate yielded more than
+    the trivial 2-sample preamble (= "no real body found").
+
+    Costs ~2-8 trial decodes per file.  Every candidate is decoded (no early
+    exit), even though the first one past 0x0e00 is usually the right one.
+    """
+    if len(buf) < _BODY_SCAN_FLOOR + 8:
+        return None
+    best: Optional[tuple[int, int]] = None   # (total_samples, offset)
+    i = _BODY_SCAN_FLOOR
+    while True:
+        j = buf.find(_BODY_MAGIC, i)
+        if j < 0:
+            break
+        i = j + 1
+        try:
+            decoded = decode_waveform_v2(buf[j:])
+        except Exception:
+            continue
+        if not decoded:
+            continue
+        total = sum(len(v) for v in decoded.values())
+        # A "real" body has more than just the 2-sample preamble.
+        if total <= 2:
+            continue
+        if best is None or total > best[0]:
+            best = (total, j)
+    return best[1] if best else None
+
+
 def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
-    """Decode samples from the sig-A body starting at file offset 0x0f1f.
+    """Decode samples from the sig-A waveform body.

    Returns the raw decoder counts dict — geo LSB = 0.0003 in/s, mic in
    its own count unit (see :func:`mic_count_to_psi`).  Returns None if
-    decoding fails.
+    no usable body is found.
+
+    Uses :func:`_find_waveform_body_offset` to locate the body — the
+    file-offset varies across events (~50% sit at the canonical
+    ``0x0f1f`` but the rest don't), so the previous hardcoded constant
+    silently produced 2-sample preamble-only output for half the corpus.
    """
-    if len(buf) < _BODY_START_SIG_A + 8:
+    off = _find_waveform_body_offset(buf)
+    if off is None:
        return None
-    body = buf[_BODY_START_SIG_A:]
-    return decode_waveform_v2(body)
+    return decode_waveform_v2(buf[off:])


 def geo_count_to_ips(count: int) -> float:
@@ -0,0 +1,91 @@
+"""Re-ingest a prod IDFW + IDFH via the patched save_imported_idf and
+render one PDF per input to confirm charts have data."""
+from __future__ import annotations
+import sys
+import json
+import datetime
+import tempfile
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
+
+from sfm.waveform_store import WaveformStore
+from sfm import report_pdf
+import h5py
+
+
+class FakeDb:
+    def __init__(self, event):
+        self.event = event
+    def get_event(self, _id):
+        return self.event
+
+
+def to_ts_iso(ts):
+    if ts is None:
+        return None
+    try:
+        return datetime.datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second).isoformat()
+    except Exception:
+        return None
+
+
+def render_case(idf_path: Path, serial: str, out_pdf: Path, h5_summary: bool = True):
+    with tempfile.TemporaryDirectory() as td:
+        store = WaveformStore(Path(td))
+        ev, rec = store.save_imported_idf(
+            idf_path.read_bytes(),
+            idf_path,
+            idf_report_text=None,    # production worst case: no .txt
+        )
+        print(f"=== {idf_path.name} ===")
+        print(f"  h5: {rec['hdf5_filename']}, sidecar: {rec['sidecar_filename']}")
+
+        h5p = Path(td) / serial / f"{idf_path.name}.h5"
+        if h5p.exists() and h5_summary:
+            with h5py.File(h5p) as h:
+                for ch in ("Tran", "Vert", "Long", "MicL"):
+                    ds = h.get(f"samples/{ch}")
+                    if ds is not None:
+                        n = ds.shape[0]
+                        mx = float(abs(ds[...]).max()) if n else 0
+                        print(f"  samples/{ch}: n={n}  max_abs={mx:.5f}")
+
+        record_type = "Histogram" if idf_path.suffix.upper() == ".IDFH" else "Waveform"
+        fake_row = {
+            "serial":              serial,
+            "blastware_filename":  rec["filename"],
+            "record_type":         record_type,
+            "timestamp":           to_ts_iso(ev.timestamp),
+            "sample_rate":         ev.sample_rate,
+            "project":             ev.project_info.project if ev.project_info else None,
+            "client":              ev.project_info.client if ev.project_info else None,
+            "operator":            ev.project_info.operator if ev.project_info else None,
+            "sensor_location":     ev.project_info.sensor_location if ev.project_info else None,
+            "created_at":          None,
+        }
+        rd = report_pdf.gather_report_data(FakeDb(fake_row), store, event_id="test-1")
+        print(f"  ReportData: channels={ {k: len(v) for k,v in rd.channels.items()} }")
+        if rd.is_histogram:
+            print(f"  histogram n_intervals={rd.histogram_n_intervals} interval_size={rd.histogram_interval_size}")
+        pdf = report_pdf.render_event_report_pdf(rd)
+        out_pdf.write_bytes(pdf)
+        print(f"  PDF: {out_pdf}  ({len(pdf)} bytes)")
+
+
+def main():
+    out_dir = Path("/tmp/thor_render_test"); out_dir.mkdir(exist_ok=True)
+    cases = [
+        # IDFW that decoded to preamble-only under the old codec
+        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804154137.IDFW", "UM6047"),
+        # IDFW that worked under the old codec (validates no regression)
+        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804104450.IDFW", "UM6047"),
+        # IDFH histogram
+        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804190047.IDFH", "UM6047"),
+    ]
+    for path, serial in cases:
+        render_case(Path(path), serial, out_dir / f"{Path(path).name}.pdf")
+
+
+if __name__ == "__main__":
+    main()
@@ -600,10 +600,28 @@ class WaveformStore:
            n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
            ev.total_samples = ev.total_samples or n_samples

-        # 7. Write the .h5 clean-waveform file when we actually have samples.
-        # Histograms (IDFH) don't have waveform samples — skip h5 for those.
+        # For IDFH histograms there are no per-sample waveform arrays — the
+        # device stores one peak ADC count per interval per channel.  Synthesise
+        # a 1-sample-per-interval array so the existing h5+renderer pipeline
+        # (which groups samples down to ``n_intervals`` bars via max-per-group)
+        # produces a non-blank histogram chart.  Each "sample" is the peak ADC
+        # count for that interval, so the h5 writer's ``count × geo_fs/32768``
+        # conversion yields the right physical value for the bar height.
+        if is_histogram and idf_intervals:
+            hist_samples = {
+                "Tran": [iv.peak_count("Tran") for iv in idf_intervals],
+                "Vert": [iv.peak_count("Vert") for iv in idf_intervals],
+                "Long": [iv.peak_count("Long") for iv in idf_intervals],
+                "MicL": [iv.peak_count("MicL") for iv in idf_intervals],
+            }
+            ev.raw_samples = hist_samples
+            ev.total_samples = ev.total_samples or len(idf_intervals)
+
+        # 7. Write the .h5 clean-waveform file when we have samples to write
+        # (either the IDFW per-sample stream, or the IDFH synthesised per-
+        # interval peak array).  The renderer treats both shapes the same way.
        hdf5_filename: Optional[str] = None
-        if idf_samples is not None and not is_histogram:
+        if ev.raw_samples:
            hdf5_path = self.hdf5_path_for(serial, filename)
            try:
                event_hdf5.write_event_hdf5(