diff --git a/micromate/idf_file.py b/micromate/idf_file.py index 203a26e..eb8d70d 100644 --- a/micromate/idf_file.py +++ b/micromate/idf_file.py @@ -62,12 +62,23 @@ _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00" _BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00" _INSTANTEL_TAG = b"Instantel" -# Constant body offset for sig-A IDFW files (verified across 151/154 corpus -# files in tests/fixtures/THORDATA_example). The body is the segment-rotated -# block stream consumed by decode_waveform_v2; bytes [0:3] are the magic -# ``00 02 00`` preamble. +# Most common body offset for sig-A IDFW files (~50% of prod events; +# 151/154 in the original tests/fixtures/THORDATA_example corpus). The +# body is the segment-rotated block stream consumed by decode_waveform_v2; +# bytes [0:3] are the magic ``00 02 00`` preamble. Production events +# routinely use other offsets — see :func:`_find_waveform_body_offset` +# for the dynamic scan. NOTE(review): that scan never reads this constant. _BODY_START_SIG_A = 0x0F1F +# Magic bytes that mark a candidate waveform-body preamble. +_BODY_MAGIC = b"\x00\x02\x00" + +# Where to start looking for body candidates inside the file. Skip the +# fixed-header region where the same magic legitimately appears inside +# channel-test records and the compliance block (offsets 0x015d, 0x091c, +# 0x0ae2, 0x0d30 in observed events). +_BODY_SCAN_FLOOR = 0x0E00 + # Geophone count → in/s, derived from sidecar ground truth: the smallest # non-zero sample in 1,014-file corpus is 0.0003 in/s. _GEO_LSB_IPS = 0.0003 @@ -179,17 +190,65 @@ def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata: # ─── Sample decoder + unit conversion ─────────────────────────────────────── +def _find_waveform_body_offset(buf: bytes) -> Optional[int]: + """Pick the file offset of the waveform body by trial-decoding every + ``00 02 00`` magic position past the fixed-header region. 
+ + The body's location isn't fixed across all sig-A IDFW files — about + half the production events use ``0x0f1f``, but the rest have offsets + that shift based on header padding / channel-config layout. We + auto-detect by: + + 1. Find every ``00 02 00`` occurrence past ``_BODY_SCAN_FLOOR``. + 2. Try ``decode_waveform_v2()`` on each candidate. + 3. Pick the offset whose decoded sample count is largest (lowest wins ties). + + Returns the offset, or ``None`` if no candidate decoded to more than + 2 samples summed over all decoded entries (= "no real body found"). + + Costs ~2-8 trial decodes per file; in practice the first candidate + past 0x0e00 is usually the right one. + """ + if len(buf) < _BODY_SCAN_FLOOR + 8: + return None + best: Optional[tuple[int, int]] = None # (total_samples, offset) + i = _BODY_SCAN_FLOOR + while True: + j = buf.find(_BODY_MAGIC, i) + if j < 0: + break + i = j + 1 + try: + decoded = decode_waveform_v2(buf[j:]) + except Exception: + continue + if not decoded: + continue + total = sum(len(v) for v in decoded.values()) + # A "real" body has more than just the 2-sample preamble. + if total <= 2: + continue + if best is None or total > best[0]: + best = (total, j) + return best[1] if best else None + + def _decode_waveform_samples(buf: bytes) -> Optional[dict]: - """Decode samples from the sig-A body starting at file offset 0x0f1f. + """Decode samples from the sig-A waveform body. Returns the raw decoder counts dict — geo LSB = 0.0003 in/s, mic in its own count unit (see :func:`mic_count_to_psi`). Returns None if - decoding fails. + no usable body is found. + + Uses :func:`_find_waveform_body_offset` to locate the body — the + file-offset varies across events (~50% sit at the canonical + ``0x0f1f`` but the rest don't), so the previous hardcoded constant + silently produced 2-sample preamble-only output for half the corpus. 
""" - if len(buf) < _BODY_START_SIG_A + 8: + off = _find_waveform_body_offset(buf) + if off is None: return None - body = buf[_BODY_START_SIG_A:] - return decode_waveform_v2(body) + return decode_waveform_v2(buf[off:]) def geo_count_to_ips(count: int) -> float: diff --git a/scripts/test_thor_render.py b/scripts/test_thor_render.py new file mode 100644 index 0000000..8381cd2 --- /dev/null +++ b/scripts/test_thor_render.py @@ -0,0 +1,91 @@ +"""Re-ingest a prod IDFW + IDFH via the patched save_imported_idf and +render both PDFs to confirm charts have data.""" +from __future__ import annotations +import sys +import json +import datetime +import tempfile +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from sfm.waveform_store import WaveformStore +from sfm import report_pdf +import h5py + + +class FakeDb: + def __init__(self, event): + self.event = event + def get_event(self, _id): + return self.event + + +def to_ts_iso(ts): + if ts is None: + return None + try: + return datetime.datetime(ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second).isoformat() + except Exception: + return None + + +def render_case(idf_path: Path, serial: str, out_pdf: Path, h5_summary: bool = True): + with tempfile.TemporaryDirectory() as td: + store = WaveformStore(Path(td)) + ev, rec = store.save_imported_idf( + idf_path.read_bytes(), + idf_path, + idf_report_text=None, # production worst case: no .txt + ) + print(f"=== {idf_path.name} ===") + print(f" h5: {rec['hdf5_filename']}, sidecar: {rec['sidecar_filename']}") + + h5p = Path(td) / serial / f"{idf_path.name}.h5" + if h5p.exists() and h5_summary: + with h5py.File(h5p) as h: + for ch in ("Tran", "Vert", "Long", "MicL"): + ds = h.get(f"samples/{ch}") + if ds is not None: + n = ds.shape[0] + mx = float(abs(ds[...]).max()) if n else 0 + print(f" samples/{ch}: n={n} max_abs={mx:.5f}") + + record_type = "Histogram" if idf_path.suffix.upper() == ".IDFH" else "Waveform" + fake_row = { + "serial": 
serial, + "blastware_filename": rec["filename"], + "record_type": record_type, + "timestamp": to_ts_iso(ev.timestamp), + "sample_rate": ev.sample_rate, + "project": ev.project_info.project if ev.project_info else None, + "client": ev.project_info.client if ev.project_info else None, + "operator": ev.project_info.operator if ev.project_info else None, + "sensor_location": ev.project_info.sensor_location if ev.project_info else None, + "created_at": None, + } + rd = report_pdf.gather_report_data(FakeDb(fake_row), store, event_id="test-1") + print(f" ReportData: channels={ {k: len(v) for k,v in rd.channels.items()} }") + if rd.is_histogram: + print(f" histogram n_intervals={rd.histogram_n_intervals} interval_size={rd.histogram_interval_size}") + pdf = report_pdf.render_event_report_pdf(rd) + out_pdf.write_bytes(pdf) + print(f" PDF: {out_pdf} ({len(pdf)} bytes)") + + +def main(): + out_dir = Path("/tmp/thor_render_test"); out_dir.mkdir(exist_ok=True) + cases = [ + # IDFW that decoded to preamble-only under the old codec + ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804154137.IDFW", "UM6047"), + # IDFW that worked under the old codec (validates no regression) + ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804104450.IDFW", "UM6047"), + # IDFH histogram + ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804190047.IDFH", "UM6047"), + ] + for path, serial in cases: + render_case(Path(path), serial, out_dir / f"{Path(path).name}.pdf") + + +if __name__ == "__main__": + main() diff --git a/sfm/waveform_store.py b/sfm/waveform_store.py index 3b2ba42..3063cf9 100644 --- a/sfm/waveform_store.py +++ b/sfm/waveform_store.py @@ -600,10 +600,28 @@ class WaveformStore: n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0) ev.total_samples = ev.total_samples or n_samples - # 7. Write the .h5 clean-waveform file when we actually have samples. 
- # Histograms (IDFH) don't have waveform samples — skip h5 for those. + # For IDFH histograms there are no per-sample waveform arrays — the + # device stores one peak ADC count per interval per channel. Synthesise + # a 1-sample-per-interval array so the existing h5+renderer pipeline + # (which groups samples down to ``n_intervals`` bars via max-per-group) + # produces a non-blank histogram chart. Each "sample" is the peak ADC + # count for that interval, so the h5 writer's ``count × geo_fs/32768`` + # conversion yields the right physical value for the bar height. + if is_histogram and idf_intervals: + hist_samples = { + "Tran": [iv.peak_count("Tran") for iv in idf_intervals], + "Vert": [iv.peak_count("Vert") for iv in idf_intervals], + "Long": [iv.peak_count("Long") for iv in idf_intervals], + "MicL": [iv.peak_count("MicL") for iv in idf_intervals], + } + ev.raw_samples = hist_samples + ev.total_samples = ev.total_samples or len(idf_intervals) + + # 7. Write the .h5 clean-waveform file when we have samples to write + # (either the IDFW per-sample stream, or the IDFH synthesised per- + # interval peak array). The renderer treats both shapes the same way. hdf5_filename: Optional[str] = None - if idf_samples is not None and not is_histogram: + if ev.raw_samples: hdf5_path = self.hdf5_path_for(serial, filename) try: event_hdf5.write_event_hdf5(