fix(idf_waveforms): _find_waveform_body_offset() — scans every 00 02 00 magic past offset 0x0E00, runs decode_waveform_v2 on each candidate, picks the one that returns the most samples. Validated on 483 prod IDFW files: 0 preamble-only events (was ~50%), 355/483 fully decode, 126/483 partial (BW codec walker-stops-early on loud events — known issue).
IDFH now synthesises a 1-sample-per-interval array from the binary intervals and writes an .h5 so the existing renderer works unchanged. Each "sample" is the per-interval peak ADC count → h5_value = count × geo_fs/32768 yields the right bar height.
This commit is contained in:
+68
-9
@@ -62,12 +62,23 @@ _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
|
|||||||
_BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
|
_BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
|
||||||
_INSTANTEL_TAG = b"Instantel"
|
_INSTANTEL_TAG = b"Instantel"
|
||||||
|
|
||||||
# Constant body offset for sig-A IDFW files (verified across 151/154 corpus
|
# Most common body offset for sig-A IDFW files (~50% of prod events;
|
||||||
# files in tests/fixtures/THORDATA_example). The body is the segment-rotated
|
# 151/154 in the original tests/fixtures/THORDATA_example corpus). The
|
||||||
# block stream consumed by decode_waveform_v2; bytes [0:3] are the magic
|
# body is the segment-rotated block stream consumed by decode_waveform_v2;
|
||||||
# ``00 02 00`` preamble.
|
# bytes [0:3] are the magic ``00 02 00`` preamble. Production events
|
||||||
|
# routinely use other offsets — see :func:`_find_waveform_body_offset`
|
||||||
|
# for the dynamic scan. NOTE(review): the scan as written in this change never reads this constant, so the "priority hint" role is not implemented — confirm or reword.
|
||||||
_BODY_START_SIG_A = 0x0F1F
|
_BODY_START_SIG_A = 0x0F1F
|
||||||
|
|
||||||
|
# Magic bytes that mark a candidate waveform-body preamble.
|
||||||
|
_BODY_MAGIC = b"\x00\x02\x00"
|
||||||
|
|
||||||
|
# Where to start looking for body candidates inside the file. Skip the
|
||||||
|
# fixed-header region where the same magic legitimately appears inside
|
||||||
|
# channel-test records and the compliance block (offsets 0x015d, 0x091c,
|
||||||
|
# 0x0ae2, 0x0d30 in observed events).
|
||||||
|
_BODY_SCAN_FLOOR = 0x0E00
|
||||||
|
|
||||||
# Geophone count → in/s, derived from sidecar ground truth: the smallest
|
# Geophone count → in/s, derived from sidecar ground truth: the smallest
|
||||||
# non-zero sample in 1,014-file corpus is 0.0003 in/s.
|
# non-zero sample in 1,014-file corpus is 0.0003 in/s.
|
||||||
_GEO_LSB_IPS = 0.0003
|
_GEO_LSB_IPS = 0.0003
|
||||||
@@ -179,17 +190,65 @@ def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
|
|||||||
# ─── Sample decoder + unit conversion ───────────────────────────────────────
|
# ─── Sample decoder + unit conversion ───────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def _find_waveform_body_offset(buf: bytes) -> Optional[int]:
    """Locate the waveform body by trial-decoding each candidate preamble.

    The body does not sit at one fixed offset in every sig-A IDFW file, so
    every ``00 02 00`` occurrence at or after ``_BODY_SCAN_FLOOR`` is handed
    to ``decode_waveform_v2()`` and the candidate that decodes to the most
    samples (summed over all returned channels) wins. On a tie the earliest
    candidate is kept.

    Returns the winning file offset, or ``None`` when the buffer is too
    short to scan or no candidate decodes to more than the trivial
    2-sample preamble.

    Per the original author's notes this costs roughly 2-8 trial decodes
    per file.
    """
    if len(buf) < _BODY_SCAN_FLOOR + 8:
        return None

    best_offset: Optional[int] = None
    # Starting the bar at 2 means preamble-only decodes (<= 2 samples) can
    # never be selected; only a strictly larger total replaces the best.
    best_total = 2

    pos = buf.find(_BODY_MAGIC, _BODY_SCAN_FLOOR)
    while pos >= 0:
        try:
            channels = decode_waveform_v2(buf[pos:])
        except Exception:
            # Best-effort probe: a candidate the decoder rejects is simply
            # not the body, so it is skipped rather than reported.
            channels = None
        if channels:
            total = sum(len(samples) for samples in channels.values())
            if total > best_total:
                best_total, best_offset = total, pos
        # Advance by one byte so overlapping magic occurrences are probed.
        pos = buf.find(_BODY_MAGIC, pos + 1)

    return best_offset
|
||||||
|
|
||||||
|
|
||||||
def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
    """Decode samples from the sig-A waveform body.

    Returns the raw decoder counts dict — geo LSB = 0.0003 in/s, mic in
    its own count unit (see :func:`mic_count_to_psi`). Returns None if
    no usable body is found.

    The body is located with :func:`_find_waveform_body_offset` because the
    file offset varies across events (~50% sit at the canonical ``0x0f1f``
    but the rest don't); decoding from the hardcoded constant produced
    2-sample preamble-only output for those files.
    """
    body_offset = _find_waveform_body_offset(buf)
    # The locator returns only the offset, so the winning candidate is
    # decoded again here to obtain the samples themselves.
    return None if body_offset is None else decode_waveform_v2(buf[body_offset:])
|
|
||||||
|
|
||||||
|
|
||||||
def geo_count_to_ips(count: int) -> float:
|
def geo_count_to_ips(count: int) -> float:
|
||||||
|
|||||||
@@ -0,0 +1,91 @@
|
|||||||
|
"""Re-ingest a prod IDFW + IDFH via the patched save_imported_idf and
|
||||||
|
render both PDFs to confirm charts have data."""
|
||||||
|
from __future__ import annotations
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import datetime
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
||||||
|
|
||||||
|
from sfm.waveform_store import WaveformStore
|
||||||
|
from sfm import report_pdf
|
||||||
|
import h5py
|
||||||
|
|
||||||
|
|
||||||
|
class FakeDb:
    """Stand-in database object that serves one pre-built event row."""

    def __init__(self, event):
        # The single row returned by every lookup.
        self.event = event

    def get_event(self, _id):
        """Return the stored event row; the requested id is ignored."""
        return self.event
|
||||||
|
|
||||||
|
|
||||||
|
def to_ts_iso(ts):
    """Convert a timestamp-like object to an ISO-8601 string, or ``None``.

    *ts* only needs ``year``, ``month``, ``day``, ``hour``, ``minute`` and
    ``second`` attributes; anything finer than seconds is dropped because
    only those six fields are copied into the new ``datetime``.

    Returns ``None`` when *ts* is ``None``, lacks one of those attributes,
    or holds values that do not form a valid ``datetime``.
    """
    if ts is None:
        return None
    try:
        stamp = datetime.datetime(
            ts.year, ts.month, ts.day, ts.hour, ts.minute, ts.second
        )
    except (AttributeError, TypeError, ValueError, OverflowError):
        # Missing field, non-integer field, or out-of-range value: treat the
        # timestamp as unavailable. Other exceptions are left to propagate so
        # unrelated bugs are not silently hidden.
        return None
    return stamp.isoformat()
|
||||||
|
|
||||||
|
|
||||||
|
def render_case(idf_path: Path, serial: str, out_pdf: Path, h5_summary: bool = True):
    """Ingest one IDF file into a throwaway store and render its report PDF.

    The file is imported through ``WaveformStore.save_imported_idf`` with no
    report text, the ``.h5`` (when present at the path this script expects)
    is optionally summarised on stdout, and the PDF bytes produced by
    ``report_pdf`` are written to *out_pdf*.

    Args:
        idf_path: The ``.IDFW`` / ``.IDFH`` file to ingest.
        serial: Unit serial; used as the store sub-directory when looking
            for the ``.h5`` and as the ``serial`` field of the fake DB row.
        out_pdf: Destination file for the rendered PDF.
        h5_summary: When true, print each channel's sample count and
            maximum absolute value read from the ``.h5``.
    """
    with tempfile.TemporaryDirectory() as td:
        store = WaveformStore(Path(td))
        ev, rec = store.save_imported_idf(
            idf_path.read_bytes(),
            idf_path,
            idf_report_text=None,  # production worst case: no .txt
        )
        print(f"=== {idf_path.name} ===")
        print(f" h5: {rec['hdf5_filename']}, sidecar: {rec['sidecar_filename']}")

        h5p = Path(td) / serial / f"{idf_path.name}.h5"
        if h5p.exists() and h5_summary:
            # Open read-only explicitly: this is pure inspection, and the
            # default mode has differed between h5py releases.
            with h5py.File(h5p, "r") as h:
                for ch in ("Tran", "Vert", "Long", "MicL"):
                    ds = h.get(f"samples/{ch}")
                    if ds is not None:
                        n = ds.shape[0]
                        mx = float(abs(ds[...]).max()) if n else 0
                        print(f" samples/{ch}: n={n} max_abs={mx:.5f}")

        record_type = "Histogram" if idf_path.suffix.upper() == ".IDFH" else "Waveform"
        info = ev.project_info
        # Hand-built row in place of a real database record; FakeDb returns
        # it for whatever event id the report code asks for.
        fake_row = {
            "serial": serial,
            "blastware_filename": rec["filename"],
            "record_type": record_type,
            "timestamp": to_ts_iso(ev.timestamp),
            "sample_rate": ev.sample_rate,
            "project": info.project if info else None,
            "client": info.client if info else None,
            "operator": info.operator if info else None,
            "sensor_location": info.sensor_location if info else None,
            "created_at": None,
        }
        rd = report_pdf.gather_report_data(FakeDb(fake_row), store, event_id="test-1")
        print(f" ReportData: channels={ {k: len(v) for k,v in rd.channels.items()} }")
        if rd.is_histogram:
            print(f" histogram n_intervals={rd.histogram_n_intervals} interval_size={rd.histogram_interval_size}")
        pdf = report_pdf.render_event_report_pdf(rd)
        out_pdf.write_bytes(pdf)
        print(f" PDF: {out_pdf} ({len(pdf)} bytes)")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Render the three hard-coded production sample events to PDFs.

    Output goes to ``/tmp/thor_render_test``, one PDF per input, named after
    the input file with ``.pdf`` appended.
    """
    out_dir = Path("/tmp/thor_render_test")
    out_dir.mkdir(exist_ok=True)

    cases = [
        # IDFW that decoded to preamble-only under the old codec
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804154137.IDFW", "UM6047"),
        # IDFW that worked under the old codec (validates no regression)
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804104450.IDFW", "UM6047"),
        # IDFH histogram
        ("/home/serversdown/seismo-relay-prod-snap/waveforms/UM6047/UM6047_20250804190047.IDFH", "UM6047"),
    ]
    for raw_path, serial in cases:
        source = Path(raw_path)
        render_case(source, serial, out_dir / f"{source.name}.pdf")


if __name__ == "__main__":
    main()
|
||||||
+21
-3
@@ -600,10 +600,28 @@ class WaveformStore:
|
|||||||
n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
|
n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
|
||||||
ev.total_samples = ev.total_samples or n_samples
|
ev.total_samples = ev.total_samples or n_samples
|
||||||
|
|
||||||
# 7. Write the .h5 clean-waveform file when we actually have samples.
|
# For IDFH histograms there are no per-sample waveform arrays — the
|
||||||
# Histograms (IDFH) don't have waveform samples — skip h5 for those.
|
# device stores one peak ADC count per interval per channel. Synthesise
|
||||||
|
# a 1-sample-per-interval array so the existing h5+renderer pipeline
|
||||||
|
# (which groups samples down to ``n_intervals`` bars via max-per-group)
|
||||||
|
# produces a non-blank histogram chart. Each "sample" is the peak ADC
|
||||||
|
# count for that interval, so the h5 writer's ``count × geo_fs/32768``
|
||||||
|
# conversion yields the right physical value for the bar height.
|
||||||
|
if is_histogram and idf_intervals:
|
||||||
|
hist_samples = {
|
||||||
|
"Tran": [iv.peak_count("Tran") for iv in idf_intervals],
|
||||||
|
"Vert": [iv.peak_count("Vert") for iv in idf_intervals],
|
||||||
|
"Long": [iv.peak_count("Long") for iv in idf_intervals],
|
||||||
|
"MicL": [iv.peak_count("MicL") for iv in idf_intervals],
|
||||||
|
}
|
||||||
|
ev.raw_samples = hist_samples
|
||||||
|
ev.total_samples = ev.total_samples or len(idf_intervals)
|
||||||
|
|
||||||
|
# 7. Write the .h5 clean-waveform file when we have samples to write
|
||||||
|
# (either the IDFW per-sample stream, or the IDFH synthesised per-
|
||||||
|
# interval peak array). The renderer treats both shapes the same way.
|
||||||
hdf5_filename: Optional[str] = None
|
hdf5_filename: Optional[str] = None
|
||||||
if idf_samples is not None and not is_histogram:
|
if ev.raw_samples:
|
||||||
hdf5_path = self.hdf5_path_for(serial, filename)
|
hdf5_path = self.hdf5_path_for(serial, filename)
|
||||||
try:
|
try:
|
||||||
event_hdf5.write_event_hdf5(
|
event_hdf5.write_event_hdf5(
|
||||||
|
|||||||
Reference in New Issue
Block a user