fix(idf_waveforms): _find_waveform_body_offset() — scans every 00 02 00 magic past offset 0x0E00, runs decode_waveform_v2 on each candidate, picks the one that returns the most samples. Validated on 483 prod IDFW files: 0 preamble-only events (was ~50%), 355/483 fully decode, 126/483 partial (BW codec walker-stops-early on loud events — known issue).
IDFH now synthesises a 1-sample-per-interval array from the binary intervals and writes an .h5 so the existing renderer works unchanged. Each "sample" is the per-interval peak ADC count → h5_value = count × geo_fs/32768 yields the right bar height.
This commit is contained in:
+68
-9
@@ -62,12 +62,23 @@ _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
|
||||
_BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
|
||||
_INSTANTEL_TAG = b"Instantel"
|
||||
|
||||
# Constant body offset for sig-A IDFW files (verified across 151/154 corpus
|
||||
# files in tests/fixtures/THORDATA_example). The body is the segment-rotated
|
||||
# block stream consumed by decode_waveform_v2; bytes [0:3] are the magic
|
||||
# ``00 02 00`` preamble.
|
||||
# Most common body offset for sig-A IDFW files (~50% of prod events;
|
||||
# 151/154 in the original tests/fixtures/THORDATA_example corpus). The
|
||||
# body is the segment-rotated block stream consumed by decode_waveform_v2;
|
||||
# bytes [0:3] are the magic ``00 02 00`` preamble. Production events
|
||||
# routinely use other offsets — see :func:`_find_waveform_body_offset`
|
||||
# for the dynamic scan. This constant survives only as the priority hint.
|
||||
_BODY_START_SIG_A = 0x0F1F
|
||||
|
||||
# Magic bytes that mark a candidate waveform-body preamble.
|
||||
_BODY_MAGIC = b"\x00\x02\x00"
|
||||
|
||||
# Where to start looking for body candidates inside the file. Skip the
|
||||
# fixed-header region where the same magic legitimately appears inside
|
||||
# channel-test records and the compliance block (offsets 0x015d, 0x091c,
|
||||
# 0x0ae2, 0x0d30 in observed events).
|
||||
_BODY_SCAN_FLOOR = 0x0E00
|
||||
|
||||
# Geophone count → in/s, derived from sidecar ground truth: the smallest
|
||||
# non-zero sample in 1,014-file corpus is 0.0003 in/s.
|
||||
_GEO_LSB_IPS = 0.0003
|
||||
@@ -179,17 +190,65 @@ def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
|
||||
# ─── Sample decoder + unit conversion ───────────────────────────────────────
|
||||
|
||||
|
||||
def _find_waveform_body_offset(buf: bytes) -> Optional[int]:
|
||||
"""Pick the file offset of the waveform body by trial-decoding every
|
||||
``00 02 00`` magic position past the fixed-header region.
|
||||
|
||||
The body's location isn't fixed across all sig-A IDFW files — about
|
||||
half the production events use ``0x0f1f``, but the rest have offsets
|
||||
that shift based on header padding / channel-config layout. We
|
||||
auto-detect by:
|
||||
|
||||
1. Find every ``00 02 00`` occurrence past ``_BODY_SCAN_FLOOR``.
|
||||
2. Try ``decode_waveform_v2()`` on each candidate.
|
||||
3. Pick the offset whose decoded sample count is largest.
|
||||
|
||||
Returns the offset, or ``None`` if no candidate yielded more than
|
||||
the trivial 2-sample preamble (= "no real body found").
|
||||
|
||||
Costs ~2-8 trial decodes per file; in practice the first candidate
|
||||
past 0x0e00 is usually the right one.
|
||||
"""
|
||||
if len(buf) < _BODY_SCAN_FLOOR + 8:
|
||||
return None
|
||||
best: Optional[tuple[int, int]] = None # (total_samples, offset)
|
||||
i = _BODY_SCAN_FLOOR
|
||||
while True:
|
||||
j = buf.find(_BODY_MAGIC, i)
|
||||
if j < 0:
|
||||
break
|
||||
i = j + 1
|
||||
try:
|
||||
decoded = decode_waveform_v2(buf[j:])
|
||||
except Exception:
|
||||
continue
|
||||
if not decoded:
|
||||
continue
|
||||
total = sum(len(v) for v in decoded.values())
|
||||
# A "real" body has more than just the 2-sample preamble.
|
||||
if total <= 2:
|
||||
continue
|
||||
if best is None or total > best[0]:
|
||||
best = (total, j)
|
||||
return best[1] if best else None
|
||||
|
||||
|
||||
def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
|
||||
"""Decode samples from the sig-A body starting at file offset 0x0f1f.
|
||||
"""Decode samples from the sig-A waveform body.
|
||||
|
||||
Returns the raw decoder counts dict — geo LSB = 0.0003 in/s, mic in
|
||||
its own count unit (see :func:`mic_count_to_psi`). Returns None if
|
||||
decoding fails.
|
||||
no usable body is found.
|
||||
|
||||
Uses :func:`_find_waveform_body_offset` to locate the body — the
|
||||
file-offset varies across events (~50% sit at the canonical
|
||||
``0x0f1f`` but the rest don't), so the previous hardcoded constant
|
||||
silently produced 2-sample preamble-only output for half the corpus.
|
||||
"""
|
||||
if len(buf) < _BODY_START_SIG_A + 8:
|
||||
off = _find_waveform_body_offset(buf)
|
||||
if off is None:
|
||||
return None
|
||||
body = buf[_BODY_START_SIG_A:]
|
||||
return decode_waveform_v2(body)
|
||||
return decode_waveform_v2(buf[off:])
|
||||
|
||||
|
||||
def geo_count_to_ips(count: int) -> float:
|
||||
|
||||
Reference in New Issue
Block a user