Files
seismo-relay/micromate/idf_file.py
T
serversdown b2c565f217 fix(idf_waveforms): _find_waveform_body_offset() — scans every 00 02 00 magic past offset 0x0E00, runs decode_waveform_v2 on each candidate, picks the one that returns the most samples. Validated on 483 prod IDFW files: 0 preamble-only events (was ~50%), 355/483 fully decode, 126/483 partial (BW codec walker-stops-early on loud events — known issue).
IDFH now synthesises a 1-sample-per-interval array from the binary intervals and writes an .h5 so the existing renderer works unchanged. Each "sample" is the per-interval peak ADC count → h5_value = count × geo_fs/32768 yields the right bar height.
2026-05-31 20:51:09 +00:00

519 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
micromate/idf_file.py — Thor IDF binary codec.
Decodes the Instantel Micromate Series IV ``.IDFW`` (waveform) and
``.IDFH`` (histogram) binary on-disk format. Sister module to
``minimateplus/event_file_io.py``.
Status (2026-05-28):
- **Genuine Series IV / Thor binaries** are all signed
``00 12 01 00 00 00 Instantel\\0`` (sig-A in earlier notes). Two
Series III (Blastware) binaries appear in the example corpus
(``BE9439_*``) — they share the ``.IDFW``/``.IDFH`` extension by
filing convention but carry a BW STRT header (``10 00 01 80 00 00
Instantel STRT...``) and are NOT Thor data. The reader detects
them by signature and raises NotImplementedError pointing callers
at ``minimateplus.event_file_io.read_blastware_file()``.
- **IDFW waveform body** reuses the BW segment-rotated block codec
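verbatim. Body usually starts at file offset ``0x0f1f`` (~50% of
production events); other offsets are located by the dynamic scan in
``_find_waveform_body_offset``. Samples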
decoded via ``minimateplus.waveform_codec.decode_waveform_v2``
with 87–99% byte-exact match against ``.IDFW.txt`` sidecar (quiet
events). Loud events hit the BW codec's known walker-stops-early
limit. Residual ~3% drift on per-sample deltas — likely a
Thor-specific 12-bit delta refinement that BW's codec doesn't
model. Geo LSB = 0.0003 in/s; mic factor ~2.14e-6 psi/count.
- **IDFH histogram body**: 10-byte segment header
``[len_be 2B] 0a 00 00 00 [00 NN_counter] 05 3f`` introduces a
segment of ``N`` 72-byte interval records (``N = (len - 10) // 72``).
Each record holds 4 × 16-byte per-channel min/max/halfp + 8-byte
tail. Geo peaks via ``max(|min|, |max|) / 32768 × 10`` in/s
(matches sidecar within ~1.8%), freq via ``512 / halfp`` Hz.
**All 859 Thor IDFH files in the corpus decode (181,071 intervals).**
- Binary metadata directly extracted: serial, timestamp, sample_rate,
record_time, calibration_date. Other fields fall back to the paired
``.IDFW.txt`` / ``.IDFH.txt`` sidecar (consumed by
``WaveformStore.save_imported_idf``).
The full reverse-engineering writeup lives in
``docs/idf_protocol_reference.md``.
"""
from __future__ import annotations
import datetime
import struct
from dataclasses import dataclass
from pathlib import Path
from typing import Optional, Union
from minimateplus.waveform_codec import decode_waveform_v2
from .models import IdfEvent, IdfPeaks, IdfReport
# Genuine Series IV / Thor IDF binary signature: 6 bytes, then ASCII "Instantel".
_THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
# Stray Series III (Blastware) binaries that occasionally turn up in Thor
# corpus directories renamed to the .IDFW/.IDFH convention. Their header
# (`10 00 01 80 00 00 Instantel STRT ...`) is byte-for-byte a BW SUB 5A
# STRT record, not a Thor binary. Detected so we can refuse-and-route
# rather than mis-parse.
_BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
# Vendor tag that follows the 6-byte signature prefix. read_idf_file()
# requires bytes [6:16] to equal this tag plus a terminating NUL.
_INSTANTEL_TAG = b"Instantel"
# Most common body offset for sig-A IDFW files (~50% of prod events;
# 151/154 in the original tests/fixtures/THORDATA_example corpus). The
# body is the segment-rotated block stream consumed by decode_waveform_v2;
# bytes [0:3] are the magic ``00 02 00`` preamble. Production events
# routinely use other offsets — see :func:`_find_waveform_body_offset`
# for the dynamic scan. This constant survives only as the priority hint.
# NOTE(review): nothing in the code visible in this module reads this
# constant — the scan does not consult it as a hint. Confirm whether it is
# used elsewhere before relying on the "priority hint" wording.
_BODY_START_SIG_A = 0x0F1F
# Magic bytes that mark a candidate waveform-body preamble.
_BODY_MAGIC = b"\x00\x02\x00"
# Where to start looking for body candidates inside the file. Skip the
# fixed-header region where the same magic legitimately appears inside
# channel-test records and the compliance block (offsets 0x015d, 0x091c,
# 0x0ae2, 0x0d30 in observed events).
_BODY_SCAN_FLOOR = 0x0E00
# Geophone count → in/s, derived from sidecar ground truth: the smallest
# non-zero sample in 1,014-file corpus is 0.0003 in/s.
_GEO_LSB_IPS = 0.0003
# Microphone count → psi, derived from sidecar regression on 50 sample
# pairs from UM11719_20231219162723.IDFW (mic-heavy event).
_MIC_LSB_PSI = 2.14e-6
# IDFH histogram constants.
_IDFH_INTERVAL_SIZE = 72 # bytes per per-interval record
# Header size is counted from the 2-byte length field, i.e. it starts two
# bytes before the ``0a 00 00 00`` magic that decode_idfh_body() searches for.
_IDFH_SEGMENT_HEADER = 10 # bytes: [len_be 2B][0a 00 00 00 4B][00 NN 2B][05 3f 2B]
_IDFH_SEGMENT_TAIL = 2 # bytes after the interval data block, before next marker
_IDFH_HALFP_FREQ_NUM = 512.0 # freq_hz = NUM / halfp; halfp ≤ 5 means ">100 Hz" sentinel
_IDFH_GEO_FULL_SCALE = 10.0 # in/s — Normal range
# Divisor applied to a peak count before scaling by the geo full-scale range.
_IDFH_INT16_FS = 32768.0
# Channel names in the order their 16-byte blocks are decoded from each
# interval record. NOTE(review): not referenced by the code visible here.
_IDFH_CHANNELS = ("Tran", "Vert", "Long", "MicL")
# ─── Binary metadata extraction ─────────────────────────────────────────────
@dataclass
class IdfBinaryMetadata:
    """Header fields read directly from a sig-A binary, without the .txt sidecar.

    Every field defaults to ``None``, which means "not recovered from the
    binary".
    """

    # Instrument serial number as ASCII text.
    serial: Optional[str] = None
    # Event timestamp decoded from the 8-byte header field.
    event_datetime: Optional[datetime.datetime] = None
    # Sample rate as stored in the compliance block.
    sample_rate: Optional[int] = None
    # Record time in seconds as stored in the compliance block.
    record_time_sec: Optional[float] = None
    # Calibration date decoded from the header.
    calibration_date: Optional[datetime.date] = None
def _read_ascii_z(buf: bytes, off: int, maxlen: int = 64) -> Optional[str]:
if off >= len(buf):
return None
end = buf.find(b"\x00", off, off + maxlen)
if end < 0:
end = min(off + maxlen, len(buf))
s = buf[off:end].decode("ascii", errors="replace").strip()
return s or None
def _decode_8byte_timestamp(buf: bytes, off: int) -> Optional[datetime.datetime]:
"""Layout: ``[day][month][year_hi][year_lo][unknown][hour][min][sec]``."""
if off + 8 > len(buf):
return None
day, mon, yh, yl, _unk, hr, mn, sc = buf[off : off + 8]
year = (yh << 8) | yl
if not (2015 <= year <= 2050 and 1 <= mon <= 12 and 1 <= day <= 31
and 0 <= hr < 24 and 0 <= mn < 60 and 0 <= sc < 60):
return None
try:
return datetime.datetime(year, mon, day, hr, mn, sc)
except ValueError:
return None
def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
    """Read serial, timestamp, sample rate, record time and calibration
    date out of the sig-A binary header.

    Each field is validated independently; any field that is missing or
    implausible is left as ``None`` on the returned object.

    Field positions confirmed against UM11719_20231219162723.IDFW; stable
    across the 151-file sig-A corpus.
    """
    # Serial number: NUL-terminated ASCII at 0x14E, at most 16 bytes.
    md = IdfBinaryMetadata(serial=_read_ascii_z(buf, 0x14E, maxlen=16))

    # Sample rate and record time sit in a BW-compatible compliance block.
    # They are addressed relative to the 6-byte anchor `be 80 00 00 00 00`:
    # anchor-6 holds the sample rate (uint16 BE), anchor+6 the record time
    # (float32 BE).
    anchor = buf.find(b"\xbe\x80\x00\x00\x00\x00", 0x800, 0xA00)
    if anchor > 0:
        rate_field = buf[anchor - 6 : anchor - 4]
        if len(rate_field) == 2:
            rate = int.from_bytes(rate_field, "big")
            if rate in (256, 512, 1024, 2048, 4096):
                md.sample_rate = rate
        time_field = buf[anchor + 6 : anchor + 10]
        if len(time_field) == 4:
            try:
                (seconds,) = struct.unpack(">f", time_field)
            except struct.error:
                pass
            else:
                if 0.1 <= seconds <= 600.0:
                    md.record_time_sec = float(seconds)

    # Event timestamp: 8 bytes at 0x97A (IDFW) or 0x9F8 (IDFH). The first
    # position that decodes to a valid timestamp wins.
    for candidate in (0x97A, 0x9F8):
        stamp = _decode_8byte_timestamp(buf, candidate)
        if stamp is None:
            continue
        md.event_datetime = stamp
        break

    # Calibration date: day, month, then big-endian year at 0x194-0x197.
    if len(buf) >= 0x198:
        cal_day = buf[0x194]
        cal_month = buf[0x195]
        cal_year = int.from_bytes(buf[0x196:0x198], "big")
        plausible = (
            1 <= cal_month <= 12
            and 1 <= cal_day <= 31
            and 2015 <= cal_year <= 2050
        )
        if plausible:
            try:
                md.calibration_date = datetime.date(cal_year, cal_month, cal_day)
            except ValueError:
                # In-range fields that are not a real calendar date.
                pass
    return md
# ─── Sample decoder + unit conversion ───────────────────────────────────────
def _find_waveform_body_offset(buf: bytes) -> Optional[int]:
    """Locate the waveform body by trial-decoding every ``00 02 00`` magic
    found at or after ``_BODY_SCAN_FLOOR``.

    The body offset is not fixed across sig-A IDFW files: roughly half the
    production events use ``0x0f1f``, the rest shift with header padding /
    channel-config layout. The scan therefore:

    1. visits every occurrence of the magic from the scan floor onward,
    2. runs ``decode_waveform_v2()`` on the buffer tail at that position,
    3. keeps the offset whose decode yields the most samples in total
       (the earliest offset wins a tie).

    Candidates whose decode raises, returns nothing, or yields no more than
    the trivial 2-sample preamble are ignored.

    Returns the winning offset, or ``None`` when the buffer is too short to
    scan or no candidate produced a real body.
    """
    if len(buf) < _BODY_SCAN_FLOOR + 8:
        return None

    best_offset: Optional[int] = None
    # Starting at 2 folds the "more than the preamble" test and the
    # "strictly better than the current best" test into one comparison.
    best_total = 2
    cursor = _BODY_SCAN_FLOOR
    while True:
        candidate = buf.find(_BODY_MAGIC, cursor)
        if candidate < 0:
            return best_offset
        cursor = candidate + 1
        try:
            channels = decode_waveform_v2(buf[candidate:])
        except Exception:
            # A failed trial decode just means this magic was not the body.
            continue
        if not channels:
            continue
        total = sum(len(samples) for samples in channels.values())
        if total > best_total:
            best_offset, best_total = candidate, total
def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
    """Decode the sample arrays from a sig-A waveform file.

    The body is located with :func:`_find_waveform_body_offset`, because its
    file offset varies between events (~50% sit at the canonical ``0x0f1f``,
    the rest do not). The buffer from that offset onward is then passed to
    ``decode_waveform_v2``.

    Returns the decoder's raw counts dict — geo LSB = 0.0003 in/s, mic in
    its own count unit (see :func:`mic_count_to_psi`) — or ``None`` when no
    usable body is found.
    """
    body_offset = _find_waveform_body_offset(buf)
    return None if body_offset is None else decode_waveform_v2(buf[body_offset:])
def geo_count_to_ips(count: int) -> float:
    """Scale a Thor geophone decoder count to in/s (one count = 0.0003 in/s)."""
    ips = count * _GEO_LSB_IPS
    return ips
def mic_count_to_psi(count: int) -> float:
    """Scale a Thor microphone decoder count to psi.

    The scale factor comes from a regression over 50 sample pairs in
    UM11719_20231219162723.IDFW and is consistent to ~5%. The calibration
    constants in the channel block can refine it once they are decoded.
    """
    psi = count * _MIC_LSB_PSI
    return psi
# ─── IDFH histogram decoder ─────────────────────────────────────────────────
@dataclass
class IdfhInterval:
    """A single decoded histogram interval (typically one minute of monitoring)."""

    # File byte offset of the 72-byte record this interval was decoded from.
    offset: int
    # For each channel: minimum and maximum ADC count (int16 BE in the file)
    # and the half-period in samples. The peak is max(|min|, |max|); the
    # frequency is 512/halfp, with halfp ≤ 5 treated as the ">100 Hz"
    # sentinel (reported as None, matching the sidecar convention).
    tran_min: int
    tran_max: int
    tran_halfp: int
    vert_min: int
    vert_max: int
    vert_halfp: int
    long_min: int
    long_max: int
    long_halfp: int
    micl_min: int
    micl_max: int
    micl_halfp: int

    def peak_count(self, channel: str) -> int:
        """Return the larger magnitude of the channel's min and max counts.

        ``channel`` is matched case-insensitively against the field prefixes
        (``"Tran"``, ``"Vert"``, ``"Long"``, ``"MicL"``).
        """
        low = getattr(self, channel.lower() + "_min")
        high = getattr(self, channel.lower() + "_max")
        return max(abs(low), abs(high))

    def peak_ips(self, channel: str) -> float:
        """Return the channel's peak in in/s (meaningful for geo channels only)."""
        fraction_of_full_scale = self.peak_count(channel) / _IDFH_INT16_FS
        return fraction_of_full_scale * _IDFH_GEO_FULL_SCALE

    def freq_hz(self, channel: str) -> Optional[float]:
        """Return the channel's frequency in Hz, or ``None`` for the
        ">100 Hz" sentinel (half-period of 5 or less)."""
        half_period = getattr(self, channel.lower() + "_halfp")
        return _IDFH_HALFP_FREQ_NUM / half_period if half_period > 5 else None
def _decode_idfh_interval(buf72: bytes, offset: int) -> IdfhInterval:
    """Decode a 72-byte interval record into per-channel min/max/halfp.

    The record holds four 16-byte channel blocks in the order Tran, Vert,
    Long, MicL, followed by an 8-byte tail. Only three fields are read from
    each block: min (int16 BE at +0), max (int16 BE at +2) and half-period
    (uint16 BE at +6).

    ``offset`` is stored on the result unchanged; it is not used to index
    ``buf72``.
    """
    values = {}
    for index, prefix in enumerate(("tran", "vert", "long", "micl")):
        # ">hh2xH" = min, max, two skipped bytes, halfp. The skipped bytes
        # (+4..+5) are an int16 BE of unknown role, possibly time-of-peak.
        # Bytes +10..+11 and +14..+15 are uint16 BE with unknown semantics
        # (likely sum / count contributions for the PVS computation).
        low, high, half_period = struct.unpack_from(">hh2xH", buf72, index * 16)
        values[prefix + "_min"] = low
        values[prefix + "_max"] = high
        values[prefix + "_halfp"] = half_period
    # The 8-byte tail (buf72[64:72]) carries PVS-related data; not yet decoded.
    return IdfhInterval(offset=offset, **values)
def decode_idfh_body(buf: bytes) -> list:
    """Walk an IDFH file and decode every interval record.

    The body consists of one or more segments. As parsed here, each segment
    header is 10 bytes:
    ``[length_be 2B][0a 00 00 00][00 NN_counter][05 3f]``, where ``length``
    counts from the length field through the end of the interval block
    (= 10 + 72 × n_intervals). After a segment, the scan skips a 2-byte
    tail and searches for the next ``0a 00 00 00`` marker.

    Candidate markers are rejected (and scanning resumes one byte later)
    when the marker has no room for the 2-byte length prefix, when the
    validation bytes do not match, or when the length implies no intervals.
    A header that runs past the end of the buffer ends the scan. A segment
    whose interval block is truncated yields only its complete records.

    Confirmed against the 859-file corpus (181,071 intervals decoded; 1
    failure is the sig-B BE9439 file).

    Returns a list of :class:`IdfhInterval`, in file order. The list is
    empty when no valid segment is found.
    """
    marker = b"\x0a\x00\x00\x00"
    size = len(buf)
    intervals: list = []
    i = 0
    while True:
        j = buf.find(marker, i)
        if j < 0:
            break
        if j + 8 > size:
            # The 8 bytes from the marker through ``05 3f`` do not fit, so
            # this cannot be a header. Any later match starts further right
            # and would be truncated too. (Indexing buf[j + 4] here used to
            # raise IndexError on files ending in the marker bytes.)
            break
        if j < 2:
            # No room for the length prefix in front of the marker. Skip
            # this false positive rather than abandoning the whole scan.
            i = j + 1
            continue
        # Validate: [length_be][0a 00 00 00][00 NN][05 3f]
        if buf[j + 4] != 0x00 or buf[j + 6 : j + 8] != b"\x05\x3f":
            i = j + 1
            continue
        length = int.from_bytes(buf[j - 2 : j], "big")
        n = (length - _IDFH_SEGMENT_HEADER) // _IDFH_INTERVAL_SIZE
        if n <= 0:
            i = j + 1
            continue
        header_start = j - 2
        interval_start = header_start + _IDFH_SEGMENT_HEADER
        for k in range(n):
            off = interval_start + k * _IDFH_INTERVAL_SIZE
            if off + _IDFH_INTERVAL_SIZE > size:
                # Truncated segment: keep the complete records decoded so far.
                break
            chunk = buf[off : off + _IDFH_INTERVAL_SIZE]
            intervals.append(_decode_idfh_interval(chunk, off))
        # Advance past this segment + the 2-byte tail.
        i = header_start + length + _IDFH_SEGMENT_TAIL
    return intervals
# ─── Top-level reader ───────────────────────────────────────────────────────
@dataclass
class IdfReadResult:
    """Value returned by :func:`read_idf_file`.

    Waveform files (``.IDFW``) fill ``samples`` with the per-channel sample
    arrays in Thor decoder counts and leave ``intervals`` as ``None``.
    Histogram files (``.IDFH``) leave ``samples`` empty and fill
    ``intervals`` with the per-interval records (peaks, freqs).
    """

    # Event summary built from the binary header and decoded peaks.
    event: IdfEvent
    # {"Tran": [...], ...} for IDFW; empty dict for IDFH.
    samples: dict
    # Header fields recovered from the binary itself.
    binary_metadata: IdfBinaryMetadata
    # Always "thor" for now (sig-A genuine Thor).
    signature: str
    # list[IdfhInterval] for IDFH; None for IDFW.
    intervals: Optional[list] = None
def _build_idf_event(
    md: IdfBinaryMetadata,
    filename: str,
    *,
    kind: str,
    event_type: str,
    peaks: IdfPeaks,
) -> IdfEvent:
    """Assemble the IdfReport / IdfEvent pair shared by both file kinds."""
    rep = IdfReport(
        serial_number=md.serial,
        event_type=event_type,
        event_datetime=md.event_datetime,
        filename=filename,
        sample_rate=md.sample_rate,
        record_time_sec=md.record_time_sec,
    )
    return IdfEvent(
        # Placeholders keep the event constructible when the header fields
        # could not be recovered from the binary.
        serial=md.serial or "UNKNOWN",
        timestamp=md.event_datetime or datetime.datetime(1970, 1, 1),
        kind=kind,
        filename=filename,
        sample_rate=md.sample_rate,
        record_time_sec=md.record_time_sec,
        peaks=peaks,
        report=rep,
    )


def read_idf_file(
    path: Union[str, Path],
    *,
    data: Optional[bytes] = None,
) -> IdfReadResult:
    """Parse a Thor ``.IDFW`` or ``.IDFH`` binary into an ``IdfEvent`` plus
    its decoded payload.

    Only genuine Thor (signature-A) binaries are decoded. A file whose name
    ends in ``.IDFH`` (case-insensitive) is treated as a histogram: its
    interval records are decoded and returned in ``intervals``, with
    ``samples`` left empty. Any other suffix is treated as a waveform: the
    per-channel sample arrays are returned in ``samples`` as int decoder
    counts, which the caller converts to physical units via
    :func:`geo_count_to_ips` / :func:`mic_count_to_psi`.

    Peak vector sum and mic peak are always left as ``None`` here.

    ``path`` is used for the filename in error messages and for ``.IDFH``
    vs ``.IDFW`` suffix detection. When ``data`` is supplied the disk read
    is skipped — useful for ingest paths that already have the bytes in
    memory and where the file may not exist on disk yet.

    Raises:
        ValueError: the Instantel magic is missing, the 6-byte signature is
            unknown, a histogram decodes no intervals, or no usable
            waveform body is found.
        NotImplementedError: the file carries a Series III (Blastware) STRT
            header inside an IDF-named container.
    """
    p = Path(path)
    buf = data if data is not None else p.read_bytes()
    if len(buf) < 16 or buf[6:16] != _INSTANTEL_TAG + b"\x00":
        raise ValueError(f"{p.name}: not an IDF file (missing Instantel magic)")

    sig_prefix = buf[:6]
    if sig_prefix == _BW_STRAY_PREFIX:
        raise NotImplementedError(
            f"{p.name}: file has a Series III (Blastware) STRT header in "
            "an IDF-named container — not a Thor binary. Route through "
            "minimateplus.event_file_io.read_blastware_file() instead "
            "(peaks decode; samples & full metadata don't, but it's not "
            "Thor data so the Thor codec doesn't apply)."
        )
    if sig_prefix != _THOR_PREFIX:
        raise ValueError(f"{p.name}: unknown IDF signature {sig_prefix.hex()}")
    signature = "thor"

    md = extract_binary_metadata(buf)

    if p.suffix.upper() == ".IDFH":
        intervals = decode_idfh_body(buf)
        if not intervals:
            raise ValueError(f"{p.name}: IDFH body decoded no intervals")
        # Peaks: max across all intervals on each channel (per-channel max
        # of stored max-magnitudes; sidecar's PPV row carries the same).
        peaks = IdfPeaks(
            transverse_ips=max((iv.peak_ips("Tran") for iv in intervals), default=0.0),
            vertical_ips=max((iv.peak_ips("Vert") for iv in intervals), default=0.0),
            longitudinal_ips=max((iv.peak_ips("Long") for iv in intervals), default=0.0),
            peak_vector_sum_ips=None,
            mic_pspl_dbl=None,
        )
        event = _build_idf_event(
            md, p.name, kind="Histogram", event_type="Full Histogram", peaks=peaks
        )
        return IdfReadResult(
            event=event,
            samples={},
            binary_metadata=md,
            signature=signature,
            intervals=intervals,
        )

    # Waveform path.
    decoded = _decode_waveform_samples(buf)
    if decoded is None:
        raise ValueError(f"{p.name}: waveform body codec failed")

    def _peak_ips(ch: str) -> float:
        # A missing or empty channel contributes a zero peak.
        arr = decoded.get(ch, [])
        return geo_count_to_ips(max((abs(v) for v in arr), default=0))

    peaks = IdfPeaks(
        transverse_ips=_peak_ips("Tran"),
        vertical_ips=_peak_ips("Vert"),
        longitudinal_ips=_peak_ips("Long"),
        # PVS requires aligned per-sample √(T²+V²+L²); leave None — the
        # sidecar carries it and the bridge picks it up if present.
        peak_vector_sum_ips=None,
        mic_pspl_dbl=None,
    )
    event = _build_idf_event(
        md, p.name, kind="Waveform", event_type="Full Waveform", peaks=peaks
    )
    return IdfReadResult(
        event=event,
        samples=decoded,
        binary_metadata=md,
        signature=signature,
    )