series 4 codec work, initial decode success

This commit is contained in:
2026-05-29 06:33:06 +00:00
parent 1bccc44b88
commit 9b71ead44b
20 changed files with 1578 additions and 76 deletions
+22
View File
@@ -73,6 +73,28 @@ should not import from `sfm/`, must not touch a DB, and have no I/O
beyond reading files passed as arguments. Keep them pure — both
tiers can then depend on them without circularity.
#### Thor IDF binary codec (2026-05-28)
`micromate/idf_file.read_idf_file()` decodes both Thor IDFW
(waveform) and IDFH (histogram) binaries.
- **IDFW** reuses `decode_waveform_v2()` on the body at fixed file
offset `0x0f1f`. Sample fidelity is 87–99% byte-exact on quiet
events; loud events hit the BW codec's known walker-stops-early
limitation.
- **IDFH** has its own segment-based decoder: `[len_be][0a 00 00 00]
[00 NN][05 3f]` + N × 72-byte interval records (4 × 16-byte
per-channel min/max/halfp plus an 8-byte tail). All 859 Thor IDFH corpus files
decode (181,071 intervals); peak matches sidecar within ~1.8%
(ADC quantization).
The two outlier `BE9439_*` files in the Thor example corpus are
actually Series III Blastware binaries that share the `.IDFW`/`.IDFH`
filename convention by accident. `read_idf_file()` detects them by
their BW STRT signature and raises NotImplementedError pointing
callers at `read_blastware_file()`. See
`docs/idf_protocol_reference.md` for full field layouts.
### Practical consequences
When deciding where new code goes, ask:
+65
View File
@@ -0,0 +1,65 @@
"""Run read_idf_file across the corpus and report per-channel accuracy vs sidecars."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from micromate.idf_file import read_idf_file
from analysis_idf.recon import load_sidecar_samples
def sidecar_path(idfw: Path) -> Path:
    """Return the text-sidecar path that belongs to an IDFW binary.

    The sidecar sits in a ``TXT`` directory next to the binary and carries
    the binary's full file name with ``.txt`` appended.
    """
    sidecar_name = idfw.name + ".txt"
    return idfw.parent.joinpath("TXT", sidecar_name)
def main():
    """Score decoded IDFW samples against their text sidecars, corpus-wide.

    Every ``*.IDFW`` fixture is decoded with ``read_idf_file`` and compared,
    per geophone channel, with the sample table of its sidecar (converted to
    integer counts by dividing by ``GEO_LSB``).  Files that fail to decode,
    have no sidecar, or whose sidecar cannot be loaded are counted as
    skipped.  Prints the number of files processed and, per channel, the
    mean / min / max exact-match percentage.
    """
    root = REPO / "tests/fixtures/THORDATA_example"
    # A name matched by "*.IDFW" can never end in ".CDB", so the former
    # suffix filter was dead code and has been dropped.
    files = sorted(root.rglob("*.IDFW"))
    GEO_LSB = 0.0003
    n_ok = n_skip = 0
    overall = {"Tran": [], "Vert": [], "Long": []}
    for f in files:
        # Survey script: any decode failure simply marks the file as skipped.
        try:
            res = read_idf_file(f)
        except Exception:
            n_skip += 1
            continue
        sc_path = sidecar_path(f)
        if not sc_path.exists():
            n_skip += 1
            continue
        try:
            sc = load_sidecar_samples(sc_path)
        except Exception:
            n_skip += 1
            continue
        for ch in ("Tran", "Vert", "Long"):
            sc_counts = [int(round(v / GEO_LSB)) for v in sc[ch]]
            dec = res.samples.get(ch, [])
            n = min(len(sc_counts), len(dec))
            if n == 0:
                # Nothing to compare; keep this channel out of the statistics.
                continue
            # zip() stops at the shorter sequence, i.e. after n pairs.
            exact = sum(1 for want, got in zip(sc_counts, dec) if want == got)
            overall[ch].append(100.0 * exact / n)
        n_ok += 1
    print(f"Processed {n_ok} files (skipped {n_skip})")
    print("Per-channel exact-match % (mean / min / max):")
    for ch, vals in overall.items():
        if vals:
            avg = sum(vals) / len(vals)
            print(f" {ch}: mean={avg:.2f}% min={min(vals):.2f}% max={max(vals):.2f}% n={len(vals)}")
if __name__ == "__main__":
main()
+49
View File
@@ -0,0 +1,49 @@
"""Find where decoded-vs-sidecar diverges for each channel."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from minimateplus.waveform_codec import decode_waveform_v2
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def main():
    """Report where the decoded TARGET waveform first departs from its sidecar.

    For each geophone channel this prints the first mismatching sample index
    with a few samples of context (3 before, 7 after), then the total number
    of mismatches and the longest run of consecutive exact matches.
    """
    buf = TARGET.read_bytes()
    sc = load_sidecar_samples(TXT)
    decoded = decode_waveform_v2(buf[0x0f1f:])
    if not decoded:
        print("decode_waveform_v2 returned no data")
        return
    GEO_LSB = 0.0003
    for ch in ("Tran", "Vert", "Long"):
        sc_counts = [int(round(v / GEO_LSB)) for v in sc[ch]]
        dec = decoded[ch]
        # Only the overlap is comparable; indexing sc_counts over len(dec)
        # raised IndexError whenever the decoder produced more samples than
        # the sidecar lists.
        n = min(len(dec), len(sc_counts))
        first_diff = next((i for i in range(n) if dec[i] != sc_counts[i]), None)
        if first_diff is None:
            print(f"{ch}: NO MISMATCHES")
            continue
        print(f"{ch}: first diff at idx {first_diff}")
        for i in range(max(0, first_diff - 3), min(n, first_diff + 8)):
            mark = " " if dec[i] == sc_counts[i] else "**"
            print(f" {mark} idx {i:4d}: sc={sc_counts[i]:6d} dec={dec[i]:6d} diff={dec[i]-sc_counts[i]:+d}")
        # Tally mismatches and track the longest streak of exact matches.
        cum_match_run = 0
        max_match_run = 0
        diff_count = 0
        for i in range(n):
            if dec[i] == sc_counts[i]:
                cum_match_run += 1
                max_match_run = max(max_match_run, cum_match_run)
            else:
                cum_match_run = 0
                diff_count += 1
        print(f" total mismatches: {diff_count}/{n}, longest run of matches: {max_match_run}")
        print()
if __name__ == "__main__":
main()
+48
View File
@@ -0,0 +1,48 @@
"""End-to-end IDFH ingest verification."""
from __future__ import annotations
import sys
import tempfile
import json
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from sfm.waveform_store import WaveformStore
def main():
    """Ingest one IDFH fixture into a temporary store and print the outcome.

    The binary and its ``.txt`` report are handed to
    ``WaveformStore.save_imported_idf``.  Selected fields of the returned
    record and event are printed, then the ``.sfm.json`` file under the
    store's ``UM13981`` directory is loaded and the number of entries in
    ``extensions.idf_intervals`` is reported.
    """
    idfh = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM13981/UM13981_20220805075441.IDFH"
    report = idfh.parent.joinpath("TXT", f"{idfh.name}.txt")
    with tempfile.TemporaryDirectory() as td:
        store_root = Path(td)
        store = WaveformStore(store_root)
        ev, rec = store.save_imported_idf(
            idfh.read_bytes(),
            idfh,
            idf_report_text=report.read_text(errors="replace"),
        )
        print("=== save_imported_idf (IDFH) ===")
        # (label, record key); "h5" is expected to be None for a histogram.
        record_fields = (
            ("serial", "serial"),
            ("filename", "filename"),
            ("filesize", "filesize"),
            ("h5", "hdf5_filename"),
            ("sidecar", "sidecar_filename"),
        )
        for label, key in record_fields:
            print(f" {label}: {rec[key]}")
        print()
        print("=== Event ===")
        for attr in ("timestamp", "record_type", "sample_rate"):
            print(f" {attr}: {getattr(ev, attr)}")
        print()
        # Inspect the JSON sidecar to confirm the intervals were stashed.
        sidecar_file = store_root / "UM13981" / f"{idfh.name}.sfm.json"
        payload = json.loads(sidecar_file.read_text())
        intervals = payload.get("extensions", {}).get("idf_intervals", [])
        print(f" sidecar intervals: {len(intervals)}")
        if not intervals:
            return
        print(f" first interval: {intervals[0]}")
        print(f" last interval: {intervals[-1]}")
if __name__ == "__main__":
main()
+40
View File
@@ -0,0 +1,40 @@
"""Verify the had_report=False path: ingest IDFW with no .txt."""
from __future__ import annotations
import sys
from pathlib import Path
import tempfile
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from sfm.waveform_store import WaveformStore
def main():
    """Ingest one IDFW fixture without any ``.txt`` report and print the result.

    Calls ``WaveformStore.save_imported_idf`` with ``idf_report_text=None``
    inside a temporary store, then prints the serial, event fields, the
    per-channel raw sample counts, the peak values when present, and the
    HDF5 file name from the returned record.
    """
    idfw = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162723.IDFW"
    with tempfile.TemporaryDirectory() as td:
        store = WaveformStore(Path(td))
        ev, rec = store.save_imported_idf(
            idfw.read_bytes(),
            idfw,
            serial_hint=None,
            idf_report_text=None,  # the point of this script: no report text
        )
        print("=== IDFW without .txt ingest ===")
        print(f" serial: {rec['serial']}")
        for attr in ("timestamp", "sample_rate", "record_type"):
            print(f" {attr}: {getattr(ev, attr)}")
        print(f" rectime_sec: {ev.rectime_seconds}")
        # Missing or empty raw_samples counts as zero samples on every channel.
        counts = []
        for ch in ("Tran", "Vert", "Long", "MicL"):
            n_samples = len(ev.raw_samples.get(ch, [])) if ev.raw_samples else 0
            counts.append(f"{ch}={n_samples}")
        print(" raw_samples: " + " ".join(counts))
        peaks = ev.peak_values
        if peaks:
            print(f" peak_values: tran={peaks.tran} vert={peaks.vert} long={peaks.long}")
        print(f" h5 written: {rec['hdf5_filename']}")
if __name__ == "__main__":
main()
+52
View File
@@ -0,0 +1,52 @@
"""End-to-end ingest test: feed an IDFW + .txt to save_imported_idf in a tmp store."""
from __future__ import annotations
import sys
from pathlib import Path
import tempfile
import shutil
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from sfm.waveform_store import WaveformStore
def main():
    """Ingest one IDFW fixture together with its ``.txt`` report and print the result.

    Runs ``WaveformStore.save_imported_idf`` in a temporary store, prints the
    returned record and event fields, and finally checks whether the
    ``.h5`` and ``.sfm.json`` files exist under the store's ``UM11719``
    directory, printing each file's size (0 when absent).
    """
    idfw = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162723.IDFW"
    report = idfw.parent.joinpath("TXT", f"{idfw.name}.txt")
    with tempfile.TemporaryDirectory() as td:
        store = WaveformStore(Path(td))
        ev, rec = store.save_imported_idf(
            idfw.read_bytes(),
            idfw,
            serial_hint=None,
            idf_report_text=report.read_text(errors="replace"),
        )
        print("=== Save result ===")
        record_fields = (
            ("serial", "serial"),
            ("filename", "filename"),
            ("filesize", "filesize"),
            ("h5", "hdf5_filename"),
            ("sidecar", "sidecar_filename"),
        )
        for label, key in record_fields:
            print(f" {label}: {rec[key]}")
        print()
        print("=== Event ===")
        # Not every event object necessarily carries a serial attribute.
        print(f" serial: {getattr(ev, 'serial', '(n/a)')}")
        for label, attr in (
            ("timestamp", "timestamp"),
            ("sample_rate", "sample_rate"),
            ("record_type", "record_type"),
            ("rectime_sec", "rectime_seconds"),
        ):
            print(f" {label}: {getattr(ev, attr)}")
        counts = []
        for ch in ("Tran", "Vert", "Long", "MicL"):
            n_samples = len(ev.raw_samples.get(ch, [])) if ev.raw_samples else 0
            counts.append(f"{ch}={n_samples}")
        print(" raw_samples: " + ", ".join(counts))
        peaks = ev.peak_values
        if peaks:
            print(f" peaks (txt): Tran={peaks.tran} Vert={peaks.vert} Long={peaks.long}")
        print()
        # Verify that the output files were actually written.
        unit_dir = Path(td) / "UM11719"
        for label, suffix in ((" h5 exists: ", ".h5"), (" sidecar exists:", ".sfm.json")):
            artifact = unit_dir / f"{idfw.name}{suffix}"
            present = artifact.exists()
            size = artifact.stat().st_size if present else 0
            print(f"{label}{present} size={size}")
if __name__ == "__main__":
main()
+137
View File
@@ -0,0 +1,137 @@
"""Decode IDFH histogram intervals + verify against sidecar."""
from __future__ import annotations
import sys
import struct
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
# Header bytes of a full 10-interval segment: big-endian length 0x02da
# (= 730 = 10 × 72 + 10) followed by the ``0a 00 00 00`` marker.
SEGMENT_MAGIC = b"\x02\xda\x0a\x00\x00\x00"
SEGMENT_SIZE = 732  # = 10-byte header + 10 × 72-byte intervals + 2-byte tail
INTERVAL_SIZE = 72
CHANNELS = ("Tran", "Vert", "Long", "MicL")


def decode_interval(buf72: bytes) -> dict:
    """Decode one 72-byte interval into per-channel min/max/halfp.

    The first 64 bytes hold four consecutive 16-byte records, one per entry
    of ``CHANNELS``.  Within a record (all big-endian): int16 ``min`` at 0,
    int16 ``max`` at 2, int16 ``field4`` at 4, uint16 ``halfp`` at 6,
    uint16 ``field10`` at 10 and uint16 ``field14`` at 14; bytes 8-9 and
    12-13 are not interpreted.

    Returns a dict keyed by channel name, each value holding the raw fields
    plus ``peak`` (``max(|min|, |max|)``) and ``freq_hz`` (``512 / halfp``,
    or ``None`` when ``halfp`` is 5 or less).  The key ``"_tail"`` holds
    everything from byte 64 onward as space-separated hex.

    Raises ``struct.error`` if a channel record is incomplete.
    """
    out = {}
    for i, ch in enumerate(CHANNELS):
        # One unpack per record; the two "_skip" words are the gaps at 8 and 12.
        mn, mx, sb, halfp, _skip8, f10, _skip12, f14 = struct.unpack_from(
            ">hhhHHHHH", buf72, i * 16
        )
        out[ch] = {
            "min": mn,
            "max": mx,
            "field4": sb,
            "halfp": halfp,
            "field10": f10,
            "field14": f14,
            "peak": max(abs(mn), abs(mx)),
            "freq_hz": (512.0 / halfp) if halfp > 5 else None,
        }
    out["_tail"] = buf72[64:].hex(" ")
    return out


def walk_idfh(buf: bytes) -> list:
    """Walk all interval records in an IDFH file.

    Scans ``buf`` for segment headers of the 10-byte form
    ``[length_be 2B][0a 00 00 00][00 NN][05 3f]``.  Each accepted header is
    followed by ``(length - 10) // 72`` interval records of 72 bytes; the
    scan then resumes ``length + 2`` bytes after the header start.

    Returns a list of dicts, one per interval, each carrying the file
    ``offset`` of the record plus everything ``decode_interval`` returns.
    An interval that would run past the end of ``buf`` ends that segment.
    """
    intervals = []
    i = 0
    while True:
        j = buf.find(b"\x0a\x00\x00\x00", i)
        if j < 0:
            break
        if j < 2:
            # No room for the 2-byte length in front of the marker.
            i = j + 1
            continue
        if j + 10 > len(buf):
            # Any later marker would be even closer to the end, so stop.
            break
        length = int.from_bytes(buf[j - 2:j], "big")
        # Reject look-alikes: the marker must be followed by 00 NN 05 3f.
        if buf[j + 4] != 0x00 or buf[j + 6:j + 8] != b"\x05\x3f":
            i = j + 1
            continue
        header_start = j - 2
        n_intervals = (length - 10) // INTERVAL_SIZE
        interval_start = header_start + 10
        for k in range(n_intervals):
            off = interval_start + k * INTERVAL_SIZE
            if off + INTERVAL_SIZE > len(buf):
                break
            chunk = buf[off:off + INTERVAL_SIZE]
            intervals.append({"offset": off, **decode_interval(chunk)})
        # Always advance past this marker: with a zero length field the
        # computed resume point equals j, and the scan would find the same
        # marker again forever.
        i = max(header_start + length + 2, j + 1)
    return intervals
def main():
    """Decode two IDFH fixtures with ``walk_idfh`` and print them beside sidecar rows.

    For the multi-segment file, intervals 0, 1, 78, 79 and 80 (where
    present) are printed channel by channel, each followed by the sidecar
    row of the same index when one exists.  For the single-interval file
    only the first interval and the first sidecar row are shown.
    """

    def show_channels(iv):
        # One line per channel: raw peak count, peak scaled by 10/32768, halfp, freq.
        for ch in CHANNELS:
            d = iv[ch]
            peak_ips = d["peak"] / 32768 * 10.0
            print(f" {ch}: peak={d['peak']:5d} ({peak_ips:.4f} in/s) halfp={d['halfp']:5d} freq={d['freq_hz']}")

    # Multi-segment IDFH.
    target = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM13981/UM13981_20220805075441.IDFH"
    sc_path = target.parent / "TXT" / f"{target.name}.txt"
    buf = target.read_bytes()
    intervals = walk_idfh(buf)
    print(f"=== {target.name} ===")
    print(f" file size: {len(buf)}")
    print(f" decoded intervals: {len(intervals)}")
    # Sidecar data rows are recognised by their leading year.
    sc_rows = [
        line
        for line in sc_path.read_text(errors="replace").splitlines()
        if line.startswith(("2022-", "2023-"))
    ]
    print(f" sidecar rows: {len(sc_rows)}")
    print()
    for k in (0, 1, 78, 79, 80):
        if k >= len(intervals):
            continue
        iv = intervals[k]
        print(f"--- interval {k} @0x{iv['offset']:04x} ---")
        show_channels(iv)
        if k < len(sc_rows):
            print(f" SC: {sc_rows[k]}")
    # Single-interval IDFH.
    print()
    target2 = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162648.IDFH"
    sc2 = target2.parent / "TXT" / f"{target2.name}.txt"
    buf2 = target2.read_bytes()
    intervals2 = walk_idfh(buf2)
    print(f"=== {target2.name} ===")
    print(f" file size: {len(buf2)}, decoded intervals: {len(intervals2)}")
    if intervals2:
        show_channels(intervals2[0])
        sc_rows2 = [
            row
            for row in sc2.read_text(errors='replace').splitlines()
            if row.startswith("2023-")
        ]
        if sc_rows2:
            print(f" SC: {sc_rows2[0]}")
if __name__ == "__main__":
main()
+41
View File
@@ -0,0 +1,41 @@
"""Find IDFH interval period via auto-correlation of structural patterns."""
from __future__ import annotations
import sys
from pathlib import Path
from collections import Counter
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
def main():
    """Rank candidate IDFH interval sizes by their count of always-zero columns.

    The body slice ``0xF96:0x270C`` of one fixture is cut into records of
    every size from 16 to 99 bytes.  For each size that yields at least 30
    records, a column scores a point when at least 90 % of the records hold
    a zero byte there.  The ten best (score, size, count) results are
    printed, highest first.
    """
    target = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM13981/UM13981_20220805075441.IDFH"
    buf = target.read_bytes()
    body_start = 0xF96
    body_end = 0x270C
    body = buf[body_start:body_end]
    print(f"body size: {len(body)} bytes (file {len(buf)} bytes)")
    print()
    print("=== zero-column score by interval size (higher = more likely) ===")
    ranking = []
    for sz in range(16, 100):
        n = len(body) // sz
        if n < 30:
            continue
        threshold = n * 0.9
        # body[col : n*sz : sz] is column `col` across the n whole records.
        score = sum(
            1
            for col in range(sz)
            if body[col : n * sz : sz].count(0) >= threshold
        )
        ranking.append((score, sz, n))
    for score, sz, n in sorted(ranking, reverse=True)[:10]:
        print(f" size={sz:3d} n_intervals={n} consistently-zero-cols={score}")
if __name__ == "__main__":
main()
+40
View File
@@ -0,0 +1,40 @@
"""Per-file accuracy + sample-count details."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from micromate.idf_file import read_idf_file
from analysis_idf.recon import load_sidecar_samples
def main():
    """Print per-file Tran accuracy and sample counts for the first 20 usable files.

    A file is usable when ``read_idf_file`` succeeds and its sidecar exists.
    For each one the line shows the file name, file size, sidecar and
    decoded sample counts, and the exact-match percentage over the overlap
    (0.0 when there is no overlap).
    """
    root = REPO / "tests/fixtures/THORDATA_example"
    files = sorted(f for f in root.rglob("*.IDFW") if not str(f).endswith(".CDB"))
    GEO_LSB = 0.0003
    # Detail output stops after this many successfully processed files.
    remaining = 20
    for f in files:
        try:
            res = read_idf_file(f)
        except Exception:
            continue
        sc_path = f.parent / "TXT" / f"{f.name}.txt"
        if not sc_path.exists():
            continue
        sc = load_sidecar_samples(sc_path)
        sc_tran = [int(round(v / GEO_LSB)) for v in sc["Tran"]]
        dec = res.samples.get("Tran", [])
        n = min(len(sc_tran), len(dec))
        if n:
            exact = sum(1 for i in range(n) if sc_tran[i] == dec[i])
            pct = 100.0 * exact / n
        else:
            pct = 0.0
        print(f"{f.name:40s} size={f.stat().st_size:6d} sc_n={len(sc_tran):4d} dec_n={len(dec):4d} exact={pct:.1f}%")
        remaining -= 1
        if remaining <= 0:
            break
if __name__ == "__main__":
main()
+64
View File
@@ -0,0 +1,64 @@
"""Look at what's at the divergence boundary."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from minimateplus.waveform_codec import walk_body, find_data_start, parse_segment_header
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def main():
    """Dump the first 30 codec blocks of TARGET and a running per-segment sample tally.

    The body is taken from file offset ``0x0f1f``.  The first pass prints
    each block's file offset and kind, with parsed header fields for
    ``0x40`` segment headers and length / leading data bytes otherwise.
    The second pass walks the same blocks, switching channel at every
    segment header and adding each block's sample count to a running total.
    """
    buf = TARGET.read_bytes()
    body = buf[0x0f1f:]
    start = find_data_start(body)
    print(f"data_start: {start} (= file offset 0x{0x0f1f + start:04x})")
    blocks = walk_body(body, start)
    print(f"{len(blocks)} blocks total")
    print()
    head = blocks[:30]
    print("=== first 30 blocks ===")
    for i, b in enumerate(head):
        body_off = 0x0f1f + b.offset
        if b.tag_hi != 0x40:
            print(f" [{i:3d}] @0x{body_off:04x} {b.kind} len={b.length} data={b.data[:16].hex()}")
            continue
        hdr = parse_segment_header(b)
        if hdr:
            counter = hdr['counter']
            field2 = hdr['field2'].hex()
            anchor = hdr['anchor_bytes'].hex()
            tail = hdr['tail'].hex()
        else:
            # Header could not be parsed: show placeholders.
            counter = field2 = anchor = tail = '?'
        print(f" [{i:3d}] @0x{body_off:04x} {b.kind} (segment header) counter={counter} field2={field2} anchor={anchor} tail={tail}")
    print()
    print("=== cumulative samples through blocks ===")
    cur_ch = "Tran"
    rotation = ["Vert", "Long", "MicL", "Tran"]
    seg_count = 0
    samples_in_curseg = 2  # the two preamble samples Tran[0], Tran[1]
    for i, b in enumerate(head):
        hi = b.tag_hi
        family = hi & 0xF0
        if hi == 0x40:
            seg_count += 1
            prev_ch = cur_ch
            cur_ch = rotation[(seg_count - 1) % 4]
            print(f" [{i:3d}] 40 02 -> end of {prev_ch} segment, start {cur_ch} (segment {seg_count})")
            samples_in_curseg = 2  # each new segment starts with two anchors
        elif family in (0x10, 0x20):
            # Both delta encodings carry a 12-bit sample count in the tag.
            label = "nibble" if family == 0x10 else "int8"
            nn = ((hi & 0x0F) << 8) | b.tag_lo
            samples_in_curseg += nn
            print(f" [{i:3d}] {b.kind} {label}: +{nn} samples, ch={cur_ch}, ch_total~{samples_in_curseg}")
        elif hi == 0x00:
            samples_in_curseg += b.tag_lo
            print(f" [{i:3d}] {b.kind} RLE: +{b.tag_lo}, ch={cur_ch}, ch_total~{samples_in_curseg}")
        elif hi == 0x30:
            samples_in_curseg += b.tag_lo
            print(f" [{i:3d}] {b.kind} packed12: +{b.tag_lo} samples, ch={cur_ch}, ch_total~{samples_in_curseg}")
if __name__ == "__main__":
main()
+89
View File
@@ -0,0 +1,89 @@
"""Reconnaissance helpers for cracking the Thor IDFW binary."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
TARGET = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162723.IDFW"
TXT = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/TXT/UM11719_20231219162723.IDFW.txt"
def hex_at(buf: bytes, off: int, n: int = 32) -> str:
    """Format up to ``n`` bytes of ``buf`` starting at ``off`` as one dump line.

    The line is the offset as four hex digits, the bytes as space-separated
    two-digit hex, then an ASCII column in which every byte outside the
    printable range 32..126 is shown as ``.``.
    """
    window = buf[off : off + n]
    hex_cells = []
    ascii_cells = []
    for byte in window:
        hex_cells.append("%02x" % byte)
        ascii_cells.append(chr(byte) if 0x20 <= byte <= 0x7E else ".")
    return "{:04x}: {} {}".format(off, " ".join(hex_cells), "".join(ascii_cells))
def find_all(buf: bytes, needle: bytes) -> list[int]:
    """Return every offset at which ``needle`` occurs in ``buf``.

    Offsets are in ascending order and overlapping occurrences are all
    reported, because the search resumes one byte after each hit.
    """
    hits: list[int] = []
    pos = buf.find(needle)
    while pos != -1:
        hits.append(pos)
        pos = buf.find(needle, pos + 1)
    return hits
def load_sidecar_samples(path: Path) -> dict[str, list[float]]:
    """Parse the txt sample table — Tran/Vert/Long/MicL.

    Reading starts after the line ``Waveform Data Channels`` and stops at
    the first line beginning with ``Waveform Data USB Channels``.  Rows are
    tab-separated; columns 1-4 are taken as Tran, Vert, Long and MicL.
    The header row, rows with fewer than five columns, and rows containing
    a non-numeric cell are skipped.

    Returns a dict mapping each channel name to its list of floats.  All
    four lists always have the same length: a row is stored only when every
    one of its four cells parses, so one bad cell can no longer shift a
    channel out of step with the others.
    """
    channels = ("Tran", "Vert", "Long", "MicL")
    out = {ch: [] for ch in channels}
    in_block = False
    for line in path.read_text(errors="replace").splitlines():
        if not in_block:
            if line.strip() == "Waveform Data Channels":
                in_block = True
            continue
        if line.startswith("Waveform Data USB Channels"):
            break
        parts = line.split("\t")
        # Skip short rows and the header row "\tTran\tVert\tLong\tMicL".
        if len(parts) < 5 or parts[1] == "Tran":
            continue
        try:
            # Parse the whole row before storing anything from it.
            row = [float(cell) for cell in parts[1:5]]
        except ValueError:
            continue
        for ch, value in zip(channels, row):
            out[ch].append(value)
    return out
def main():
    """Print a first-look summary of TARGET and its sidecar.

    Shows the file size, the sidecar row count and first six samples per
    channel, a hex dump of the first 20 occurrences of ``00 02 00``, and
    every occurrence of ``40 02`` with 4 bytes of leading and 20 bytes of
    trailing context.
    """
    channels = ("Tran", "Vert", "Long", "MicL")
    buf = TARGET.read_bytes()
    samples = load_sidecar_samples(TXT)
    print(f"file size: {len(buf)} bytes")
    row_counts = " ".join(f"{ch}={len(samples[ch])}" for ch in channels)
    print("sample rows: " + row_counts)
    for ch in channels:
        print(f"first 6 {ch} samples: {samples[ch][:6]}")
    print()
    print("=== BW magic '00 02 00' positions ===")
    magic_hits = find_all(buf, b"\x00\x02\x00")
    print(f"{len(magic_hits)} hits")
    for pos in magic_hits[:20]:
        print(hex_at(buf, pos, 24))
    print()
    print("=== '40 02' segment-header positions ===")
    header_hits = find_all(buf, b"\x40\x02")
    print(f"{len(header_hits)} hits")
    for pos in header_hits:
        # Leading bytes help tell real headers from chance occurrences.
        before = buf[max(0, pos - 4): pos].hex()
        after = buf[pos: pos + 20].hex()
        print(f" 0x{pos:04x} pre={before} post={after}")
if __name__ == "__main__":
main()
+40
View File
@@ -0,0 +1,40 @@
"""Find each segment boundary in the channel and check if errors reset there."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from minimateplus.waveform_codec import decode_waveform_v2
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def main():
    """List where decoded samples switch between matching and not matching the sidecar.

    For each geophone channel the overlap of decoded and sidecar counts is
    scanned; every index at which the match state flips is recorded as
    ``DIVERGE`` (match to mismatch) or ``RESYNC`` (mismatch to match).
    The first 20 transitions per channel are printed.
    """
    buf = TARGET.read_bytes()
    sc = load_sidecar_samples(TXT)
    decoded = decode_waveform_v2(buf[0x0f1f:])
    GEO_LSB = 0.0003
    for ch in ("Tran", "Vert", "Long"):
        sc_counts = [int(round(v / GEO_LSB)) for v in sc[ch]]
        dec = decoded[ch]
        transitions = []
        was_match = True  # the scan starts in the "matching" state
        # zip() limits the scan to the shorter of the two sequences.
        for idx, (want, got) in enumerate(zip(sc_counts, dec)):
            is_match = want == got
            if is_match == was_match:
                continue
            label = "RESYNC" if is_match else "DIVERGE"
            transitions.append((idx, label, want, got))
            was_match = is_match
        print(f"{ch}: {len(transitions)} transitions")
        for idx, label, want, got in transitions[:20]:
            print(f" idx {idx:4d} {label:8s} sc={want:6d} dec={got:6d} diff={got-want:+d}")
        print()
if __name__ == "__main__":
main()
+46
View File
@@ -0,0 +1,46 @@
"""Smoke-test read_idf_file on IDFH across the corpus."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from micromate.idf_file import read_idf_file
def main():
    """Decode one IDFH fixture in detail, then sweep every IDFH file in the corpus.

    The sweep counts files that decode, files that raise
    ``NotImplementedError`` and files that fail with any other exception
    (the first three such failures are printed), and totals the decoded
    intervals.
    """
    target = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162648.IDFH"
    result = read_idf_file(target)
    ev = result.event
    print(f"=== {target.name} ===")
    print(f" signature: {result.signature}")
    for attr in ("serial", "timestamp", "sample_rate", "kind"):
        print(f" {attr}: {getattr(ev, attr)}")
    print(f" intervals: {len(result.intervals or [])}")
    peaks = ev.peaks
    print(f" peaks: T={peaks.transverse_ips:.4f} V={peaks.vertical_ips:.4f} L={peaks.longitudinal_ips:.4f}")
    print()
    root = REPO / "tests/fixtures/THORDATA_example"
    files = list(root.rglob("*.IDFH"))
    tally = {"ok": 0, "fail": 0, "nyi": 0}
    total_intervals = 0
    for f in files:
        try:
            decoded = read_idf_file(f)
            tally["ok"] += 1
            total_intervals += len(decoded.intervals or [])
        except NotImplementedError:
            tally["nyi"] += 1
        except Exception as exc:
            tally["fail"] += 1
            # Only the first few failures are shown to keep the output short.
            if tally["fail"] <= 3:
                print(f" FAIL: {f.name}: {type(exc).__name__}: {exc}")
    print(f"Corpus: {len(files)} IDFH files | ok={tally['ok']} fail={tally['fail']} nyi={tally['nyi']}")
    print(f"Total intervals decoded: {total_intervals}")
if __name__ == "__main__":
main()
+48
View File
@@ -0,0 +1,48 @@
"""Smoke-test read_idf_file across the sample corpus."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from micromate.idf_file import read_idf_file, geo_count_to_ips, mic_count_to_psi
def main():
    """Decode one IDFW fixture in detail, then sweep every IDFW file in the corpus.

    The sweep counts files that decode, files that raise
    ``NotImplementedError`` and files that fail with any other exception;
    the first five such failures are printed.
    """
    target = REPO / "tests/fixtures/THORDATA_example/THORDATA_example/UPMC Presby/UM11719/UM11719_20231219162723.IDFW"
    result = read_idf_file(target)
    ev = result.event
    print(f"=== {target.name} ===")
    print(f" signature: {result.signature}")
    for label, attr in (
        ("serial", "serial"),
        ("timestamp", "timestamp"),
        ("sample_rate", "sample_rate"),
        ("record_time", "record_time_sec"),
    ):
        print(f" {label}: {getattr(ev, attr)}")
    print(f" calibration: {result.binary_metadata.calibration_date}")
    for ch, peak_attr in (
        ("Tran", "transverse_ips"),
        ("Vert", "vertical_ips"),
        ("Long", "longitudinal_ips"),
    ):
        peak = getattr(ev.peaks, peak_attr)
        print(f" {ch} samples: {len(result.samples[ch])}, peak_ips={peak:.4f}")
    print(f" MicL samples: {len(result.samples['MicL'])}")
    print()
    # Corpus sweep.
    root = REPO / "tests/fixtures/THORDATA_example"
    files = [f for f in root.rglob("*.IDFW") if not str(f).endswith(".CDB")]
    tally = {"ok": 0, "fail": 0, "nyi": 0}
    for f in files:
        try:
            read_idf_file(f)
            tally["ok"] += 1
        except NotImplementedError:
            tally["nyi"] += 1
        except Exception as exc:
            tally["fail"] += 1
            if tally["fail"] <= 5:
                print(f" FAIL: {f.name}: {type(exc).__name__}: {exc}")
    print()
    print(f"Corpus: {len(files)} IDFW files | ok={tally['ok']} fail={tally['fail']} not-implemented={tally['nyi']}")
if __name__ == "__main__":
main()
+73
View File
@@ -0,0 +1,73 @@
"""Trace Tran sample-by-sample to find exactly where the codec drifts."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def s4(n: int) -> int:
    """Reinterpret a 4-bit value as signed: 8 and above map to ``n - 16``."""
    if n >= 8:
        return n - 16
    return n
def i8(b: int) -> int:
    """Reinterpret a byte value as signed: 128 and above map to ``b - 256``."""
    if b >= 128:
        return b - 256
    return b
def main():
    """Trace Tran by hand through the first two blocks and compare with the sidecar.

    Reads the two preamble samples from body bytes 3..6, then accumulates
    block 0 as signed 4-bit deltas and block 1 as signed 8-bit deltas,
    printing selected indices next to the sidecar count.  Each printed row
    ends in ``ok`` when the running value equals the sidecar and ``**``
    when it does not.
    """
    buf = TARGET.read_bytes()
    sc = load_sidecar_samples(TXT)
    GEO_LSB = 0.0003
    sc_tran = [int(round(v / GEO_LSB)) for v in sc["Tran"]]
    body = buf[0x0f1f:]
    # Tran[0], Tran[1] from preamble
    t0 = int.from_bytes(body[3:5], "big", signed=True)
    t1 = int.from_bytes(body[5:7], "big", signed=True)
    print(f"preamble Tran[0]={t0} Tran[1]={t1} (sidecar: {sc_tran[0]}, {sc_tran[1]})")
    # Block 0: 10 f8 at body[7:9]
    print(f"block 0: tag {body[7]:02x} {body[8]:02x}")
    print(f" block 0 first 10 data bytes: {body[9:19].hex()}")
    # Walk block 0 manually, comparing each sample
    cur = t1
    samples = [t0, t1]
    nn = body[8]
    print(f" NN = {nn}")
    data = body[9 : 9 + nn // 2]  # two 4-bit deltas per byte
    for byte in data:
        for nib in ((byte >> 4) & 0xF, byte & 0xF):
            cur += s4(nib)
            samples.append(cur)
            idx = len(samples) - 1
            if 0 <= idx < len(sc_tran):
                sc_v = sc_tran[idx]
                # Both arms of this marker used to be empty strings, so the
                # column never showed whether the sample matched.
                match = "ok" if sc_v == cur else "**"
                if idx < 12 or 240 <= idx <= 260:
                    print(f" idx {idx:3d}: nibble byte={byte:02x} nib={nib:x} delta={s4(nib):+d} cur={cur:+d} sc={sc_v:+d} {match}")
    print(f"end of block 0: cur={cur}, len(samples)={len(samples)}, decoder expected 250 here")
    # Block 1 starts right after block 0's data bytes.
    block1_off = 9 + nn // 2
    print(f"block 1: tag {body[block1_off]:02x} {body[block1_off+1]:02x} (expecting 20 28)")
    nn1 = body[block1_off + 1]
    print(f" block 1 NN = {nn1}")
    data1 = body[block1_off + 2 : block1_off + 2 + nn1]  # one 8-bit delta per byte
    for byte in data1:
        cur += i8(byte)
        samples.append(cur)
        idx = len(samples) - 1
        if idx < len(sc_tran):
            sc_v = sc_tran[idx]
            match = "ok" if sc_v == cur else "**"
            if 248 <= idx <= 295:
                print(f" idx {idx:3d}: int8 byte={byte:02x} delta={i8(byte):+d} cur={cur:+d} sc={sc_v:+d} {match}")
if __name__ == "__main__":
main()
+42
View File
@@ -0,0 +1,42 @@
"""Feed candidate body offsets to the BW codec and compare with sidecar."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from minimateplus.waveform_codec import decode_waveform_v2, walk_body, find_data_start
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def main():
    """Try several candidate body offsets with ``decode_waveform_v2``.

    First prints the first 30 sidecar samples per channel as integer counts
    (geophone channels divided by 0.0003, MicL by 1e-6).  Then, for each
    candidate offset, decodes ``buf[off:]`` and prints the length and first
    20 values of every channel, or a note when the decoder returns nothing.
    """
    buf = TARGET.read_bytes()
    sc = load_sidecar_samples(TXT)
    # All conversions are done before anything is printed.
    # Sidecar samples in 0.0003 counts (Thor geo LSB).
    geo_counts = {
        ch: [int(round(v / 0.0003)) for v in sc[ch][:30]]
        for ch in ("Tran", "Vert", "Long")
    }
    # The MicL divisor is a trial value, not a confirmed scale.
    sc_micl = [int(round(v / 1e-6)) for v in sc["MicL"][:30]]
    for ch in ("Tran", "Vert", "Long"):
        print(f"sidecar {ch} (counts): {geo_counts[ch]}")
    print(f"sidecar MicL (×1e-6): {sc_micl}")
    print()
    # Try candidate body start offsets.
    candidates = (0x0f1f, 0x1057, 0x11f1, 0x1333, 0x1bde, 0x0d30)
    for off in candidates:
        print(f"=== body @ 0x{off:04x} ===")
        decoded = decode_waveform_v2(buf[off:])
        if not decoded:
            print(" decode_waveform_v2 returned None")
            continue
        for ch in ("Tran", "Vert", "Long", "MicL"):
            arr = decoded.get(ch, [])
            print(f" {ch}[{len(arr)}]: {arr[:20]}")
        print()
if __name__ == "__main__":
main()
+51
View File
@@ -0,0 +1,51 @@
"""Verify decode_waveform_v2 against sidecar across all 2304 samples per channel."""
from __future__ import annotations
import sys
from pathlib import Path
REPO = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(REPO))
from minimateplus.waveform_codec import decode_waveform_v2
from analysis_idf.recon import TARGET, TXT, load_sidecar_samples
def main():
    """Compare every decoded TARGET sample with the sidecar and probe the MicL scale.

    For each geophone channel the overlap of decoded and sidecar counts is
    scored for exact matches, and the first mismatch is printed with
    surrounding context.  For MicL, the ratio sidecar/decoded over the
    first 50 samples (skipping decoded zeros) is averaged and printed.
    """
    buf = TARGET.read_bytes()
    sc = load_sidecar_samples(TXT)
    body = buf[0x0f1f:]
    decoded = decode_waveform_v2(body)
    if not decoded:
        print("decode_waveform_v2 returned no data")
        return
    print(f"Sidecar lengths: Tran={len(sc['Tran'])} Vert={len(sc['Vert'])} Long={len(sc['Long'])} MicL={len(sc['MicL'])}")
    print(f"Decoded lengths: Tran={len(decoded['Tran'])} Vert={len(decoded['Vert'])} Long={len(decoded['Long'])} MicL={len(decoded['MicL'])}")
    print()
    GEO_LSB = 0.0003  # in/s per count
    for ch in ("Tran", "Vert", "Long"):
        sc_counts = [int(round(v / GEO_LSB)) for v in sc[ch]]
        dec = decoded[ch]
        n = min(len(sc_counts), len(dec))
        if n == 0:
            # Avoid dividing by zero when either side has no samples.
            print(f"{ch}: compared 0, nothing to score")
            continue
        matches = sum(1 for i in range(n) if sc_counts[i] == dec[i])
        first_mismatch = next((i for i in range(n) if sc_counts[i] != dec[i]), None)
        print(f"{ch}: compared {n}, exact matches {matches} ({100*matches/n:.2f}%)")
        if first_mismatch is not None:
            i = first_mismatch
            # Label the context with the bounds the slice really uses.
            lo, hi = max(0, i - 2), i + 5
            print(f" first mismatch at idx {i}: sidecar={sc_counts[i]} ({sc[ch][i]}), decoded={dec[i]}")
            print(f" context sidecar[{lo}:{hi}]: {sc_counts[lo:hi]}")
            print(f" context decoded[{lo}:{hi}]: {dec[lo:hi]}")
    # MicL: find the multiplicative factor that fits
    print()
    print("=== MicL scale analysis ===")
    sc_micl = sc["MicL"]
    dec_micl = decoded["MicL"]
    # Skip zero values when computing ratio
    window = min(50, len(sc_micl), len(dec_micl))
    ratios = [sc_micl[i] / dec_micl[i] for i in range(window) if dec_micl[i] != 0]
    if ratios:
        avg = sum(ratios) / len(ratios)
        print(f" avg ratio sidecar/decoded over first 50 nonzero: {avg:.4e} (n={len(ratios)})")
        print(f" ratios sample: {[f'{r:.4e}' for r in ratios[:6]]}")
if __name__ == "__main__":
main()
+62 -5
View File
@@ -6,11 +6,68 @@ Series IV event-file format. Sibling to
Series III "Rosetta Stone") — this doc holds what we know so far and Series III "Rosetta Stone") — this doc holds what we know so far and
the open questions still to crack. the open questions still to crack.
**Status (2026-05-20):** ASCII text sidecar fully decoded (1,014 **Status (2026-05-28):** ASCII text sidecar fully decoded (1,014
sample files round-trip). Binary `.IDFH` / `.IDFW` codec sample files round-trip). **Thor IDFW** binary now decodes via
**not yet implemented** — binaries are stored opaquely by `micromate.idf_file.read_idf_file()` — reuses the BW segment-rotated
`WaveformStore.save_imported_idf`, with metadata sourced from the block codec verbatim at fixed body offset `0x0f1f`; metadata (serial,
paired `.txt` sidecar. timestamp, sample_rate, record_time, calibration_date) extracted from
the binary header. Sample fidelity is 87–99% byte-exact on quiet
events; loud events hit the BW codec's known walker-stops-early
limitation. Residual ~3% drift on per-sample deltas (likely a
Thor-specific 12-bit delta refinement not yet modelled).
**Thor IDFH histograms also decoded.** Body has one or more segments;
each 10-byte segment header `[length_be 2B][0a 00 00 00][00 NN][05 3f]`
introduces `N = (length - 10) // 72` interval records of 72 bytes
each. Each interval = 4 × 16-byte per-channel records, followed by an
8-byte tail:
`[int16 min][int16 max][int16 ??][uint16 halfp][2B 00][uint16 ??][2B 00][uint16 ??]`.
Geo peak `= max(|min|, |max|) / 32768 × 10` in/s (matches sidecar
within ~1.8%); freq `= 512 / halfp` Hz (None for halfp ≤ 5 → ">100"
sentinel). Corpus: **all 859 Thor IDFH files decode, 181,071
intervals**. Wired through `read_idf_file()` →
`save_imported_idf()` → sidecar's `extensions.idf_intervals`.
**Note on the BE9439 outliers in the example corpus:** Two files
(`BE9439_20200713131747.IDFW` and `BE9439_20200713124251.IDFH`) are
**Series III Blastware** binaries, not Thor. Provenance: TMI tried
to use Thor to manage auto-call-homes for Series III units; the
experiment didn't work out, but it did leave a few BW event files
in Thor's per-serial directory structure with `.IDFW`/`.IDFH`
extensions — Thor's forwarder applied its own naming convention to
the BW bodies it was relaying. Their header `10 00 01 80 00 00
Instantel STRT ff fe <end_key> <start_key>` is the BW SUB 5A STRT
record, not a Thor body preamble. The reader detects them by
signature and raises `NotImplementedError` pointing callers at
`read_blastware_file()`, which extracts BW-format peaks from them.
**Still NYI for Thor IDFH:** per-channel `int16 field4` (possibly
time-of-peak); the two uint16 fields (probably PVS contributions);
8-byte interval tail (PVS data); mic dB(L) exact conversion constant.
### Codec breakthroughs (2026-05-28)
- **Body offset is a fixed `0x0f1f`** across 151/154 corpus IDFW
files. Preceded by a 4-byte record-type marker (`46 00 00 00`)
+ magic preamble `00 02 00 [Tran[0] BE] [Tran[1] BE]`.
- **Sample stream is BW's segment-rotated block codec verbatim.**
Thor reuses `10 NN` (nibble), `20 NN` (int8), `00 NN` (RLE),
`30 NN` (packed12), `40 02` (segment header) tags with the same
semantics. Channel rotation Tran→Vert→Long→MicL.
- **Geo LSB = 0.0003 in/s** (not BW's 0.005), because Thor's 16-bit
ADC range maps to 10 in/s without the 16-count BW quantization step.
- **Mic ≈ 2.14×10⁻⁶ psi/count** (rough scale; refine after channel
block calibration constants are decoded).
- **BW compliance anchor `\xbe\x80\x00\x00\x00\x00` reappears at
IDFW offset 0x952** — sample_rate at anchor-6 (uint16 BE),
record_time at anchor+6 (float32 BE), same layout as BW.
- **Event timestamp at offset 0x97A** — 8 bytes `[day][month]
[year_be][unk][hour][min][sec]`. Stop-time mirrors at 0x982.
- **Serial as null-terminated ASCII at 0x14E**.
- **Calibration date** at 0x194–0x197 (day, month, year_be).
- Per-sample residual drift of ~3% suggests Thor encodes int8/nibble
deltas with an extra refinement bit that BW doesn't carry —
unsolved; errors resync within a few samples so cumulative impact
is small.
--- ---
+434 -48
View File
@@ -1,64 +1,450 @@
""" """
micromate/idf_file.py placeholder for the Thor IDF binary codec. micromate/idf_file.py Thor IDF binary codec.
Thor's ``.IDFH`` (histogram) and ``.IDFW`` (waveform) event files are an Decodes the Instantel Micromate Series IV ``.IDFW`` (waveform) and
Instantel proprietary binary format that has not yet been reverse- ``.IDFH`` (histogram) binary on-disk format. Sister module to
engineered. Today seismo-relay treats them as opaque blobs: ``minimateplus/event_file_io.py``.
``WaveformStore.save_imported_idf`` stores the bytes verbatim and reads
all device-authoritative metadata from the paired ``.IDFW.txt`` /
``.IDFH.txt`` ASCII sidecar (parsed by ``idf_ascii_report.py``).
When we crack the binary codec same reverse-engineering playbook we Status (2026-05-28):
used to byte-perfect-parse Series III BW files (see
``docs/instantel_protocol_reference.md`` and ``minimateplus/event_file_io.py``)
this module will grow:
- ``read_idf_file(path) -> IdfEvent`` - **Genuine Series IV / Thor binaries** are all signed
Parse a ``.IDFW``/``.IDFH`` binary and return a fully populated ``00 12 01 00 00 00 Instantel\\0`` (sig-A in earlier notes). Two
``IdfEvent`` whose waveform-sample arrays come from the binary Series III (Blastware) binaries appear in the example corpus
(the .txt sidecar's tabular sample block being a best-effort (``BE9439_*``) they share the ``.IDFW``/``.IDFH`` extension by
check). Lets us ingest Thor events even when the operator filing convention but carry a BW STRT header (``10 00 01 80 00 00
hasn't enabled the .txt exporter — closing the Instantel STRT...``) and are NOT Thor data. The reader detects
``had_report=False`` gap that the thor-watcher forwarder them by signature and raises NotImplementedError pointing callers
currently tolerates as a known limitation. at ``minimateplus.event_file_io.read_blastware_file()``.
- **IDFW waveform body** reuses the BW segment-rotated block codec
verbatim. Body always starts at file offset ``0x0f1f``. Samples
decoded via ``minimateplus.waveform_codec.decode_waveform_v2``
with 87–99% byte-exact match against ``.IDFW.txt`` sidecar (quiet
events). Loud events hit the BW codec's known walker-stops-early
limit. Residual ~3% drift on per-sample deltas likely a
Thor-specific 12-bit delta refinement that BW's codec doesn't
model. Geo LSB = 0.0003 in/s; mic factor ~2.14e-6 psi/count.
- **IDFH histogram body**: 10-byte segment header
``[len_be 2B] 0a 00 00 00 [00 NN_counter] 05 3f`` introduces a
segment of ``N`` 72-byte interval records (``N = (len - 10) // 72``).
Each record holds 4 × 16-byte per-channel min/max/halfp + 8-byte
tail. Geo peaks via ``max(|min|, |max|) / 32768 × 10`` in/s
(matches sidecar within ~1.8%), freq via ``512 / halfp`` Hz.
**All 859 Thor IDFH files in the corpus decode (181,071 intervals).**
- Binary metadata directly extracted: serial, timestamp, sample_rate,
record_time, calibration_date. Other fields fall back to the paired
``.IDFW.txt`` / ``.IDFH.txt`` sidecar (consumed by
``WaveformStore.save_imported_idf``).
- ``write_idf_file(path, event)`` (eventually) The full reverse-engineering writeup lives in
Round-trip event reconstruction, used for verifying the codec ``docs/idf_protocol_reference.md``.
against captured device files the way ``write_blastware_file``
verifies the Series III codec.
- Helpers for decoding the binary's per-channel sample arrays into
physical units, the per-event flash buffer's monitor-log records,
etc.
The reverse-engineering path: pair every ``.IDFW`` binary in
``thor-watcher/example-data/`` with its sibling ``.IDFW.txt``, treating
the txt's "Waveform Data Channels" block as ground-truth, and align
the binary's per-channel int16-or-similar arrays against it. Header
fields (sample rate, channel count, record time, timestamps) sit before
the sample block same approach as the BW codec where ASCII strings
inside the binary (``Project:``, ``Client:``, etc.) anchored field
discovery.
""" """
from __future__ import annotations from __future__ import annotations
import datetime
import struct
from dataclasses import dataclass
from pathlib import Path from pathlib import Path
from typing import Union from typing import Optional, Union
from .models import IdfEvent from minimateplus.waveform_codec import decode_waveform_v2
from .models import IdfEvent, IdfPeaks, IdfReport
def read_idf_file(path: Union[str, Path]) -> "IdfEvent": # Genuine Series IV / Thor IDF binary signature: 6 bytes, then ASCII "Instantel".
"""Parse a Thor ``.IDFW``/``.IDFH`` binary into an ``IdfEvent``. _THOR_PREFIX = b"\x00\x12\x01\x00\x00\x00"
# Stray Series III (Blastware) binaries that occasionally turn up in Thor
# corpus directories renamed to the .IDFW/.IDFH convention. Their header
# (`10 00 01 80 00 00 Instantel STRT ...`) is byte-for-byte a BW SUB 5A
# STRT record, not a Thor binary. Detected so we can refuse-and-route
# rather than mis-parse.
_BW_STRAY_PREFIX = b"\x10\x00\x01\x80\x00\x00"
_INSTANTEL_TAG = b"Instantel"
Not yet implemented. When implemented, this will be the canonical # Constant body offset for sig-A IDFW files (verified across 151/154 corpus
entry point for reading Thor binaries the ASCII sidecar parser # files in tests/fixtures/THORDATA_example). The body is the segment-rotated
becomes an optional fast-path metadata supplement rather than the # block stream consumed by decode_waveform_v2; bytes [0:3] are the magic
sole source of device-authoritative data. # ``00 02 00`` preamble.
_BODY_START_SIG_A = 0x0F1F
# Geophone count → in/s, derived from sidecar ground truth: the smallest
# non-zero sample in 1,014-file corpus is 0.0003 in/s.
_GEO_LSB_IPS = 0.0003
# Microphone count → psi, derived from sidecar regression on 50 sample
# pairs from UM11719_20231219162723.IDFW (mic-heavy event).
_MIC_LSB_PSI = 2.14e-6
# IDFH histogram constants.
_IDFH_INTERVAL_SIZE = 72 # bytes per per-interval record
_IDFH_SEGMENT_HEADER = 10 # bytes: [len_be 2B][0a 00 00 00 4B][00 NN 2B][05 3f 2B]
_IDFH_SEGMENT_TAIL = 2 # bytes after the interval data block, before next marker
_IDFH_HALFP_FREQ_NUM = 512.0 # freq_hz = NUM / halfp; halfp ≤ 5 means ">100 Hz" sentinel
_IDFH_GEO_FULL_SCALE = 10.0 # in/s — Normal range
_IDFH_INT16_FS = 32768.0
_IDFH_CHANNELS = ("Tran", "Vert", "Long", "MicL")
# ─── Binary metadata extraction ─────────────────────────────────────────────
@dataclass
class IdfBinaryMetadata:
    """Fields recoverable from the sig-A binary header (no .txt needed).

    Every field defaults to ``None``. :func:`extract_binary_metadata`
    fills in only the values that pass its sanity checks, so ``None``
    means "not found or not plausible", never "zero".
    """

    # Unit serial, read as null-terminated ASCII at file offset 0x14E.
    serial: Optional[str] = None
    # Event time decoded from the 8-byte timestamp record (naive datetime).
    event_datetime: Optional[datetime.datetime] = None
    # Sample rate; only 256/512/1024/2048/4096 are accepted by the extractor.
    sample_rate: Optional[int] = None
    # Record time in seconds; the extractor accepts 0.1 to 600.0 only.
    record_time_sec: Optional[float] = None
    # Calibration date read from file offsets 0x194-0x197.
    calibration_date: Optional[datetime.date] = None
def _read_ascii_z(buf: bytes, off: int, maxlen: int = 64) -> Optional[str]:
    """Read a NUL-terminated ASCII string from ``buf`` starting at ``off``.

    At most ``maxlen`` bytes are examined. If no NUL is found in that
    window the string runs to the end of the window (or of the buffer,
    whichever comes first). Non-ASCII bytes decode as the replacement
    character, and surrounding whitespace is stripped.

    Returns ``None`` when ``off`` is at or past the end of the buffer, or
    when the resulting string is empty.
    """
    if off >= len(buf):
        return None
    window_end = off + maxlen
    nul_at = buf.find(b"\x00", off, window_end)
    stop = nul_at if nul_at >= 0 else min(window_end, len(buf))
    text = buf[off:stop].decode("ascii", errors="replace").strip()
    return text if text else None
def _decode_8byte_timestamp(buf: bytes, off: int) -> Optional[datetime.datetime]:
    """Decode the 8-byte timestamp record at ``off``.

    Layout: ``[day][month][year_hi][year_lo][unknown][hour][min][sec]``.

    Returns a naive ``datetime``. Returns ``None`` when the record would
    run past the end of the buffer, when any field falls outside its
    plausible range (year 2015-2050), or when the fields do not form a
    real calendar date.
    """
    end = off + 8
    if end > len(buf):
        return None
    day, month, year_hi, year_lo, _unknown, hour, minute, second = buf[off:end]
    year = (year_hi << 8) | year_lo
    # (value, lowest allowed, highest allowed) for each field.
    limits = (
        (year, 2015, 2050),
        (month, 1, 12),
        (day, 1, 31),
        (hour, 0, 23),
        (minute, 0, 59),
        (second, 0, 59),
    )
    if any(value < low or value > high for value, low, high in limits):
        return None
    try:
        return datetime.datetime(year, month, day, hour, minute, second)
    except ValueError:
        # In-range fields that still aren't a real date (e.g. 31 February).
        return None
def extract_binary_metadata(buf: bytes) -> IdfBinaryMetadata:
    """Pull serial/timestamp/sample_rate/record_time/calibration from the
    sig-A binary header.

    Field positions confirmed against UM11719_20231219162723.IDFW; stable
    across the 151-file sig-A corpus.

    Each field is validated on its own. A field that is missing, out of
    range, or past the end of ``buf`` is left as ``None`` in the returned
    :class:`IdfBinaryMetadata`; this function does not raise for short or
    implausible input.
    """
    md = IdfBinaryMetadata()

    # Serial: null-terminated ASCII at 0x14E.
    md.serial = _read_ascii_z(buf, 0x14E, maxlen=16)

    # Sample rate + record time live in a BW-compatible compliance block.
    # Locate the 6-byte anchor `be 80 00 00 00 00` and read offsets relative
    # to it: anchor-6 = sample_rate uint16 BE; anchor+6 = record_time float32 BE.
    anchor = buf.find(b"\xbe\x80\x00\x00\x00\x00", 0x800, 0xA00)
    if anchor != -1:
        # The search starts at 0x800, so anchor - 6 can never go negative
        # and this slice is always exactly two bytes.
        sr = int.from_bytes(buf[anchor - 6 : anchor - 4], "big")
        if sr in (256, 512, 1024, 2048, 4096):
            md.sample_rate = sr

        # The float may be cut off if the file ends right after the anchor.
        rt_bytes = buf[anchor + 6 : anchor + 10]
        if len(rt_bytes) == 4:
            (rt,) = struct.unpack(">f", rt_bytes)
            # NaN fails both comparisons, so it is rejected here too.
            if 0.1 <= rt <= 600.0:
                md.record_time_sec = float(rt)

    # Event timestamp: 8 bytes. Position differs between IDFW (0x97A) and
    # IDFH (0x9F8); try both fixed offsets and accept the first valid decode.
    for off in (0x97A, 0x9F8):
        ts = _decode_8byte_timestamp(buf, off)
        if ts is not None:
            md.event_datetime = ts
            break

    # Calibration date: day, month, year_be at 0x194-0x197.
    if len(buf) > 0x197:
        day, mon = buf[0x194], buf[0x195]
        year = int.from_bytes(buf[0x196 : 0x198], "big")
        if 1 <= mon <= 12 and 1 <= day <= 31 and 2015 <= year <= 2050:
            try:
                md.calibration_date = datetime.date(year, mon, day)
            except ValueError:
                # In-range fields that are not a real date (e.g. 30 Feb).
                pass

    return md
# ─── Sample decoder + unit conversion ───────────────────────────────────────
def _decode_waveform_samples(buf: bytes) -> Optional[dict]:
    """Decode samples from the sig-A body starting at file offset 0x0f1f.

    Hands everything from ``_BODY_START_SIG_A`` to the end of ``buf`` to
    ``decode_waveform_v2`` and returns its result unchanged. Callers in
    this module treat that result as a dict of raw decoder counts keyed
    by channel name. Geo LSB = 0.0003 in/s; mic is in its own count unit
    (see :func:`mic_count_to_psi`).

    Returns None only when ``buf`` is too short to hold a body (fewer
    than 8 bytes past the body offset). Exceptions raised by
    ``decode_waveform_v2`` are not caught here and propagate to the
    caller.
    """
    if len(buf) < _BODY_START_SIG_A + 8:
        return None
    body = buf[_BODY_START_SIG_A:]
    return decode_waveform_v2(body)
def geo_count_to_ips(count: int) -> float:
    """Convert a Thor geo decoder count to in/s. LSB = 0.0003 in/s.

    The sign of ``count`` is preserved; this is a plain multiplication by
    ``_GEO_LSB_IPS``.
    """
    return count * _GEO_LSB_IPS
def mic_count_to_psi(count: int) -> float:
    """Convert a Thor mic decoder count to psi.

    Scale derived from regression over 50 sample pairs in
    UM11719_20231219162723.IDFW; consistent to ~5%. Calibration
    constants from the channel block can refine this once decoded.

    The sign of ``count`` is preserved; this is a plain multiplication by
    ``_MIC_LSB_PSI``.
    """
    return count * _MIC_LSB_PSI
# ─── IDFH histogram decoder ─────────────────────────────────────────────────
@dataclass
class IdfhInterval:
    """One decoded histogram interval (typically one minute of monitoring).

    Holds, for each of the four channels, the minimum and maximum ADC
    counts and the half-period value read from the 72-byte record.
    Channel names passed to the methods are lower-cased before lookup,
    so ``"Tran"``, ``"Vert"``, ``"Long"`` and ``"MicL"`` all work.
    """

    offset: int  # file byte offset of the 72-byte record

    # Per channel: min / max ADC counts (int16 BE) and half-period samples.
    tran_min: int
    tran_max: int
    tran_halfp: int
    vert_min: int
    vert_max: int
    vert_halfp: int
    long_min: int
    long_max: int
    long_halfp: int
    micl_min: int
    micl_max: int
    micl_halfp: int

    def _channel_field(self, channel: str, suffix: str) -> int:
        """Return the ``<channel>_<suffix>`` attribute for ``channel``."""
        return getattr(self, f"{channel.lower()}_{suffix}")

    def peak_count(self, channel: str) -> int:
        """Return the peak magnitude in counts: ``max(|min|, |max|)``."""
        lowest = self._channel_field(channel, "min")
        highest = self._channel_field(channel, "max")
        return max(abs(lowest), abs(highest))

    def peak_ips(self, channel: str) -> float:
        """Convert peak count to in/s (geo channels only)."""
        return self.peak_count(channel) / _IDFH_INT16_FS * _IDFH_GEO_FULL_SCALE

    def freq_hz(self, channel: str) -> Optional[float]:
        """Return ``512 / halfp`` in Hz, or None when ``halfp`` is 5 or less.

        A half-period of 5 or less is the ">100 Hz" sentinel, matching the
        sidecar convention.
        """
        half_period = self._channel_field(channel, "halfp")
        return _IDFH_HALFP_FREQ_NUM / half_period if half_period > 5 else None
def _decode_idfh_interval(buf72: bytes, offset: int) -> IdfhInterval:
    """Decode one 72-byte interval record into per-channel min/max/halfp.

    The record holds four consecutive 16-byte channel blocks in the order
    Tran, Vert, Long, MicL. Only the first eight bytes of each block are
    decoded:

    * bytes 0-1: int16 BE minimum
    * bytes 2-3: int16 BE maximum
    * bytes 4-5: int16 BE, role unknown (possibly time-of-peak); ignored
    * bytes 6-7: uint16 BE half-period

    Bytes 10-11 and 14-15 of each block are uint16 BE with unknown
    semantics (likely sum / count contributions for the PVS computation).
    The 8-byte tail (``buf72[64:72]``) carries PVS-related data that is not
    yet decoded.

    Args:
        buf72: The raw interval record.
        offset: File byte offset of the record, stored on the result.

    Raises:
        struct.error: If ``buf72`` is too short to hold all four blocks.
    """
    values = {}
    for index, prefix in enumerate(("tran", "vert", "long", "micl")):
        # One unpack per channel block; the third field is the unknown int16.
        low, high, _unknown, half_period = struct.unpack_from(
            ">hhhH", buf72, index * 16
        )
        values[f"{prefix}_min"] = low
        values[f"{prefix}_max"] = high
        values[f"{prefix}_halfp"] = half_period
    # Keyword construction keeps this independent of dataclass field order.
    return IdfhInterval(offset=offset, **values)
def decode_idfh_body(buf: bytes) -> list:
    """Walk an IDFH file and decode every interval record.

    The body has one or more segments; each segment header is 10 bytes:
    ``[length_be 2B][0a 00 00 00][00 NN_counter][05 3f]``. ``length`` is
    measured from the start of the length field through the end of the
    interval block (= 10 + 72 × n_intervals). Each segment is followed by
    a 2-byte tail before the next segment's length field.

    The walker searches for the ``0a 00 00 00`` magic and validates the
    bytes around it. Candidates that fail validation are skipped one byte
    at a time, so stray occurrences of the magic inside other data do not
    stop the scan. A segment whose declared length runs past the end of
    the buffer yields only the records that are fully present.

    Confirmed against the 859-file corpus (181,071 intervals decoded; 1
    failure is the sig-B BE9439 file).

    Returns:
        A list of :class:`IdfhInterval`, in file order. Empty when no
        valid segment header is found.
    """
    intervals: list = []
    size = len(buf)
    i = 0
    while True:
        j = buf.find(b"\x0a\x00\x00\x00", i)
        if j < 0:
            break
        # A real header needs 2 length bytes before the magic and 4 bytes
        # (counter + `05 3f`) after it. Checking the bounds first keeps
        # `buf[j + 4]` from raising IndexError on a match near EOF, and
        # lets a match at offset 0/1 be skipped instead of ending the scan.
        if (
            j < 2
            or j + 8 > size
            or buf[j + 4] != 0x00
            or buf[j + 6 : j + 8] != b"\x05\x3f"
        ):
            i = j + 1
            continue
        length = int.from_bytes(buf[j - 2 : j], "big")
        n = (length - _IDFH_SEGMENT_HEADER) // _IDFH_INTERVAL_SIZE
        if n <= 0:
            # Length too small to hold even one interval: not a segment.
            i = j + 1
            continue
        header_start = j - 2
        interval_start = header_start + _IDFH_SEGMENT_HEADER
        for k in range(n):
            off = interval_start + k * _IDFH_INTERVAL_SIZE
            if off + _IDFH_INTERVAL_SIZE > size:
                # Truncated file: keep what decoded so far.
                break
            chunk = buf[off : off + _IDFH_INTERVAL_SIZE]
            intervals.append(_decode_idfh_interval(chunk, off))
        # Advance past this segment + the 2-byte tail.
        i = header_start + length + _IDFH_SEGMENT_TAIL
    return intervals
# ─── Top-level reader ───────────────────────────────────────────────────────
@dataclass
class IdfReadResult:
    """Return type for :func:`read_idf_file`.

    For waveforms (``.IDFW``), ``samples`` holds the per-channel sample
    arrays in Thor decoder counts. For histograms (``.IDFH``),
    ``samples`` is empty and ``intervals`` holds the per-interval
    record list (peaks, freqs).
    """

    # Typed event built from the binary header plus the decoded peaks.
    event: IdfEvent
    samples: dict  # {"Tran": [...], ...} for IDFW; empty for IDFH
    # Header fields as extracted, before any "UNKNOWN"/epoch fallbacks
    # are applied to ``event``.
    binary_metadata: IdfBinaryMetadata
    signature: str  # always "thor" for now (sig-A genuine Thor)
    intervals: Optional[list] = None  # list[IdfhInterval] for IDFH; None for IDFW
def _build_idf_read_result(
    *,
    filename: str,
    kind: str,
    event_type: str,
    md: IdfBinaryMetadata,
    peaks: IdfPeaks,
    samples: dict,
    signature: str,
    intervals: Optional[list] = None,
) -> IdfReadResult:
    """Assemble the report, event and result shared by both file kinds."""
    rep = IdfReport(
        serial_number=md.serial,
        event_type=event_type,
        event_datetime=md.event_datetime,
        filename=filename,
        sample_rate=md.sample_rate,
        record_time_sec=md.record_time_sec,
    )
    event = IdfEvent(
        # Fall back to placeholders so the event is always constructible;
        # the untouched values stay available on ``binary_metadata``.
        serial=md.serial or "UNKNOWN",
        timestamp=md.event_datetime or datetime.datetime(1970, 1, 1),
        kind=kind,
        filename=filename,
        sample_rate=md.sample_rate,
        record_time_sec=md.record_time_sec,
        peaks=peaks,
        report=rep,
    )
    return IdfReadResult(
        event=event,
        samples=samples,
        binary_metadata=md,
        signature=signature,
        intervals=intervals,
    )


def read_idf_file(path: Union[str, Path]) -> IdfReadResult:
    """Parse a Thor ``.IDFW`` or ``.IDFH`` binary.

    The file kind is chosen by extension: ``.IDFH`` (case-insensitive) is
    decoded as a histogram, anything else as a waveform.

    * Waveform: ``samples`` holds the per-channel decoder counts and
      ``intervals`` is None. Geo peaks are the largest absolute count per
      channel converted with :func:`geo_count_to_ips`.
    * Histogram: ``samples`` is empty and ``intervals`` holds the decoded
      :class:`IdfhInterval` records. Geo peaks are the largest per-interval
      peak on each channel.

    Peak vector sum and the mic peak are left as None in both cases. The
    caller converts int sample counts to physical units via
    :func:`geo_count_to_ips` / :func:`mic_count_to_psi`.

    Returns:
        An :class:`IdfReadResult`.

    Raises:
        ValueError: The file lacks the Instantel magic, carries an unknown
            signature prefix, is a histogram with no decodable intervals,
            or is a waveform too short to contain a body.
        NotImplementedError: The file carries a Series III (Blastware)
            STRT header; use
            ``minimateplus.event_file_io.read_blastware_file()`` instead.
    """
    p = Path(path)
    buf = p.read_bytes()

    if len(buf) < 16 or buf[6:16] != _INSTANTEL_TAG + b"\x00":
        raise ValueError(f"{p.name}: not an IDF file (missing Instantel magic)")

    sig_prefix = buf[:6]
    if sig_prefix == _BW_STRAY_PREFIX:
        raise NotImplementedError(
            f"{p.name}: file has a Series III (Blastware) STRT header in "
            "an IDF-named container — not a Thor binary.  Route through "
            "minimateplus.event_file_io.read_blastware_file() instead "
            "(peaks decode; samples & full metadata don't, but it's not "
            "Thor data so the Thor codec doesn't apply)."
        )
    if sig_prefix != _THOR_PREFIX:
        raise ValueError(f"{p.name}: unknown IDF signature {sig_prefix.hex()}")
    signature = "thor"

    is_histogram = p.suffix.upper() == ".IDFH"
    md = extract_binary_metadata(buf)

    if is_histogram:
        intervals = decode_idfh_body(buf)
        if not intervals:
            raise ValueError(f"{p.name}: IDFH body decoded no intervals")

        # Peaks: max across all intervals on each channel (per-channel max
        # of stored max-magnitudes; sidecar's PPV row carries the same).
        peaks = IdfPeaks(
            transverse_ips=max((iv.peak_ips("Tran") for iv in intervals), default=0.0),
            vertical_ips=max((iv.peak_ips("Vert") for iv in intervals), default=0.0),
            longitudinal_ips=max((iv.peak_ips("Long") for iv in intervals), default=0.0),
            peak_vector_sum_ips=None,
            mic_pspl_dbl=None,
        )
        return _build_idf_read_result(
            filename=p.name,
            kind="Histogram",
            event_type="Full Histogram",
            md=md,
            peaks=peaks,
            samples={},
            signature=signature,
            intervals=intervals,
        )

    # Waveform path.
    decoded = _decode_waveform_samples(buf)
    if decoded is None:
        raise ValueError(f"{p.name}: waveform body codec failed")

    def _peak_ips(ch: str) -> float:
        arr = decoded.get(ch, [])
        return geo_count_to_ips(max((abs(v) for v in arr), default=0))

    peaks = IdfPeaks(
        transverse_ips=_peak_ips("Tran"),
        vertical_ips=_peak_ips("Vert"),
        longitudinal_ips=_peak_ips("Long"),
        # PVS requires aligned per-sample √(T²+V²+L²); leave None — the
        # sidecar carries it and the bridge picks it up if present.
        peak_vector_sum_ips=None,
        mic_pspl_dbl=None,
    )
    return _build_idf_read_result(
        filename=p.name,
        kind="Waveform",
        event_type="Full Waveform",
        md=md,
        peaks=peaks,
        samples=decoded,
        signature=signature,
    )
+135 -23
View File
@@ -467,21 +467,21 @@ class WaveformStore:
Ingest a Thor (Micromate Series IV) IDF event file (`.IDFW` or Ingest a Thor (Micromate Series IV) IDF event file (`.IDFW` or
`.IDFH`) produced by Thor's TXT exporter. `.IDFH`) produced by Thor's TXT exporter.
Thor binaries are stored as opaque bytes seismo-relay doesn't
yet decode the proprietary IDF binary format (codec slot lives
at ``micromate/idf_file.py``). Device-authoritative metadata
comes from the paired ``.IDFW.txt`` / ``.IDFH.txt`` sidecar
when supplied.
Workflow: Workflow:
1. Parse the paired TXT report (when supplied) via 1. For sig-A `.IDFW` binaries, decode samples + binary metadata
``micromate.parse_idf_report`` dict. via ``micromate.idf_file.read_idf_file()``. Failure or
2. Wrap parsed dict + filename into a typed ``micromate.IdfEvent``. non-IDFW path falls through to the .txt-only flow.
3. Copy bytes verbatim into ``<root>/<serial>/<filename>``. 2. Parse the paired TXT report (when supplied) via
4. Bridge IdfEvent ``minimateplus.Event`` (for the existing ``micromate.parse_idf_report`` dict. TXT remains the
sidecar / DB insert machinery) via source of truth for fields the binary doesn't yet supply
``IdfEvent.to_minimateplus_event(waveform_key)``. (full peak set with ZC freq / Time of Peak, sensor self-check,
5. Write the ``.sfm.json`` sidecar with firmware string, project strings).
3. Wrap parsed dict + filename into a typed ``micromate.IdfEvent``.
4. Copy bytes verbatim into ``<root>/<serial>/<filename>``.
5. Bridge IdfEvent ``minimateplus.Event`` and attach
``raw_samples`` from the binary decoder (when available).
6. Write the `.h5` clean-waveform file when samples decoded.
7. Write the ``.sfm.json`` sidecar with
``source.kind = "idf-import"`` and the full raw IDF report ``source.kind = "idf-import"`` and the full raw IDF report
under ``extensions.idf_report``. under ``extensions.idf_report``.
@@ -490,7 +490,33 @@ class WaveformStore:
""" """
from micromate import IdfEvent, parse_idf_report from micromate import IdfEvent, parse_idf_report
# Parse the .txt sidecar (best-effort; non-fatal on failure). # 1. Binary decode (sig-A IDFW and IDFH). Non-fatal: any failure
# leaves samples / binary metadata unfilled and we proceed with
# the .txt path as before.
idf_samples: Optional[dict] = None
idf_intervals: Optional[list] = None
binary_md = None
binary_peaks = None
is_histogram = False
try:
from micromate.idf_file import read_idf_file
res = read_idf_file(source_path)
idf_samples = res.samples or None
idf_intervals = res.intervals
is_histogram = res.intervals is not None
binary_md = res.binary_metadata
binary_peaks = res.event.peaks
except NotImplementedError:
# sig-B — codec doesn't handle this yet.
pass
except Exception as exc:
log.warning(
"save_imported_idf: binary codec failed for %s: %s"
"falling back to .txt-only ingest",
source_path.name, exc,
)
# 2. Parse the .txt sidecar (best-effort; non-fatal on failure).
report_dict: dict = {} report_dict: dict = {}
if idf_report_text is not None: if idf_report_text is not None:
try: try:
@@ -501,7 +527,38 @@ class WaveformStore:
exc, exc,
) )
# Build the typed IdfEvent. Filename is authoritative for # 3. Backfill report_dict with binary metadata for fields the
# .txt didn't supply. The .txt sidecar wins whenever it already
# carries a value; the binary only fills fields that are missing
# or empty in the parsed report.
if binary_md is not None:
if binary_md.serial and not report_dict.get("serial_number"):
report_dict["serial_number"] = binary_md.serial
if binary_md.event_datetime and not report_dict.get("event_datetime"):
report_dict["event_datetime"] = binary_md.event_datetime
if binary_md.sample_rate and not report_dict.get("sample_rate"):
report_dict["sample_rate"] = binary_md.sample_rate
if binary_md.record_time_sec and not report_dict.get("record_time_sec"):
report_dict["record_time_sec"] = binary_md.record_time_sec
# Calibration date (binary) vs calibration text (.txt) cohabit
# under different keys; no overwrite needed.
if binary_md.event_datetime and not report_dict.get("event_type"):
report_dict["event_type"] = (
"Full Histogram" if is_histogram else "Full Waveform"
)
# Binary-derived peaks fill in when the .txt didn't supply them.
# They're ~3% low vs the device-authoritative .txt values (residual
# codec drift), so .txt always wins when present.
if binary_peaks is not None:
if binary_peaks.transverse_ips and not report_dict.get("tran_ppv"):
report_dict["tran_ppv"] = binary_peaks.transverse_ips
if binary_peaks.vertical_ips and not report_dict.get("vert_ppv"):
report_dict["vert_ppv"] = binary_peaks.vertical_ips
if binary_peaks.longitudinal_ips and not report_dict.get("long_ppv"):
report_dict["long_ppv"] = binary_peaks.longitudinal_ips
# 4. Build the typed IdfEvent. Filename is authoritative for
# (serial, timestamp, kind); the report's event_datetime takes # (serial, timestamp, kind); the report's event_datetime takes
# precedence over the filename timestamp inside from_report(). # precedence over the filename timestamp inside from_report().
idf_event = IdfEvent.from_report(report_dict, source_path.name) idf_event = IdfEvent.from_report(report_dict, source_path.name)
@@ -511,7 +568,7 @@ class WaveformStore:
# serial that overrides a misnamed export). # serial that overrides a misnamed export).
serial = serial_hint or idf_event.serial or "UNKNOWN" serial = serial_hint or idf_event.serial or "UNKNOWN"
# Filesystem write. # 5. Filesystem write of binary bytes.
filename = source_path.name filename = source_path.name
bw_path = self._serial_dir(serial) / filename bw_path = self._serial_dir(serial) / filename
bw_path.write_bytes(idf_bytes) bw_path.write_bytes(idf_bytes)
@@ -523,13 +580,41 @@ class WaveformStore:
# surrogate — every distinct binary maps to a distinct row. # surrogate — every distinct binary maps to a distinct row.
waveform_key = bytes.fromhex(sha256)[:16] waveform_key = bytes.fromhex(sha256)[:16]
# Bridge to minimateplus.Event for the existing sidecar / DB # 6. Bridge to minimateplus.Event for the existing sidecar / DB
# insert paths. See IdfEvent.to_minimateplus_event() for the # insert paths. See IdfEvent.to_minimateplus_event() for the
# caveats of this bridge (mic units, missing fields → sidecar). # caveats of this bridge (mic units, missing fields → sidecar).
ev = idf_event.to_minimateplus_event(waveform_key) ev = idf_event.to_minimateplus_event(waveform_key)
# Write the sidecar. Source kind "idf-import" was added to the # Attach the decoded sample arrays. Thor's decoder counts use
# allow-list in event_file_io.event_to_sidecar_dict for this. # LSB = 0.0003 in/s for geo (vs BW's 16-count units at 0.005 in/s)
# — the .h5 writer's geo_range="normal" yields LSB = 10/32768
# ≈ 0.000305 in/s, so plotted samples come out ~1.7% high.
# Acceptable known offset; refine with a Thor-aware h5 path later.
if idf_samples is not None:
ev.raw_samples = idf_samples
n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
ev.total_samples = ev.total_samples or n_samples
# 7. Write the .h5 clean-waveform file when we actually have samples.
# Histograms (IDFH) don't have waveform samples — skip h5 for those.
hdf5_filename: Optional[str] = None
if idf_samples is not None and not is_histogram:
hdf5_path = self.hdf5_path_for(serial, filename)
try:
event_hdf5.write_event_hdf5(
hdf5_path, ev,
serial=serial,
geo_range="normal", # Thor's geo full scale is also 10 in/s (Normal)
source_kind="idf-import",
)
hdf5_filename = hdf5_path.name
except Exception as exc:
log.warning(
"save_imported_idf: HDF5 write failed for %s: %s — continuing without .h5",
hdf5_path, exc,
)
# 8. Write the sidecar. Source kind "idf-import" is on the allow-list.
sidecar_path = self.sidecar_path_for(serial, filename) sidecar_path = self.sidecar_path_for(serial, filename)
existing_review = None existing_review = None
if sidecar_path.exists(): if sidecar_path.exists():
@@ -554,19 +639,46 @@ class WaveformStore:
# Time of Peak, sensor self-check, calibration, firmware). # Time of Peak, sensor self-check, calibration, firmware).
if report_dict: if report_dict:
sidecar["extensions"]["idf_report"] = report_dict sidecar["extensions"]["idf_report"] = report_dict
# For histograms, also stash the binary-decoded per-interval
# records so the UI / report layer doesn't need to re-walk the
# IDFH file at render time.
if idf_intervals is not None:
sidecar["extensions"]["idf_intervals"] = [
{
"offset": iv.offset,
"tran_peak": iv.peak_count("Tran"),
"tran_halfp": iv.tran_halfp,
"tran_freq": iv.freq_hz("Tran"),
"vert_peak": iv.peak_count("Vert"),
"vert_halfp": iv.vert_halfp,
"vert_freq": iv.freq_hz("Vert"),
"long_peak": iv.peak_count("Long"),
"long_halfp": iv.long_halfp,
"long_freq": iv.freq_hz("Long"),
"mic_peak": iv.peak_count("MicL"),
"mic_halfp": iv.micl_halfp,
"mic_freq": iv.freq_hz("MicL"),
}
for iv in idf_intervals
]
event_file_io.write_sidecar(sidecar_path, sidecar) event_file_io.write_sidecar(sidecar_path, sidecar)
log.info( log.info(
"WaveformStore.save_imported_idf serial=%s filename=%s filesize=%d " "WaveformStore.save_imported_idf serial=%s filename=%s filesize=%d "
"report_attached=%s", "kind=%s report_attached=%s binary_decoded=%s h5=%s intervals=%d",
serial, filename, filesize, bool(report_dict), serial, filename, filesize,
"histogram" if is_histogram else "waveform",
bool(report_dict),
(idf_samples is not None) or (idf_intervals is not None),
hdf5_filename or "(skipped)",
len(idf_intervals) if idf_intervals else 0,
) )
return ev, { return ev, {
"filename": filename, "filename": filename,
"filesize": filesize, "filesize": filesize,
"sha256": sha256, "sha256": sha256,
"a5_pickle_filename": None, "a5_pickle_filename": None,
"hdf5_filename": None, "hdf5_filename": hdf5_filename,
"sidecar_filename": sidecar_path.name, "sidecar_filename": sidecar_path.name,
"serial": serial, "serial": serial,
} }