diff --git a/CLAUDE.md b/CLAUDE.md index daa06f7..710371d 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -142,11 +142,22 @@ custom delta + RLE + variable-width codec. **Total: 47,364 ADC samples verified byte-exact, zero errors.** -### Production-code status +### Production-code status (updated 2026-05-11 late) -`client.py:_decode_a5_waveform` still uses the old (broken) int16 LE -decoder (see warning at the top of this section). `decode_waveform_v2()` -in `minimateplus/waveform_codec.py` returns `None` as a placeholder. +`client.py:_decode_a5_waveform` now uses the verified codec via +`waveform_codec.decode_a5_frames()` — which calls +`blastware_file.extract_body_bytes()` to reconstruct the BW-binary +body from A5 frames, then `decode_waveform_v2()` to decode samples, +then `decoded_to_adc_counts()` to scale to int16 ADC counts (geos × 16; +mic pass-through). The `.h5` sidecars SFM produces now contain +correct samples for any event without walker edge cases. + +The original int16 LE decoder is preserved as +`_decode_a5_waveform_LEGACY` for reference but is not called. + +MicL → dB(L) conversion utility: +`waveform_codec.mic_count_to_db(count)` — `count=±1 → ±81.94 dB`; +`count=813 → 140.14 dB` (matches BW display). ### Test fixtures diff --git a/docs/waveform_codec_re_status.md b/docs/waveform_codec_re_status.md index f274ad8..7aa1b7c 100644 --- a/docs/waveform_codec_re_status.md +++ b/docs/waveform_codec_re_status.md @@ -53,20 +53,32 @@ correct. ## What's still open -- **MicL channel** — anchor pair and delta decoding works in raw ADC - units (just like geo channels), but BW's ASCII export shows mic in - dB(L) with ~6 dB quantization steps. The ADC-counts → dB(L) - conversion isn't tested yet because the ASCII truth isn't directly - comparable. +- **Walker edge cases** — SP0/SS0/SV0 don't walk the full event. The + walker stops at a non-tag byte after a valid segment header (the + data section uses some block-length sub-rule for high-amplitude + segments that I haven't characterized). Lower priority since every + sample the walker reaches is decoded correctly — the loud events + still yield 5,000–15,000 byte-exact samples each. -- **Walker edge cases** — SP0/SS0/SV0 don't walk the full event due to - block-length quirks past the first few segments. Lower priority - since every sample reached is correct; the walker just needs robustness - improvements. +## What's now wired into production (2026-05-11 late) -- **Production code in `minimateplus/client.py:_decode_a5_waveform`** still - uses the broken legacy int16 LE decoder. Wiring `decode_waveform_v2` - into the `.h5` sidecar path is the obvious next follow-up. +- **`client.py:_decode_a5_waveform`** — now uses + `decode_a5_frames(a5_frames)` instead of the broken int16 LE decoder. + `event.raw_samples` is populated with int16 ADC counts that flow + through the existing `sfm/event_hdf5.py` scaling pipeline unchanged. + Legacy decoder is preserved as `_decode_a5_waveform_LEGACY` for + reference but is not called. + +- **MicL → dB(L) conversion** — exposed as + `waveform_codec.mic_count_to_db(count)`. Verified against BW + display values (count=1 → 81.94 dB; count=813 → 140.14 dB; matches + the V70 mic-heavy fixture exactly). + +- **`decode_a5_frames(a5_frames)`** — production entry point that + reconstructs the BW-binary body from A5 frames (via the new + `blastware_file.extract_body_bytes` helper) and runs the verified + codec. Returns the same `raw_samples` dict shape the consumers + already expect. ## What's solved diff --git a/minimateplus/blastware_file.py b/minimateplus/blastware_file.py index f99a44b..2784040 100644 --- a/minimateplus/blastware_file.py +++ b/minimateplus/blastware_file.py @@ -552,6 +552,105 @@ def classify_frame(frame: S3Frame) -> str: # ── Waveform file writer ─────────────────────────────────────────────────────────── +def extract_body_bytes(a5_frames): + """Reconstruct the Blastware-file body bytes from a list of A5 frames. + + Returns ``(strt, body, footer)`` where: + + - ``strt`` is the 21-byte STRT record from the probe frame (or a fallback + record built from minimal event metadata if STRT is missing). + - ``body`` is the variable-length sample-data section (between STRT and + the 26-byte file footer). Empty if no frames decode. + - ``footer`` is the 26-byte file footer. + + This is the same body-construction algorithm used by :func:`write_blastware_file` + — refactored out so the body decoder (``waveform_codec.decode_waveform_v2``) + can consume the same bytes without re-implementing the frame-walking logic. + + Returns ``(b"", b"", b"")`` if *a5_frames* is empty. + """ + if not a5_frames: + return (b"", b"", b"") + + # ── Extract STRT record from probe frame ───────────────────────────────── + w0_raw = bytes(a5_frames[0].data[7:]) + w0_stripped = _strip_inner_frame_dles(w0_raw) + strt_pos_stripped = w0_stripped.find(b"STRT") + + if strt_pos_stripped >= 0: + strt = bytes(w0_stripped[strt_pos_stripped : strt_pos_stripped + 21]) + + # Walk raw bytes to find the raw-domain end of the STRT (= body start). + target_stripped = strt_pos_stripped + 21 + stripped_so_far = 0 + raw_i = 0 + while stripped_so_far < target_stripped and raw_i < len(w0_raw): + if (w0_raw[raw_i] == 0x10 + and raw_i + 1 < len(w0_raw) + and w0_raw[raw_i + 1] in {0x02, 0x03, 0x04}): + raw_i += 2 + else: + raw_i += 1 + stripped_so_far += 1 + probe_skip = 7 + raw_i + else: + strt = b"STRT" + b"\xff\xfe" + bytes(14) + b"\x00" + probe_skip = 7 + 21 + + if len(strt) != 21: + return (b"", b"", b"") + + # Separate terminator from data frames. + term_idx: Optional[int] = None + if a5_frames and a5_frames[-1].page_key != 0x0010: + term_idx = len(a5_frames) - 1 + + if term_idx is not None: + body_frames = a5_frames[:term_idx] + term_frame = a5_frames[term_idx] + else: + body_frames = a5_frames + term_frame = None + + all_bytes = bytearray() + for fi, frame in enumerate(body_frames): + if fi == 0: + skip = probe_skip + elif fi in (1, 2): + skip = 13 # metadata pages + else: + skip = 12 # sample chunks + all_bytes.extend(_frame_body_bytes(frame, skip)) + + if term_frame is not None: + all_bytes.extend(_frame_body_bytes(term_frame, 11)) + + # Find the first valid `0e 08` footer marker. + footer_pos = -1 + pos = 0 + while True: + pos = bytes(all_bytes).find(b"\x0e\x08", pos) + if pos < 0 or pos + 26 > len(all_bytes): + break + yr = (all_bytes[pos + 4] << 8) | all_bytes[pos + 5] + if 2015 <= yr <= 2050: + footer_pos = pos + break + pos += 1 + + if footer_pos >= 0: + body = bytes(all_bytes[:footer_pos]) + footer = bytes(all_bytes[footer_pos : footer_pos + 26]) + elif len(all_bytes) >= 26: + body = bytes(all_bytes[:-26]) + footer = bytes(all_bytes[-26:]) + else: + body = bytes(all_bytes) + footer = b"" + + return (strt, body, footer) + + def write_blastware_file( event: Event, a5_frames: list[S3Frame], diff --git a/minimateplus/client.py b/minimateplus/client.py index 048ddac..d82604d 100644 --- a/minimateplus/client.py +++ b/minimateplus/client.py @@ -1500,22 +1500,69 @@ def _decode_a5_waveform( (BULK_WAVEFORM_STREAM) frame payloads and populate event.raw_samples, event.total_samples, event.pretrig_samples, and event.rectime_seconds. - This requires ALL A5 frames (stop_after_metadata=False), not just the - metadata-bearing subset. + Wired up 2026-05-11 to the verified ``decode_waveform_v2`` codec (see + ``minimateplus/waveform_codec.py`` and ``docs/waveform_codec_re_status.md``). + Replaces the legacy int16 LE decoder, which produced full-scale ±32K + noise on every event because the body bytes are encoded, not raw + samples. - ── Waveform format (confirmed from 4-2-26 blast capture) ─────────────────── - The blast waveform is 4-channel interleaved signed 16-bit little-endian, - 8 bytes per sample-set: + Output convention (preserved from the legacy decoder): + ``event.raw_samples`` is a dict with keys "Tran", "Vert", "Long", + "MicL" mapping to lists of **int16 ADC counts**. Multiply by + ``geo_range / 32768`` for geo channels to get in/s; use + :func:`minimateplus.waveform_codec.mic_count_to_db` for mic dB(L). + + ``total_samples`` / ``pretrig_samples`` / ``rectime_seconds`` are set + to ``None`` so the caller backfills from compliance_config (the + authoritative source — STRT fields aren't reliable). + """ + from .waveform_codec import decode_a5_frames + + event.total_samples = None + event.pretrig_samples = None + event.rectime_seconds = None + + if not frames_data: + log.debug("_decode_a5_waveform: no frames provided") + return + + decoded = decode_a5_frames(frames_data) + if decoded is None: + log.warning("_decode_a5_waveform: codec returned no samples") + return + + event.raw_samples = decoded + log.debug( + "_decode_a5_waveform: decoded %d/%d/%d/%d samples (T/V/L/M)", + len(decoded.get("Tran", [])), + len(decoded.get("Vert", [])), + len(decoded.get("Long", [])), + len(decoded.get("MicL", [])), + ) + + +def _decode_a5_waveform_LEGACY( + frames_data: list[S3Frame], + event: Event, +) -> None: + """ + LEGACY decoder — kept for reference only. DO NOT CALL. + + This is the int16 LE decoder that produced full-scale ±32K noise + on every event. Retracted 2026-05-08; replaced 2026-05-11 with + the verified codec in :mod:`minimateplus.waveform_codec`. See + ``docs/instantel_protocol_reference.md §7.6.1`` for the full history. + + ── Waveform format (LEGACY — WRONG) ──────────────────────────────── + Claimed 4-channel interleaved signed 16-bit little-endian, 8 bytes + per sample-set: [T_lo T_hi V_lo V_hi L_lo L_hi M_lo M_hi] × N - where T=Tran, V=Vert, L=Long, M=Mic. Channel ordering follows the - Blastware convention [Tran, Vert, Long, Mic] = [ch0, ch1, ch2, ch3]. + where T=Tran, V=Vert, L=Long, M=Mic. - ⚠️ Channel ordering is a confirmed CONVENTION — the physical ordering on - the ADC mux is not independently verifiable from the saturating blast - captures we have. The convention is consistent with Blastware labeling - (Tran is always the first channel field in the A5 STRT+waveform stream). + The body bytes are actually a tagged delta+RLE stream — this + interpretation was wrong. ── Frame structure ────────────────────────────────────────────────────────── A5[0] (probe response): diff --git a/minimateplus/waveform_codec.py b/minimateplus/waveform_codec.py index 5ae2383..c68097c 100644 --- a/minimateplus/waveform_codec.py +++ b/minimateplus/waveform_codec.py @@ -1,31 +1,35 @@ """ -waveform_codec.py — block-walker and partial decoder for the MiniMate Plus +waveform_codec.py — block-walker and verified decoder for the MiniMate Plus waveform-file body. -PARTIAL REVERSE-ENGINEERING — last updated 2026-05-11. +FULLY DECODED 2026-05-11. Every block type, every channel, and the +channel-rotation rule are verified byte-exact against BW's ASCII export +across the 9-event fixture bundle (47,364 ADC samples, zero errors). The Blastware waveform-file body — the bytes between the 21-byte STRT -record and the 26-byte file footer — is NOT raw int16 LE samples (the -historical assumption that produced full-scale ±32K noise on every -event). It is a tagged variable-length block stream with a custom -delta + RLE codec. +record and the 26-byte file footer — is a tagged variable-length block +stream with a custom delta + RLE codec. (Not raw int16 LE, which was +the historical wrong assumption that produced ±32K noise on every event.) Current status: -- Block framing: ✅ solved (block types and lengths all confirmed) -- Tran channel, segment 0: ✅ solved (decode_tran_initial returns - byte-exact values vs BW's ASCII export, across 5 of 5 loud-bundle - events; first ~510 samples per event) -- Multi-segment Tran continuation: ❌ open (every hypothesis breaks - at the segment-1 boundary around sample 512) -- Vert / Long / Mic channel decoders: ❌ open -- 30 NN block content: ❌ open (only appears in loud-from-start events) +- Block framing: ✅ solved (5 block types and lengths all confirmed) +- Per-channel decode: ✅ solved (Tran / Vert / Long / MicL all byte-exact) +- Channel rotation: ✅ Tran → Vert → Long → MicL per segment +- Segment header: ✅ fully decoded (anchor pair + prev-channel extension) +- 30 NN packed-delta block: ✅ NN × 12-bit signed deltas in NN/4 groups +- MicL → dB(L) conversion: ✅ ``mic_count_to_db`` matches BW display +- Production wiring: ✅ ``client.py:_decode_a5_waveform`` uses the new + codec (via ``decode_a5_frames``). ``.h5`` sidecars now render + correctly. -Production code in client.py still uses the broken int16 LE decoder. -``decode_waveform_v2`` here returns ``None`` as a placeholder. Callers -that need sample arrays should treat the legacy decoder's output as -"unverified" — the BW binary write path is the only sample-bearing -output that is currently trustworthy. +Known limitations: + +- Walker stops early on the loudest events (SP0, SS0, SV0, event-b) at + some mid-segment edge cases not yet fully characterized. Every + sample reached IS correct; the walker just doesn't reach all of + them yet. The cleanly-decoded subset is still ~5000–15000 samples + per loud event. ──────────────────────────────────────────────────────────────────────────── Body layout (CONFIRMED 2026-05-11 against 8 fixture events) @@ -132,6 +136,7 @@ and the suggested next experiment ("segment-channel scoring analyzer"). from __future__ import annotations +import math from dataclasses import dataclass from typing import List, Optional, Tuple @@ -446,6 +451,12 @@ def decode_waveform_v2(body: bytes) -> Optional[dict]: header = blocks[hi] if len(header.data) < 18: continue + # Validate: real segment headers have bytes [12:14] = `02 00`. + # Trailer/footer "40 02" markers contain ASCII serial bytes or other + # non-header data there and would otherwise be mis-interpreted as + # segment headers, adding spurious samples at the tail. + if header.data[12:14] != b"\x02\x00": + break # Extend the PREVIOUS channel by 2 more samples (deltas in bytes [0:4]). prev_d0 = int.from_bytes(header.data[0:2], "big", signed=True) prev_d1 = int.from_bytes(header.data[2:4], "big", signed=True) @@ -464,3 +475,88 @@ def decode_waveform_v2(body: bytes) -> Optional[dict]: last_value[channel] = apply_blocks(channel, c1, hi + 1, next_hi) return out + + +# ── ADC-scale conversion helpers ──────────────────────────────────────────── + + +# Scaling factor: decode_waveform_v2 produces geo-channel samples in the BW +# display quantization (16-count units, LSB = 0.005 in/s at Normal range). +# The legacy consumer pipeline (sfm/event_hdf5.py) expects raw_samples in +# 1-count ADC units (× full_scale / 32768 → physical). To plug the new +# decoder in without rewriting consumers, multiply geo values by 16. +# +# Mic samples are already in raw ADC counts (decoded value 1 = 1 mic ADC count +# = -81.94 dB on the BW display). Mic values pass through unchanged. +_GEO_DECODER_TO_ADC = 16 + + +def decoded_to_adc_counts(decoded: dict) -> dict: + """Convert :func:`decode_waveform_v2` output to int16 ADC counts. + + Geo channels are scaled by ×16 (decoder produces 16-count units, + consumer expects 1-count ADC). Mic is passed through as raw counts. + """ + if not decoded: + return {} + return { + "Tran": [v * _GEO_DECODER_TO_ADC for v in decoded.get("Tran", [])], + "Vert": [v * _GEO_DECODER_TO_ADC for v in decoded.get("Vert", [])], + "Long": [v * _GEO_DECODER_TO_ADC for v in decoded.get("Long", [])], + "MicL": list(decoded.get("MicL", [])), + } + + +def mic_count_to_db(count: int) -> float: + """Convert a MicL ADC count to dB(L) for BW-display-compatible output. + + Empirical formula (confirmed 2026-05-11 against V70 fixture: count=813 + → 140.1 dB; count=±1 → ±81.94 dB; count=±24 → ±109.5 dB): + + dB = sign(count) × (81.94 + 20 × log10(|count|)) for |count| ≥ 1 + dB = 0.0 for count == 0 + + The constant 81.94 corresponds to 10^(81.94/20) ≈ 12490 mic ADC counts + being the dB(L) reference level — almost certainly a calibration + constant from the device's mic. + """ + if count == 0: + return 0.0 + sign = 1.0 if count > 0 else -1.0 + return sign * (81.94 + 20.0 * math.log10(abs(count))) + + +# ── A5-frame entry point ──────────────────────────────────────────────────── + + +def decode_a5_frames(a5_frames) -> Optional[dict]: + """Decode a list of A5 (BULK_WAVEFORM_STREAM) frames into per-channel + int16 ADC samples. + + Returns ``{"Tran": [...], "Vert": [...], "Long": [...], "MicL": [...]}`` + with each channel's samples in **1-count ADC units** (the legacy + ``event.raw_samples`` convention — multiply by ``full_scale / 32768`` + to convert to physical units; for mic, use :func:`mic_count_to_db` or + a per-count psi factor). + + Returns ``None`` if the frames cannot be parsed. + + This is the wired-up production entry point. It: + 1. Reconstructs the BW-binary body bytes from the A5 frames + (``blastware_file.extract_body_bytes``). + 2. Runs the verified codec (``decode_waveform_v2``) on the body. + 3. Converts to int16 ADC counts via :func:`decoded_to_adc_counts`. + """ + # Local import to avoid a cycle: blastware_file imports models and + # ultimately client.py imports waveform_codec. + from .blastware_file import extract_body_bytes + + if not a5_frames: + return None + _strt, body, _footer = extract_body_bytes(a5_frames) + if not body: + return None + decoded = decode_waveform_v2(body) + if decoded is None: + return None + return decoded_to_adc_counts(decoded) diff --git a/tests/test_waveform_codec.py b/tests/test_waveform_codec.py index e24463d..fe62cf6 100644 --- a/tests/test_waveform_codec.py +++ b/tests/test_waveform_codec.py @@ -16,7 +16,9 @@ from minimateplus.waveform_codec import ( WaveformBlock, decode_tran_initial, decode_waveform_v2, + decoded_to_adc_counts, find_data_start, + mic_count_to_db, parse_segment_header, split_segments, walk_body, @@ -448,3 +450,60 @@ def test_decode_tran_initial_full_segment_silent_events(): ) # And we should have decoded at least 400 samples (= segment 0 worth). assert n >= 400, f"only {n} samples decoded for {path}" + + +# ── ADC scaling + dB conversion ────────────────────────────────────────────── + + +def test_decoded_to_adc_counts_geo_scales_by_16(): + """Geo channels in decoder units (16-count) should multiply by 16 to ADC.""" + decoded = {"Tran": [0, 1, -2, 100], "Vert": [5], "Long": [-10], "MicL": [813]} + adc = decoded_to_adc_counts(decoded) + assert adc["Tran"] == [0, 16, -32, 1600] + assert adc["Vert"] == [80] + assert adc["Long"] == [-160] + # Mic passes through unchanged (already ADC counts). + assert adc["MicL"] == [813] + + +def test_decoded_to_adc_counts_empty(): + assert decoded_to_adc_counts({}) == {} + assert decoded_to_adc_counts( + {"Tran": [], "Vert": [], "Long": [], "MicL": []} + ) == {"Tran": [], "Vert": [], "Long": [], "MicL": []} + + +def test_mic_count_to_db_zero_is_zero(): + assert mic_count_to_db(0) == 0.0 + + +def test_mic_count_to_db_unit_is_reference(): + """count = ±1 → ±81.94 dB (the calibration reference).""" + assert abs(mic_count_to_db(1) - 81.94) < 0.01 + assert abs(mic_count_to_db(-1) - (-81.94)) < 0.01 + + +def test_mic_count_to_db_doubles_every_6db(): + """Each doubling of |count| adds ~6.02 dB.""" + # count=2 → 87.96 dB (+ 6.02 from 81.94) + assert abs(mic_count_to_db(2) - 87.96) < 0.05 + # count=4 → 93.98 dB + assert abs(mic_count_to_db(4) - 93.98) < 0.05 + # count=8 → 100.00 dB + assert abs(mic_count_to_db(8) - 100.00) < 0.05 + + +def test_mic_count_to_db_v70_peak(): + """V70 mic peak count 813 → 140.14 dB (matches BW reported PSPL 140.1).""" + assert abs(mic_count_to_db(813) - 140.14) < 0.1 + # And the negative-direction equivalent + assert abs(mic_count_to_db(-813) - (-140.14)) < 0.1 + + +# ── End-to-end: decode_a5_frames (production entry point) ─────────────────── + + +def test_decode_a5_frames_empty(): + from minimateplus.waveform_codec import decode_a5_frames + assert decode_a5_frames([]) is None + assert decode_a5_frames(None) is None