2026-05-28 17:54:34 -04:00
4 changed files with 138 additions and 51 deletions
@@ -1,6 +1,6 @@
 /bridges/captures/
 /example-events/
-
+/tests/fixtures/
 /manuals/

 # Python build artifacts
@@ -12,7 +12,21 @@ implementation lives in `minimateplus/histogram_codec.py`.
 in-repo histogram fixture corpus decodes byte-exact against BW's
 ASCII export.

-24 regression tests pass against ~3,500 blocks across 5 fixtures.
+26 regression tests pass against ~3,500 blocks across 5 in-repo
+fixtures, plus a single regression block taken verbatim from a real
+BE9558 prod event to lock in the uint8-peak interpretation.
+
+**Important correction (2026-05-21):** the per-channel peak count
+is `uint8` at byte[6]/[10]/[14]/[18], NOT `uint16 LE` at byte[6:8]
+etc.  The N844 fixture corpus the original RE was done against has
+zero values in bytes [7]/[11]/[15]/[19] for every block, so the
+two interpretations happened to be equivalent.  Cross-correlating
+non-N844 events (BE9558 Tran-drift, BE18003 Histogram+Continuous)
+against BW's per-interval ASCII export (4 channels × ~1400 blocks
+per event, across multiple events) is 100% byte-exact only when the
+peak is read as uint8.  Reading as uint16 LE produced peaks up to 268
+in/s per channel and 35× inflated PVS sums when first deployed to
+prod (rolled back, root-caused, and fixed in commit 7183b95+1).

 ## Body format

@@ -27,15 +41,21 @@ Each block represents one histogram interval.  Block layout:
 [1]    segment_id (uint8)        0x00..0x03 — 256 blocks per segment
 [2:4]  block_ctr (uint16 LE)     resets each segment (0x0100, 0x0101, …)
 [4:6]  0x000a (uint16 LE)        constant marker (= 10)
-[6:8]  T_peak_count   uint16 LE  Tran peak (count × 0.005 → in/s at Normal)
+[6]    T_peak_count   uint8      Tran peak (count × 0.005 → in/s at Normal,
+                                  max 1.275 in/s — fits in uint8)
+[7]    T_annotation   uint8      empirically non-zero on intervals with sub-Hz
+                                  or unmeasurable freq; meaning not fully RE'd
 [8:10] T_halfperiod   uint16 LE  Tran half-period in samples
                                  (freq_Hz = 512 / halfp; ≤ 5 means ">100 Hz")
-[10:12] V_peak_count  uint16 LE  Vert peak
+[10]   V_peak_count   uint8      Vert peak
+[11]   V_annotation   uint8
 [12:14] V_halfperiod  uint16 LE  Vert freq half-period
-[14:16] L_peak_count  uint16 LE  Long peak
+[14]   L_peak_count   uint8      Long peak
+[15]   L_annotation   uint8
 [16:18] L_halfperiod  uint16 LE  Long freq half-period
-[18:20] M_peak_count  uint16 LE  MicL peak count
+[18]   M_peak_count   uint8      MicL peak count
                                  (dB via waveform_codec.mic_count_to_db)
+[19]   M_annotation   uint8
 [20:22] M_halfperiod  uint16 LE  MicL freq half-period
 [22:24] 0x00 0x00                constant
 [24:28] 4-byte variable          purpose unknown — possibly CRC,
@@ -99,6 +119,16 @@ slot[8] = 9  → 512/9 = 56.9 → 57 Hz       ✓ M_freq

 ## What's NOT yet decoded

+- **Annotation bytes (`block[7]/[11]/[15]/[19]`)**.  Empirically
+  non-zero on intervals where the per-channel ZC frequency comes
+  out as `N/A` or sub-Hz (`<1.0`, `1.X`).  Hypothesis tested in the
+  RE session: byte != 0 ↔ sub-Hz freq.  Only ~50% correlation
+  across the K558 corpus, so the relationship is more complex.
+  Possibilities: time-of-peak-within-interval, halfp extension for
+  very-long-period signals, or a debug/diagnostic field the firmware
+  writes opportunistically.  Doesn't affect peak amplitudes or
+  waveform reconstruction.  Captured as `record["annotations"]` for
+  future RE.
 - **4-byte variable metadata field (bytes 24:28)**.  Not needed for
  waveform reconstruction.  Speculation: per-block CRC, sub-second
  timestamp offset, or a Mic psi(L) count not in the 9 samples.
@@ -28,18 +28,32 @@ iterate 32-stride and stop before the tail.
    [1]    segment_id  (uint8)       0x00..0x03 — 256 blocks per segment
    [2:4]  block_ctr  (uint16 LE)    resets each segment (0x0100, 0x0101, …)
    [4:6]  0x000a (uint16 LE)        constant marker (= 10)
-    [6:8]  T_peak_count   uint16 LE  Tran peak (count × 0.005 → in/s)
+    [6]    T_peak_count   uint8      Tran peak (count × 0.005 → in/s, max 1.275 in/s)
+    [7]    T_annotation   uint8      empirically non-zero on intervals with sub-Hz
+                                     or unmeasurable Tran freq; meaning not fully RE'd
    [8:10] T_halfperiod   uint16 LE  Tran half-period in samples (freq = 512 / halfp Hz)
-    [10:12] V_peak_count  uint16 LE
+    [10]   V_peak_count   uint8
+    [11]   V_annotation   uint8
    [12:14] V_halfperiod  uint16 LE
-    [14:16] L_peak_count  uint16 LE
+    [14]   L_peak_count   uint8
+    [15]   L_annotation   uint8
    [16:18] L_halfperiod  uint16 LE
-    [18:20] M_peak_count  uint16 LE  MicL peak (count → dB via mic_count_to_db)
+    [18]   M_peak_count   uint8      MicL peak (count → dB via mic_count_to_db)
+    [19]   M_annotation   uint8
    [20:22] M_halfperiod  uint16 LE  MicL half-period in samples (freq = 512 / halfp Hz)
    [22:24] 0x00 0x00                constant
    [24:28] 4-byte variable          purpose unknown (possibly CRC or timestamp delta)
    [28:32] 0x1e 0x0a 0x00 0x00      constant block-end signature

+NOTE on peak-count width: an earlier interpretation treated the peak
+fields as uint16 LE spanning [6:8] / [10:12] / [14:16] / [18:20].
+That happened to be byte-exact against the N844 fixture corpus only
+because every annotation byte in those fixtures was zero, making
+``uint16 LE == uint8``.  Cross-correlating BE9558 (K558) Tran-drift
+and BE18003 (T003) Histogram+Continuous events against the BW ASCII
+export proved peak is uint8 alone — see test_histogram_codec.py
+and docs/histogram_codec_re_status.md.
+
 Block-identification anchor: ``block[22:24] == b"\\x00\\x00"`` AND
 ``block[28:32] == b"\\x1e\\x0a\\x00\\x00"``.  This is the reliable
 distinguisher from non-block content in the file.
@@ -101,30 +115,6 @@ _BLOCK_SIZE = 32
 # additional validation that we're looking at a real block.
 _BLOCK_MARKER = 10

-# Maximum plausible peak-count value.  The geophone tops out at 10 in/s
-# at Normal range = 2000 counts at the 0.005 in/s per count scale.
-# Sensitive range (1.25 in/s FS) tops at ~250.  Mic peak counts have
-# been observed up to ~400 (≈ 100 dB(L)) and per the protocol doc can
-# reach ~813 (140 dB(L)).  2200 covers Normal full-scale plus ~10%
-# headroom for quantization edge cases while keeping every physically
-# implausible value out of the PVS computation.
-#
-# Some prod blocks have been observed with peak-count fields whose
-# HIGH byte is non-zero (block[7] != 0 etc.) — observed across BE9558
-# and BE18003 units in Histogram-mode events.  Reading these as
-# uint16 LE produces values like 30981 / 41733 / 62469, which scale
-# to physically impossible peaks (150+ in/s).  Best guess: an
-# undocumented "time-of-peak-within-interval" extension byte the
-# device writes in some sub-mode (possibly Histogram+Continuous).
-# Until reverse-engineered, blocks exceeding this bound are skipped
-# rather than propagating bogus values into PVS computations.
-#
-# Earlier we tried 4096 — that allowed peak counts up to 4096 × 0.005
-# = 20.48 in/s per channel, which produced 35× inflated PVS sums when
-# the extension-byte blocks slipped through.  See feat/wire-histogram-codec
-# branch history for the rollback.
-_MAX_PEAK_COUNT = 2200
-
 # Geo peak scaling: stored as "count × 0.005 in/s" where 1 count = one
 # 0.005 in/s display quantum.  Equivalent to the waveform codec's
 # 16-count-unit output (1 unit = 0.005 in/s = 16 ADC counts).
@@ -156,23 +146,36 @@ def _decode_block(block: bytes) -> Optional[dict]:
    """Decode one 32-byte histogram block.  Caller must have validated
    with ``_is_data_block`` first.

-    Returns ``None`` if any peak field exceeds ``_MAX_PEAK_COUNT`` —
-    those blocks contain an undocumented extension byte format whose
-    naive uint16 LE interpretation gives physically impossible peaks.
-    Skipping the block is safer than propagating bogus values into
-    PVS computations downstream.
+    Returns a record with per-channel peak counts (uint8) and
+    half-periods (uint16 LE).
    """
-    # All 16-bit fields are little-endian unsigned.  Peak counts are
-    # always non-negative; half-periods are always positive when valid.
-    t_peak, t_halfp, v_peak, v_halfp, l_peak, l_halfp, m_peak, m_halfp = struct.unpack_from(
-        "<HHHHHHHH", block, 6
-    )
-    if (t_peak > _MAX_PEAK_COUNT or v_peak > _MAX_PEAK_COUNT
-            or l_peak > _MAX_PEAK_COUNT or m_peak > _MAX_PEAK_COUNT):
-        return None
+    # Peak counts are uint8 at bytes [6] / [10] / [14] / [18].  The
+    # adjacent bytes [7] / [11] / [15] / [19] hold an annotation field
+    # whose meaning isn't fully understood (empirically non-zero in
+    # intervals with sub-Hz or unmeasurable geo frequencies, mostly
+    # zero otherwise — see test fixtures from BE9558/BE18003 corpora).
+    # Crucially, those annotation bytes are NOT the high byte of the
+    # peak count: cross-correlating against BW's per-interval ASCII
+    # export proves the peak is uint8 alone.
+    #
+    # Reading the peak as uint16 LE (the original interpretation) was
+    # accidentally correct only because every block in the N844 fixture
+    # corpus had a zero annotation byte; non-N844 events with non-zero
+    # annotation bytes decoded to physically impossible peaks (e.g.
+    # 268 in/s per channel) and produced 35× inflated PVS sums when
+    # first run against prod data.  See histogram_codec_re_status.md.
+    t_peak = block[6]
+    v_peak = block[10]
+    l_peak = block[14]
+    m_peak = block[18]
+    t_halfp = block[8]  | (block[9]  << 8)
+    v_halfp = block[12] | (block[13] << 8)
+    l_halfp = block[16] | (block[17] << 8)
+    m_halfp = block[20] | (block[21] << 8)
    segment_id = block[1]
    block_ctr  = block[2] | (block[3] << 8)
    var_meta   = bytes(block[24:28])
+    annotations = (block[7], block[11], block[15], block[19])
    return {
        "segment_id":  segment_id,
        "block_ctr":   block_ctr,
@@ -185,6 +188,7 @@ def _decode_block(block: bytes) -> Optional[dict]:
        "m_peak":      m_peak,
        "m_halfp":     m_halfp,
        "meta_var":    var_meta,
+        "annotations": annotations,
    }


@@ -192,10 +196,15 @@ def walk_body(body: bytes) -> List[dict]:
    """Walk the body and return one dict per histogram interval.

    Iterates 32-byte strides from offset 0.  Yields a decoded record
-    for every block that passes ``_is_data_block`` validation AND has
-    plausible peak values (``_decode_block`` returns None for blocks
-    with out-of-bound peaks).  Stops when the remaining bytes are too
-    short to form a complete block.
+    for every block that passes ``_is_data_block`` validation.  Stops
+    when the remaining bytes are too short to form a complete block.
+
+    In Histogram+Continuous mode the body interleaves data blocks with
+    other 32-byte content (likely continuous-mode waveform blocks) that
+    fail the data-block validation; the walker naturally skips them
+    without losing 32-byte alignment.  Use ``block_ctr`` from each
+    returned record to map back to the original interval index — the
+    record list is sparse when other block types are interleaved.
    """
    records: List[dict] = []
    for off in range(0, len(body) - _BLOCK_SIZE + 1, _BLOCK_SIZE):
@@ -335,3 +335,51 @@ def test_geo_count_to_ins_scale():
    assert geo_count_to_ins(1)  == pytest.approx(0.005)
    assert geo_count_to_ins(10) == pytest.approx(0.050)
    assert geo_count_to_ins(0)  == 0.0
+
+
+# ── Regression: peak is uint8 byte[N], NOT uint16 LE byte[N:N+2] ────────────
+#
+# Block taken verbatim from K558LKZU.RE0H (BE9558) interval 12 — a real
+# field event where the Tran channel had developed a DC offset and was
+# producing sub-Hz drift content the device couldn't characterize.
+# The annotation byte at [7] = 0xd2 is non-zero in that case.  The
+# legacy codec read [6:8] as uint16 LE, producing T_peak = 53763 →
+# 268 in/s — physically impossible and ~18,000× too high for the actual
+# 0.015 in/s value (T_lo = 3 alone gives the correct count).
+# Verified against the paired BW ASCII export.
+_K558_INTERVAL_12_BLOCK = bytes.fromhex(
+    "00 00 0c 01 0a 00 03 d2 45 00 02 00 02 00 02 00"
+    "02 00 10 00 06 00 00 00 0e 91 2f 00 1e 0a 00 00".replace(" ", "")
+)
+
+
+def test_extension_byte_does_not_inflate_peak():
+    """The annotation byte at [7]/[11]/[15]/[19] must NOT contribute to
+    the peak count.  Decoded T_peak must be 3 (uint8 byte[6]), NOT
+    53763 (uint16 LE byte[6:8])."""
+    body = _K558_INTERVAL_12_BLOCK
+    records = decode_histogram_body_full(body)
+    assert records is not None
+    assert len(records) == 1
+    r = records[0]
+    assert r["t_peak"] == 3,    f"T_peak should be 3 (uint8), got {r['t_peak']}"
+    assert r["v_peak"] == 2
+    assert r["l_peak"] == 2
+    assert r["m_peak"] == 16
+    # Half-periods unchanged — still uint16 LE.
+    assert r["t_halfp"] == 0x0045  # 69 → 7.4 Hz
+    assert r["m_halfp"] == 6       # → 85.3 Hz
+    # Annotation byte is preserved (for future RE) but does not affect peak.
+    assert r["annotations"] == (0xd2, 0x00, 0x00, 0x00)
+
+
+def test_extension_byte_decoded_to_correct_in_s():
+    """End-to-end: the channel-grouped output for the K558 ext block
+    should give T = 3 counts = 0.015 in/s, not 53763 counts = 268 in/s."""
+    channels = decode_histogram_body(_K558_INTERVAL_12_BLOCK)
+    assert channels is not None
+    assert channels["Tran"] == [3]
+    assert geo_count_to_ins(channels["Tran"][0]) == pytest.approx(0.015)
+    assert channels["Vert"] == [2]
+    assert channels["Long"] == [2]
+    assert channels["MicL"] == [16]