# seismo-relay/sfm/waveform_store.py

"""
sfm/waveform_store.py — On-disk store for Blastware-format event files.

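Layout (flat per-serial, four files per event, plus an optional fifth for
BW imports that arrive with a paired ASCII report):

    <root>/<serial>/<filename>            ← event file (BW-readable binary)
    <root>/<serial>/<filename>.a5.pkl     ← pickled {version, frames} payload
                                            of A5 S3Frame dicts
    <root>/<serial>/<filename>.h5         ← clean waveform arrays (HDF5)
    <root>/<serial>/<filename>.sfm.json   ← modern sidecar (peaks, project,
                                            review state, extensions)
    <root>/<serial>/<filename>_ASCII.TXT  ← preserved BW ASCII report (only
                                            when supplied at import time)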

`<filename>` is whatever `minimateplus.blastware_file.blastware_filename`
produces for the event.  The extension is NOT a fixed type tag — it
encodes the event timestamp (`AB0T` format).

Roles:
  - BW binary: what Blastware reads.  Untouched.  The user-facing review
    waveform viewer.
  - .a5.pkl: regenerative source.  Lets the BW binary be rebuilt
    byte-for-byte if the encoder changes.  Never delete.
  - .h5: clean per-channel waveform arrays in physical units (in/s for
    geo, psi for mic) plus event metadata.  Canonical format for
    downstream analysis tools and the `/device/event/{idx}/waveform`
    endpoint's plot-JSON output.
  - .sfm.json: small, queryable metadata + review state.  SQL
    `events.false_trigger` is a derived index kept in sync via
    `patch_sidecar()`.
"""

from __future__ import annotations

import datetime
import logging
import pickle
import shutil
from pathlib import Path
from typing import Optional, Union

from minimateplus import event_file_io
from minimateplus.blastware_file import blastware_filename, write_blastware_file
from minimateplus.framing import S3Frame
from minimateplus.models import Event
from sfm import event_hdf5

log = logging.getLogger("sfm.waveform_store")

A5_PICKLE_VERSION = 1


def _frame_to_dict(f: S3Frame) -> dict:
    return {
        "sub": f.sub,
        "page_hi": f.page_hi,
        "page_lo": f.page_lo,
        "data": bytes(f.data),
        "chk_byte": f.chk_byte,
        "checksum_valid": f.checksum_valid,
    }


def _dict_to_frame(d: dict) -> S3Frame:
    """Rebuild an S3Frame from a dict produced by ``_frame_to_dict``.

    ``sub``, ``page_hi``, ``page_lo`` and ``data`` are required keys (a
    missing one raises ``KeyError``).  ``checksum_valid`` and ``chk_byte``
    are optional and default to ``True`` and ``0`` respectively.
    """
    fields = {
        "sub": d["sub"],
        "page_hi": d["page_hi"],
        "page_lo": d["page_lo"],
        "data": bytes(d["data"]),
        "checksum_valid": d.get("checksum_valid", True),
        "chk_byte": d.get("chk_byte", 0),
    }
    return S3Frame(**fields)


class WaveformStore:
    """
    Persistent store for Blastware-format waveform files + their A5 source frames.

    Thread safety: write_blastware_file is single-shot; concurrent saves of the
    *same* filename would race, but the filename encodes second-resolution
    timestamps + serial, so collisions across threads/processes are vanishingly
    unlikely in practice.
    """

    def __init__(self, root: str | Path) -> None:
        self.root = Path(root)
        self.root.mkdir(parents=True, exist_ok=True)
        log.info("WaveformStore root=%s", self.root)

    # ── path helpers ────────────────────────────────────────────────────────────

    def _serial_dir(self, serial: str) -> Path:
        d = self.root / serial
        d.mkdir(parents=True, exist_ok=True)
        return d

    def paths_for(self, serial: str, filename: str) -> tuple[Path, Path]:
        """Return (blastware_path, a5_pickle_path) for a given serial+filename.

        For the sidecar path use `sidecar_path_for()` — kept separate so
        existing callers don't need to unpack a 3-tuple.
        """
        d = self._serial_dir(serial)
        return d / filename, d / f"{filename}.a5.pkl"

    def sidecar_path_for(self, serial: str, filename: str) -> Path:
        """Return absolute path to the .sfm.json sidecar for a given event."""
        return self._serial_dir(serial) / f"{filename}.sfm.json"

    def hdf5_path_for(self, serial: str, filename: str) -> Path:
        """Return absolute path to the .h5 clean-waveform file for a given event."""
        return self._serial_dir(serial) / f"{filename}.h5"

    def txt_path_for(self, serial: str, filename: str) -> Path:
        """Return absolute path to the preserved BW ASCII report (.TXT)
        for a given event.

        We name it ``<filename>_ASCII.TXT`` to match BW's own filename
        convention in the ACH folder.  Saved at ingest time alongside
        the binary so the parser bug fixes can be applied retroactively
        by re-parsing without needing to re-forward from the watcher PC.
        """
        return self._serial_dir(serial) / f"{filename}_ASCII.TXT"

    def open_blastware(self, serial: str, filename: str) -> Optional[Path]:
        """Return absolute path to an existing event file or None."""
        bw_path, _ = self.paths_for(serial, filename)
        return bw_path if bw_path.exists() else None

    def open_txt(self, serial: str, filename: str) -> Optional[Path]:
        """Return absolute path to the preserved BW ASCII report for an
        event, or None if the .TXT wasn't saved at ingest time (events
        ingested before .TXT preservation landed will show None until
        re-forwarded)."""
        p = self.txt_path_for(serial, filename)
        return p if p.exists() else None

    # ── save / load ─────────────────────────────────────────────────────────────

    def save(
        self,
        ev: Event,
        serial: str,
        a5_frames: list[S3Frame],
        *,
        source_kind: str = "sfm-live",
        geo_range = "normal",
    ) -> dict:
        """
        Write all four event-file artifacts for one event:
          - <filename>             BW binary
          - <filename>.a5.pkl      raw A5 frame pickle
          - <filename>.h5          clean waveform (HDF5)
          - <filename>.sfm.json    modern sidecar (metadata + review)

        Returns a record dict suitable for persisting alongside the DB row:

            {
              "filename":           "M529LKIQ.7M0W",
              "filesize":           8708,
              "sha256":             "a1b2c3...",
              "a5_pickle_filename": "M529LKIQ.7M0W.a5.pkl",
              "hdf5_filename":      "M529LKIQ.7M0W.h5",
              "sidecar_filename":   "M529LKIQ.7M0W.sfm.json",
            }

        `source_kind` flows into `sidecar.source.kind` — callers should
        pass "sfm-live" (default) for the live endpoint and "sfm-ach" for
        the ACH ingestion path.  BW-imported events use save_imported_bw()
        instead.

        `geo_range` controls the ADC-counts → in/s scaling in the HDF5
        file ("normal" = 10 in/s FS, "sensitive" = 1.25 in/s FS).
        Defaults to "normal" — callers with compliance-config access
        should pass the actual unit setting so the saved samples are in
        the right units.

        Idempotent: if the event file already exists, it is overwritten
        with the freshly-encoded version (same bytes for the same
        a5_frames) and the sidecar's review block is preserved across
        re-saves.
        """
        if not a5_frames:
            raise ValueError("WaveformStore.save: a5_frames is empty")
        if not serial:
            raise ValueError("WaveformStore.save: serial is required")

        filename = blastware_filename(ev, serial)
        bw_path, a5_path = self.paths_for(serial, filename)
        sidecar_path = self.sidecar_path_for(serial, filename)
        hdf5_path    = self.hdf5_path_for(serial, filename)

        # 1. encode the event file (defensive unlink prevents trailing-byte
        # leaks from a previous larger file on synced/odd filesystems).
        try:
            bw_path.unlink()
        except FileNotFoundError:
            pass
        write_blastware_file(ev, a5_frames, bw_path)
        filesize = bw_path.stat().st_size
        sha256   = event_file_io.file_sha256(bw_path)

        # 2. write the .a5.pkl sidecar
        try:
            a5_path.unlink()
        except FileNotFoundError:
            pass
        payload = {
            "version": A5_PICKLE_VERSION,
            "frames": [_frame_to_dict(f) for f in a5_frames],
        }
        with a5_path.open("wb") as fp:
            pickle.dump(payload, fp, protocol=pickle.HIGHEST_PROTOCOL)

        # 3. write the .h5 clean-waveform file (samples in physical units).
        # Best-effort: a write failure shouldn't sink the rest of the save
        # (the HDF5 can be regenerated later from the .a5.pkl).
        hdf5_filename: Optional[str] = None
        try:
            event_hdf5.write_event_hdf5(
                hdf5_path, ev,
                serial=serial,
                geo_range=geo_range,
                source_kind=source_kind,
            )
            hdf5_filename = hdf5_path.name
        except Exception as exc:
            log.warning(
                "save: HDF5 write failed for %s: %s — continuing without .h5",
                hdf5_path, exc,
            )

        # 4. write the .sfm.json sidecar.  Preserve any existing review
        # block + extensions across re-saves so user edits aren't lost
        # when the same event is re-downloaded (e.g. via Force refresh).
        existing_review     = None
        existing_extensions = None
        if sidecar_path.exists():
            try:
                old = event_file_io.read_sidecar(sidecar_path)
                existing_review     = old.get("review")
                existing_extensions = old.get("extensions")
            except Exception as exc:
                log.warning(
                    "save: existing sidecar at %s unreadable (%s); overwriting",
                    sidecar_path, exc,
                )

        sidecar = event_file_io.event_to_sidecar_dict(
            ev,
            serial=serial,
            blastware_filename=filename,
            blastware_filesize=filesize,
            blastware_sha256=sha256,
            source_kind=source_kind,
            a5_pickle_filename=a5_path.name,
            review=existing_review,
            extensions=existing_extensions,
        )
        event_file_io.write_sidecar(sidecar_path, sidecar)

        log.info(
            "WaveformStore.save serial=%s filename=%s filesize=%d frames=%d "
            "h5=%s sidecar=%s",
            serial, filename, filesize, len(a5_frames),
            hdf5_filename or "(skipped)", sidecar_path.name,
        )
        return {
            "filename":            filename,
            "filesize":            filesize,
            "sha256":              sha256,
            "a5_pickle_filename":  a5_path.name,
            "hdf5_filename":       hdf5_filename,
            "sidecar_filename":    sidecar_path.name,
        }

    def save_imported_bw(
        self,
        bw_bytes: bytes,
        source_path: Path,
        *,
        serial_hint: Optional[str] = None,
        bw_report_text: Optional[Union[str, bytes]] = None,
    ) -> tuple[Event, dict]:
        """
        Ingest a Blastware event file produced by an external tool
        (Blastware's own ACH, manual download, etc.) where the source A5
        frames aren't available.

        Workflow:
          1. Parse the bytes via event_file_io.read_blastware_file (writes
             a temp file to do that, since the parser takes a path).
          2. Optionally parse a paired BW ASCII event report (the .TXT
             file BW writes alongside the binary).  When supplied, its
             decoded fields land in the sidecar's `bw_report` block AND
             overlay the device-authoritative peak values into the
             top-level `peak_values` block.  This is the right path for
             the ACH-forwarder daemon use case where Blastware's own
             ACH writes both files into the watch folder.
          3. Resolve serial from BW filename (`<P><serial3>...`) or use
             serial_hint.  Falls back to "UNKNOWN".
          4. Copy the BW bytes verbatim into <root>/<serial>/<filename>.
          5. Write the .sfm.json sidecar with source.kind = "bw-import"
             and a5_pickle_filename = None.  Does NOT write a .a5.pkl
             (no A5 source available; byte-for-byte regeneration not
             possible — the on-disk BW file IS the byte-for-byte source).

        Returns (event, record_dict) so callers can both insert into
        SeismoDb and surface the parsed Event.
        """
        # Stash the bytes to a temp path so read_blastware_file (path-based)
        # can parse without us duplicating its logic.
        import tempfile
        with tempfile.NamedTemporaryFile(suffix=".bw", delete=False) as tmp:
            tmp.write(bw_bytes)
            tmp_path = Path(tmp.name)
        try:
            ev = event_file_io.read_blastware_file(tmp_path)
        finally:
            try:
                tmp_path.unlink()
            except FileNotFoundError:
                pass

        # read_blastware_file derives record_type from its path arg, but
        # that arg is the tmp file (suffix ".bw") — so override with the
        # original filename's encoded type (H/W/M/E/C in the BW AB0T
        # scheme).  Without this override every BW-imported event lands
        # in the DB with record_type="Waveform" regardless of the actual
        # type (Histogram, Manual, etc.).
        ev.record_type = event_file_io.derive_record_type_from_filename(
            source_path.name
        )

        # Parse the BW ASCII report if one was supplied.  Failures here
        # are non-fatal: we still write the binary + sidecar without the
        # rich derived fields.
        bw_report = None
        if bw_report_text is not None:
            try:
                from minimateplus.bw_ascii_report import parse_report
                bw_report = parse_report(bw_report_text)
            except Exception as exc:
                log.warning(
                    "save_imported_bw: BW report parse failed: %s — continuing without it",
                    exc,
                )

        # If we have a report, overlay its device-authoritative fields
        # (peaks, project, sample_rate, record_time) onto the Event
        # BEFORE handing it to db.insert_events().  Without this overlay
        # the DB row gets `peak_values` from _peaks_from_samples(), which
        # runs the still-undecoded waveform codec on the BW body and
        # produces ±10 in/s saturation values on every channel for every
        # event.  The sidecar JSON had the correct values via
        # event_to_sidecar_dict(bw_report=...) but the DB columns didn't.
        if bw_report is not None:
            try:
                event_file_io.apply_report_to_event(ev, bw_report)
            except Exception as exc:
                log.warning(
                    "save_imported_bw: failed to overlay report onto event: %s",
                    exc,
                )

        # Resolve serial.  blastware_filename derives a 4-char prefix from
        # the numeric serial (e.g. BE11529 → M529); we go the other way
        # via the source filename if a hint wasn't given.
        serial = serial_hint or _serial_from_bw_filename(source_path.name) or "UNKNOWN"

        # Use the source filename verbatim — it already encodes timestamp
        # + record type per BW's AB0T scheme, and we want to preserve it
        # so the file BW knows about can be opened back in BW.
        filename = source_path.name
        bw_path = self._serial_dir(serial) / filename

        # 1. copy bytes
        bw_path.write_bytes(bw_bytes)
        filesize = bw_path.stat().st_size
        sha256   = event_file_io.file_sha256(bw_path)

        # 1b. preserve the raw BW ASCII report (.TXT) alongside the binary.
        # Saved at <root>/<serial>/<filename>_ASCII.TXT.  Lets us re-parse
        # offline after parser fixes without needing to re-forward from
        # the watcher PC.  Negligible storage cost (~15 KB per event).
        # Skipped silently when no report was supplied (live download path,
        # manual upload without paired TXT).
        txt_filename: Optional[str] = None
        if bw_report_text is not None:
            try:
                txt_path = self.txt_path_for(serial, filename)
                if isinstance(bw_report_text, bytes):
                    txt_path.write_bytes(bw_report_text)
                else:
                    txt_path.write_text(bw_report_text)
                txt_filename = txt_path.name
            except Exception as exc:
                log.warning(
                    "save_imported_bw: failed to save TXT for %s: %s — "
                    "continuing without it",
                    filename, exc,
                )

        # 2. write the .h5 clean-waveform file from the parsed Event.
        # Note: peaks here are computed from raw samples (the BW file
        # doesn't carry the device-authoritative 0C peaks).  Best-effort.
        hdf5_path = self.hdf5_path_for(serial, filename)
        hdf5_filename: Optional[str] = None
        try:
            event_hdf5.write_event_hdf5(
                hdf5_path, ev,
                serial=serial,
                geo_range="normal",   # BW file doesn't carry the range; assume Normal
                source_kind="bw-import",
            )
            hdf5_filename = hdf5_path.name
        except Exception as exc:
            log.warning(
                "save_imported_bw: HDF5 write failed for %s: %s — continuing",
                hdf5_path, exc,
            )

        # 3. write sidecar with source.kind = bw-import
        sidecar_path = self.sidecar_path_for(serial, filename)
        existing_review = None
        if sidecar_path.exists():
            try:
                existing_review = event_file_io.read_sidecar(sidecar_path).get("review")
            except Exception:
                pass

        sidecar = event_file_io.event_to_sidecar_dict(
            ev,
            serial=serial,
            blastware_filename=filename,
            blastware_filesize=filesize,
            blastware_sha256=sha256,
            source_kind="bw-import",
            a5_pickle_filename=None,
            txt_filename=txt_filename,
            review=existing_review,
            bw_report=bw_report,
        )
        event_file_io.write_sidecar(sidecar_path, sidecar)

        log.info(
            "WaveformStore.save_imported_bw serial=%s filename=%s filesize=%d "
            "h5=%s (no .a5.pkl — A5 source unavailable for BW-imported files)",
            serial, filename, filesize, hdf5_filename or "(skipped)",
        )
        return ev, {
            "filename":           filename,
            "filesize":           filesize,
            "sha256":             sha256,
            "a5_pickle_filename": None,
            "hdf5_filename":      hdf5_filename,
            "sidecar_filename":   sidecar_path.name,
            "serial":             serial,
        }

    def save_imported_idf(
        self,
        idf_bytes: bytes,
        source_path: Path,
        *,
        serial_hint: Optional[str] = None,
        idf_report_text: Optional[Union[str, bytes]] = None,
    ) -> tuple[Optional["Event"], dict]:
        """
        Ingest a Thor (Micromate Series IV) IDF event file (`.IDFW` or
        `.IDFH`) produced by Thor's TXT exporter.

        Workflow:
          1. For sig-A `.IDFW` binaries, decode samples + binary metadata
             via ``micromate.idf_file.read_idf_file()``.  Failure or
             non-IDFW path falls through to the .txt-only flow.
          2. Parse the paired TXT report (when supplied) via
             ``micromate.parse_idf_report`` → dict.  TXT remains the
             source of truth for fields the binary doesn't yet supply
             (full peak set with ZC freq / Time of Peak, sensor self-check,
             firmware string, project strings).
          3. Wrap parsed dict + filename into a typed ``micromate.IdfEvent``.
          4. Copy bytes verbatim into ``<root>/<serial>/<filename>``.
          5. Bridge IdfEvent → ``minimateplus.Event`` and attach
             ``raw_samples`` from the binary decoder (when available).
          6. Write the `.h5` clean-waveform file when samples decoded.
          7. Write the ``.sfm.json`` sidecar with
             ``source.kind = "idf-import"`` and the full raw IDF report
             under ``extensions.idf_report``.

        Returns ``(event, record_dict)`` so the endpoint can both insert
        into SeismoDb and surface the parsed event.
        """
        from micromate import IdfEvent, parse_idf_report

        # 1. Binary decode (sig-A IDFW and IDFH).  Non-fatal: any failure
        # leaves samples / binary metadata unfilled and we proceed with
        # the .txt path as before.
        idf_samples: Optional[dict] = None
        idf_intervals: Optional[list] = None
        binary_md = None
        binary_peaks = None
        is_histogram = False
        try:
            from micromate.idf_file import read_idf_file
            # Pass idf_bytes through `data=` — at this point in the flow
            # the binary hasn't been written to disk yet, so the codec
            # can't read from source_path.  We still pass source_path so
            # the codec has the filename for error messages + .IDFH
            # suffix detection.
            res = read_idf_file(source_path, data=idf_bytes)
            idf_samples = res.samples or None
            idf_intervals = res.intervals
            is_histogram = res.intervals is not None
            binary_md = res.binary_metadata
            binary_peaks = res.event.peaks
        except NotImplementedError:
            # sig-B — codec doesn't handle this yet.
            pass
        except Exception as exc:
            log.warning(
                "save_imported_idf: binary codec failed for %s: %s — "
                "falling back to .txt-only ingest",
                source_path.name, exc,
            )

        # 2. Parse the .txt sidecar (best-effort; non-fatal on failure).
        report_dict: dict = {}
        if idf_report_text is not None:
            try:
                report_dict = parse_idf_report(idf_report_text)
            except Exception as exc:
                log.warning(
                    "save_imported_idf: report parse failed: %s — continuing without it",
                    exc,
                )

        # 3. Backfill report_dict with binary metadata for fields the
        # .txt didn't supply.  Binary takes precedence on tied fields
        # where the binary is more reliable (timestamp, sample_rate),
        # and fills in fields entirely missing from the .txt.
        if binary_md is not None:
            if binary_md.serial and not report_dict.get("serial_number"):
                report_dict["serial_number"] = binary_md.serial
            if binary_md.event_datetime and not report_dict.get("event_datetime"):
                report_dict["event_datetime"] = binary_md.event_datetime
            if binary_md.sample_rate and not report_dict.get("sample_rate"):
                report_dict["sample_rate"] = binary_md.sample_rate
            if binary_md.record_time_sec and not report_dict.get("record_time_sec"):
                report_dict["record_time_sec"] = binary_md.record_time_sec
            # Calibration date (binary) vs calibration text (.txt) cohabit
            # under different keys; no overwrite needed.
            if binary_md.event_datetime and not report_dict.get("event_type"):
                report_dict["event_type"] = (
                    "Full Histogram" if is_histogram else "Full Waveform"
                )

        # Binary-derived peaks fill in when the .txt didn't supply them.
        # They're ~3% low vs the device-authoritative .txt values (residual
        # codec drift), so .txt always wins when present.
        if binary_peaks is not None:
            if binary_peaks.transverse_ips and not report_dict.get("tran_ppv"):
                report_dict["tran_ppv"] = binary_peaks.transverse_ips
            if binary_peaks.vertical_ips and not report_dict.get("vert_ppv"):
                report_dict["vert_ppv"] = binary_peaks.vertical_ips
            if binary_peaks.longitudinal_ips and not report_dict.get("long_ppv"):
                report_dict["long_ppv"] = binary_peaks.longitudinal_ips

        # 4. Build the typed IdfEvent.  Filename is authoritative for
        # (serial, timestamp, kind); the report's event_datetime takes
        # precedence over the filename timestamp inside from_report().
        idf_event = IdfEvent.from_report(report_dict, source_path.name)

        # The binary mic peak (psi) isn't carried through from_report() —
        # IdfReport.from_dict only sees the .txt's dB(L) value.  Pull the
        # binary-derived ``mic_pspl_psi`` onto the typed IdfEvent so the
        # downstream bridge can populate ``PeakValues.micl`` (psi-shaped)
        # and the h5 writer's per-count mic factor lands at a sensible
        # value.  Without this, the h5 mic chart auto-scales against the
        # dB(L) value-as-pseudo-psi and renders ~flat.
        if binary_peaks is not None and binary_peaks.mic_pspl_psi is not None:
            idf_event.peaks.mic_pspl_psi = binary_peaks.mic_pspl_psi

        # Operator-supplied serial_hint wins over the binary's filename
        # prefix when both are present (e.g. callers passing a known-good
        # serial that overrides a misnamed export).
        serial = serial_hint or idf_event.serial or "UNKNOWN"

        # 5. Filesystem write of binary bytes.
        filename = source_path.name
        bw_path = self._serial_dir(serial) / filename
        bw_path.write_bytes(idf_bytes)
        filesize = bw_path.stat().st_size
        sha256   = event_file_io.file_sha256(bw_path)

        # _waveform_key dedups (serial, timestamp) rows in the events
        # table.  Use the binary's sha256 (first 16 bytes) as a stable
        # surrogate — every distinct binary maps to a distinct row.
        waveform_key = bytes.fromhex(sha256)[:16]

        # 6. Bridge to minimateplus.Event for the existing sidecar / DB
        # insert paths.  See IdfEvent.to_minimateplus_event() for the
        # caveats of this bridge (mic units, missing fields → sidecar).
        ev = idf_event.to_minimateplus_event(waveform_key)

        # Attach the decoded sample arrays.  Thor's decoder counts use
        # LSB = 0.0003 in/s for geo (vs BW's 16-count units at 0.005 in/s)
        # — the .h5 writer's geo_range="normal" yields LSB = 10/32768
        # ≈ 0.000305 in/s, so plotted samples come out ~1.7% high.
        # Acceptable known offset; refine with a Thor-aware h5 path later.
        if idf_samples is not None:
            ev.raw_samples = idf_samples
            n_samples = max((len(idf_samples.get(ch, [])) for ch in ("Tran", "Vert", "Long", "MicL")), default=0)
            ev.total_samples = ev.total_samples or n_samples

        # For IDFH histograms there are no per-sample waveform arrays — the
        # device stores one peak ADC count per interval per channel.  Synthesise
        # a 1-sample-per-interval array so the existing h5+renderer pipeline
        # (which groups samples down to ``n_intervals`` bars via max-per-group)
        # produces a non-blank histogram chart.  Each "sample" is the peak ADC
        # count for that interval, so the h5 writer's ``count × geo_fs/32768``
        # conversion yields the right physical value for the bar height.
        if is_histogram and idf_intervals:
            hist_samples = {
                "Tran": [iv.peak_count("Tran") for iv in idf_intervals],
                "Vert": [iv.peak_count("Vert") for iv in idf_intervals],
                "Long": [iv.peak_count("Long") for iv in idf_intervals],
                "MicL": [iv.peak_count("MicL") for iv in idf_intervals],
            }
            ev.raw_samples = hist_samples
            ev.total_samples = ev.total_samples or len(idf_intervals)

        # 7. Write the .h5 clean-waveform file when we have samples to write
        # (either the IDFW per-sample stream, or the IDFH synthesised per-
        # interval peak array).  The renderer treats both shapes the same way.
        hdf5_filename: Optional[str] = None
        if ev.raw_samples:
            hdf5_path = self.hdf5_path_for(serial, filename)
            try:
                event_hdf5.write_event_hdf5(
                    hdf5_path, ev,
                    serial=serial,
                    geo_range="normal",   # Thor's geo full scale is also 10 in/s (Normal)
                    source_kind="idf-import",
                )
                hdf5_filename = hdf5_path.name
            except Exception as exc:
                log.warning(
                    "save_imported_idf: HDF5 write failed for %s: %s — continuing without .h5",
                    hdf5_path, exc,
                )

        # 8. Write the sidecar.  Source kind "idf-import" is on the allow-list.
        sidecar_path = self.sidecar_path_for(serial, filename)
        existing_review = None
        if sidecar_path.exists():
            try:
                existing_review = event_file_io.read_sidecar(sidecar_path).get("review")
            except Exception:
                pass

        sidecar = event_file_io.event_to_sidecar_dict(
            ev,
            serial=serial,
            blastware_filename=filename,
            blastware_filesize=filesize,
            blastware_sha256=sha256,
            source_kind="idf-import",
            a5_pickle_filename=None,
            review=existing_review,
        )
        # Stash the full parsed IDF report under extensions so downstream
        # consumers can recover the rich derived fields that don't fit
        # the BW-shaped event model (Peak Acceleration / Displacement,
        # Time of Peak, sensor self-check, calibration, firmware).
        if report_dict:
            sidecar["extensions"]["idf_report"] = report_dict

        # Project the IDF report into the BW report sidecar shape so the
        # existing Event Report PDF pipeline (sfm/report_pdf.py) can
        # render Thor events without needing a separate code path.  Thor
        # data is 95% the same metric set as BW — the adapter handles
        # the field-name mapping.
        if report_dict or binary_md is not None:
            try:
                from micromate.idf_to_bw_report import build_bw_report_from_idf
                sidecar["bw_report"] = build_bw_report_from_idf(
                    report_dict or {},
                    binary_md=binary_md,
                    intervals=idf_intervals,
                    is_histogram=is_histogram,
                )
            except Exception as exc:
                log.warning(
                    "save_imported_idf: idf→bw_report adapter failed for %s: %s — "
                    "report PDF will fall back to DB-only fields",
                    filename, exc,
                )
        # For histograms, also stash the binary-decoded per-interval
        # records so the UI / report layer doesn't need to re-walk the
        # IDFH file at render time.
        if idf_intervals is not None:
            sidecar["extensions"]["idf_intervals"] = [
                {
                    "offset":     iv.offset,
                    "tran_peak":  iv.peak_count("Tran"),
                    "tran_halfp": iv.tran_halfp,
                    "tran_freq":  iv.freq_hz("Tran"),
                    "vert_peak":  iv.peak_count("Vert"),
                    "vert_halfp": iv.vert_halfp,
                    "vert_freq":  iv.freq_hz("Vert"),
                    "long_peak":  iv.peak_count("Long"),
                    "long_halfp": iv.long_halfp,
                    "long_freq":  iv.freq_hz("Long"),
                    "mic_peak":   iv.peak_count("MicL"),
                    "mic_halfp":  iv.micl_halfp,
                    "mic_freq":   iv.freq_hz("MicL"),
                }
                for iv in idf_intervals
            ]
        event_file_io.write_sidecar(sidecar_path, sidecar)

        log.info(
            "WaveformStore.save_imported_idf serial=%s filename=%s filesize=%d "
            "kind=%s report_attached=%s binary_decoded=%s h5=%s intervals=%d",
            serial, filename, filesize,
            "histogram" if is_histogram else "waveform",
            bool(report_dict),
            (idf_samples is not None) or (idf_intervals is not None),
            hdf5_filename or "(skipped)",
            len(idf_intervals) if idf_intervals else 0,
        )
        return ev, {
            "filename":           filename,
            "filesize":           filesize,
            "sha256":             sha256,
            "a5_pickle_filename": None,
            "hdf5_filename":      hdf5_filename,
            "sidecar_filename":   sidecar_path.name,
            "serial":             serial,
        }

    def load_a5(self, serial: str, filename: str) -> Optional[list[S3Frame]]:
        """
        Re-hydrate the pickled A5 frame stream for a stored event.
        Returns None if the sidecar is missing.
        """
        _, a5_path = self.paths_for(serial, filename)
        if not a5_path.exists():
            return None
        with a5_path.open("rb") as fp:
            payload = pickle.load(fp)
        if not isinstance(payload, dict) or "frames" not in payload:
            log.warning("WaveformStore.load_a5: malformed sidecar at %s", a5_path)
            return None
        return [_dict_to_frame(d) for d in payload["frames"]]

    # ── modern .sfm.json sidecar accessors ──────────────────────────────────────

    def load_sidecar(self, serial: str, filename: str) -> Optional[dict]:
        """Return the parsed .sfm.json sidecar dict, or None if missing."""
        path = self.sidecar_path_for(serial, filename)
        if not path.exists():
            return None
        try:
            return event_file_io.read_sidecar(path)
        except Exception as exc:
            log.warning("load_sidecar: failed to read %s: %s", path, exc)
            return None

    def patch_sidecar(
        self,
        serial: str,
        filename: str,
        *,
        review: Optional[dict] = None,
        extensions: Optional[dict] = None,
        reviewer_now: bool = True,
    ) -> Optional[dict]:
        """
        JSON-merge-patch the .sfm.json sidecar's review/extensions blocks.
        Returns the new full dict, or None if the sidecar doesn't exist.
        """
        path = self.sidecar_path_for(serial, filename)
        if not path.exists():
            return None
        return event_file_io.patch_sidecar(
            path,
            review=review,
            extensions=extensions,
            reviewer_now=reviewer_now,
        )


# ── helpers ─────────────────────────────────────────────────────────────────────

def _serial_from_bw_filename(name: str) -> Optional[str]:
    """
    Reverse of `blastware_filename`'s serial-prefix encoding.

    BW filename format (V10.72): `<P><serial3><stem4>.<ext>`
    where P = chr(ord('B') + floor(serial // 1000))
    and serial3 = f"{serial % 1000:03d}".

    Examples (from CLAUDE.md verification archive):
      P036... → BE14036       H907... → BE6907
      M529... → BE11529       T003... → BE18003

    Returns the inferred BE-prefix serial (e.g. "BE11529") or None when
    the filename doesn't match the expected pattern.
    """
    if not name:
        return None
    # First letter encodes the thousands group; next 3 chars encode the
    # last 3 digits of the serial.
    base = name.split(".", 1)[0]
    if len(base) < 4 or not base[0].isalpha() or not base[1:4].isdigit():
        return None
    prefix_letter = base[0].upper()
    if prefix_letter < "B":
        return None
    thousands = ord(prefix_letter) - ord("B")
    serial_num = thousands * 1000 + int(base[1:4])
    return f"BE{serial_num}"