seismo-relay/minimateplus/event_file_io.py

"""
minimateplus/event_file_io.py — modern event-file (.sfm.json sidecar) IO.

This module is the single home for event-file conversion code that doesn't
fit cleanly inside `blastware_file.py` (which is the BW binary codec):

  - sidecar JSON read/write (the modern per-event metadata file)
  - read_blastware_file() — reverse of write_blastware_file, used by
    the BW-importer flow when SFM is ingesting files produced by
    Blastware's own ACH (where the source A5 frames aren't available).

Sidecar schema v1 layout — see docs in the project plan or the schema
declared in `event_to_sidecar_dict()`.
"""

from __future__ import annotations

import datetime
import hashlib
import json
import logging
import os
import struct
from pathlib import Path
from typing import Optional, Union

from .models import Event, PeakValues, ProjectInfo, Timestamp
from . import blastware_file as _bw  # avoid circular reference at module load
from .bw_ascii_report import BwAsciiReport
from .waveform_codec import decode_waveform_v2, decoded_to_adc_counts
from .histogram_codec import decode_histogram_body

# Reference pressure for dB(L) → psi conversion (20 µPa expressed in psi).
# Same constant as sfm/sfm_webapp.html so server-side and browser-side
# conversions agree.
_DBL_REF_PSI = 2.9e-9

log = logging.getLogger(__name__)

# Schema version for the sidecar JSON.  Bump when fields change shape.
# Older readers must reject anything > SCHEMA_VERSION; newer fields added
# inside `extensions` are forward-compatible without a bump.
SCHEMA_VERSION = 1
SIDECAR_KIND   = "sfm.event"

# Default tool_version stamp; callers can override.  The baked-in
# constant is the baseline (rather than importlib.metadata alone) because
# the latter reflects the *installed* dist-info, which doesn't update when
# pyproject.toml is bumped without a `pip install` re-run — leading to
# confusing stale version stamps in sidecars.  Installed metadata is only
# preferred when it reports a NEWER version than this constant.  Bump
# this constant and CHANGELOG.md together at release time.
TOOL_VERSION = "0.21.1"

try:
    # Best-effort: prefer the installed metadata when it's NEWER than the
    # baked-in constant (e.g. a downstream packager bumped the wheel
    # without editing this file).  Otherwise fall back to TOOL_VERSION.
    from importlib.metadata import version as _pkg_version
    # Version string recorded in the installed distribution's metadata.
    # A failed lookup raises and lands in the outer `except` below.
    _meta_v = _pkg_version("seismo-relay")
    def _vtuple(s):
        # Compare on the first three dot-separated components as ints.
        # Any non-numeric component (e.g. a suffix such as "1rc1") makes
        # the whole string collapse to (0, 0, 0), so such a version never
        # outranks a well-formed TOOL_VERSION.
        try:
            return tuple(int(p) for p in s.split(".")[:3])
        except Exception:
            return (0, 0, 0)
    _TOOL_VERSION_DEFAULT = (
        _meta_v if _vtuple(_meta_v) > _vtuple(TOOL_VERSION) else TOOL_VERSION
    )
except Exception:
    # Any failure above (import, package lookup, comparison) falls back
    # to the baked-in constant.
    _TOOL_VERSION_DEFAULT = TOOL_VERSION


# ── Sidecar dict construction ─────────────────────────────────────────────────


def _ts_iso(ts: Optional[Timestamp]) -> Optional[str]:
    """Render a Timestamp as an ISO-8601 string (second resolution).

    Returns None for a None input.  Missing (falsy) hour / minute /
    second components are treated as 0.  If the components cannot form
    a valid datetime, the plain ``str(ts)`` is returned instead so the
    caller still gets something printable.
    """
    if ts is None:
        return None
    try:
        moment = datetime.datetime(
            year=ts.year,
            month=ts.month,
            day=ts.day,
            hour=ts.hour or 0,
            minute=ts.minute or 0,
            second=ts.second or 0,
        )
    except Exception:
        # Best-effort fallback: never let a malformed timestamp abort
        # sidecar construction.
        return str(ts)
    return moment.isoformat()


def _peak_values_to_dict(pv: Optional[PeakValues]) -> dict:
    """Project a PeakValues object into the sidecar's `peak_values` block.

    A None input yields the same five keys, all mapped to None, so the
    sidecar shape is stable whether or not peaks are known.
    """
    # (sidecar key, PeakValues attribute) in the order the keys are emitted.
    layout = (
        ("transverse",   "tran"),
        ("vertical",     "vert"),
        ("longitudinal", "long"),
        ("vector_sum",   "peak_vector_sum"),
        ("mic_psi",      "micl"),
    )
    if pv is None:
        return {key: None for key, _attr in layout}
    return {key: getattr(pv, attr) for key, attr in layout}


def _bw_report_to_dict(report: BwAsciiReport) -> dict:
    """Flatten a parsed BW ASCII report into the sidecar's `bw_report` block.

    Every value is a plain JSON-compatible type: date/datetime fields are
    rendered with ``isoformat()``.  The per-channel sections under
    ``peaks`` and ``sensor_check`` use fixed lowercase keys; the keys of
    ``histogram.channel_peak_when`` are passed through exactly as the
    report supplies them.
    """

    def _iso(moment):
        # ISO string for a set date/datetime, None when the field is unset.
        return moment.isoformat() if moment else None

    def _without_nones(mapping: dict) -> dict:
        # Drop None-valued entries — keeps the JSON tidy for partial reports.
        return {key: val for key, val in mapping.items() if val is not None}

    def _ch(ch_name: str) -> dict:
        stats = report.channels.get(ch_name)
        if stats is None:
            return {}
        block = _without_nones({
            "ppv_ips":         stats.ppv_ips,
            "zc_freq_hz":      stats.zc_freq_hz,
            "time_of_peak_s":  stats.time_of_peak_s,
            "peak_accel_g":    stats.peak_accel_g,
            "peak_disp_in":    stats.peak_disp_in,
        })
        # Flags are only emitted when True:
        #   ppv_saturated       — ppv_ips is the channel range max (a lower
        #                         bound), not an exact reading.
        #   zc_freq_above_range — zc_freq_hz is the reporting threshold
        #                         (BW ">100 Hz"), not an exact measurement.
        for flag in ("ppv_saturated", "zc_freq_above_range"):
            if getattr(stats, flag, False):
                block[flag] = True
        return block

    def _sc(ch_name: str) -> dict:
        check = report.sensor_check.get(ch_name)
        if check is None:
            return {}
        return _without_nones({
            "freq_hz":      check.test_freq_hz,
            "ratio":        check.test_ratio,
            "amplitude_mv": check.test_amplitude_mv,
            "result":       check.test_results,
        })

    log_entries = [
        _without_nones({
            "start":       _iso(item.start_time),
            "stop":        _iso(item.stop_time),
            "description": item.description,
        })
        for item in report.monitor_log
    ]

    trigger = {
        "channel":       report.trigger_channel,
        "geo_level_ips": report.geo_trigger_level_ips,
    }
    recording = {
        "sample_rate_sps":  report.sample_rate_sps,
        "record_time_s":    report.record_time_s,
        "pretrig_s":        report.pretrig_s,
        "stop_mode":        report.record_stop_mode,
        "geo_range_ips":    report.geo_range_ips,
        "units":            report.units,
    }
    device = {
        "battery_volts":    report.battery_volts,
        "calibration_date": _iso(report.calibration_date),
        "calibration_by":   report.calibration_by,
    }
    peaks = {
        "tran": _ch("Tran"),
        "vert": _ch("Vert"),
        "long": _ch("Long"),
        "vector_sum": {
            "ips":    report.peak_vector_sum_ips,
            "time_s": report.peak_vector_sum_time_s,
            # Histogram events carry an absolute date+time for the PVS
            # (the interval at which it occurred); waveform events only
            # have the time_s offset.
            "when":   _iso(report.peak_vector_sum_when),
            # Set when BW reported the PVS as OORANGE — the value is then
            # the conservative upper bound sqrt(3) * geo_range_ips, not
            # an exact peak.
            "saturated": bool(getattr(report, "peak_vector_sum_saturated", False)),
        },
    }
    mic = {
        "weighting":             report.mic.weighting,
        "pspl_dbl":              report.mic.pspl_dbl,
        "pspl_saturated":        bool(getattr(report.mic, "pspl_saturated", False)),
        "zc_freq_hz":            report.mic.zc_freq_hz,
        "zc_freq_above_range":   bool(getattr(report.mic, "zc_freq_above_range", False)),
        "time_of_peak_s":        report.mic.time_of_peak_s,
    }
    sensor_check = {
        "tran": _sc("Tran"),
        "vert": _sc("Vert"),
        "long": _sc("Long"),
        "mic":  _sc("MicL"),
    }
    # Histogram-specific fields (None on waveform-mode events).  The
    # per-channel absolute peak times live here; for waveform events the
    # per-channel "time_of_peak_s" under `peaks` is the equivalent.
    histogram = {
        "start":               _iso(report.histogram_start),
        "stop":                _iso(report.histogram_stop),
        "n_intervals":         report.histogram_n_intervals,
        "interval_size":       report.histogram_interval_size_str,
        "interval_size_s":     report.histogram_interval_size_s,
        "channel_peak_when":   {
            name: when.isoformat()
            for name, when in report.channel_peak_when.items()
        },
    }

    return {
        "available":     True,
        "event_type":    report.event_type,
        "version":       report.version,
        "trigger":       trigger,
        "recording":     recording,
        "device":        device,
        "peaks":         peaks,
        "mic":           mic,
        "sensor_check":  sensor_check,
        "histogram":     histogram,
        "monitor_log":   log_entries,
        "pc_sw_version": report.pc_sw_version,
    }


def _dbl_to_psi(pspl_dbl: float) -> float:
    """Convert a dB(L) sound pressure level to pressure in psi.

    Inverts the usual ``20 * log10(p / p_ref)`` definition using the
    module's reference pressure ``_DBL_REF_PSI`` (20 µPa expressed in
    psi), the same reference the webapp uses, so server-side and
    browser-side conversions agree.
    """
    amplitude_ratio = 10.0 ** (pspl_dbl / 20.0)
    return _DBL_REF_PSI * amplitude_ratio


def apply_report_to_event(event: Event, report: BwAsciiReport) -> None:
    """Overlay device-authoritative report fields onto *event*, in place.

    Why this exists
    ───────────────
    `read_blastware_file()` fills `Event.peak_values` by treating the BW
    body as raw int16 LE samples; the body codec is not yet decoded, so
    those peaks are noise-shaped rather than real readings.  A paired BW
    ASCII report carries the device's own peak / project / sample-rate /
    record-time values, and this helper folds them onto the Event before
    it is handed on, so downstream consumers see the report's values.

    Fields written (only when the report supplies a usable value):
      - peak_values.tran / .vert / .long   ← report.channels[...].ppv_ips
      - peak_values.peak_vector_sum        ← report.peak_vector_sum_ips
      - peak_values.micl (psi)             ← report.mic.pspl_dbl, when > 0
      - project_info.project / .client / .operator / .sensor_location
        (non-empty report values only)
      - sample_rate                        ← report.sample_rate_sps (truthy)
      - rectime_seconds                    ← report.record_time_s (not None)

    Nothing else on the Event is modified.  Missing `peak_values` /
    `project_info` containers are created empty first.
    """
    if event.peak_values is None:
        event.peak_values = PeakValues()
    peaks = event.peak_values

    channels = report.channels
    for channel_name, attr in (("Tran", "tran"), ("Vert", "vert"), ("Long", "long")):
        stats = channels.get(channel_name)
        if stats and stats.ppv_ips is not None:
            setattr(peaks, attr, stats.ppv_ips)

    if report.peak_vector_sum_ips is not None:
        peaks.peak_vector_sum = report.peak_vector_sum_ips

    # The report expresses the mic peak in dB(L); the Event stores psi.
    if report.mic.pspl_dbl is not None and report.mic.pspl_dbl > 0:
        peaks.micl = _dbl_to_psi(report.mic.pspl_dbl)

    if event.project_info is None:
        event.project_info = ProjectInfo()
    info = event.project_info
    for field in ("project", "client", "operator", "sensor_location"):
        # Empty strings / None leave the existing value untouched.
        if getattr(report, field):
            setattr(info, field, getattr(report, field))

    if report.sample_rate_sps:
        event.sample_rate = report.sample_rate_sps
    if report.record_time_s is not None:
        event.rectime_seconds = report.record_time_s


def apply_bw_report_dict_to_event(event: Event, bw_report: dict) -> None:
    """Overlay a sidecar ``bw_report`` dict onto *event*, in place.

    Counterpart of ``apply_report_to_event`` for the projected dict shape
    produced by ``_bw_report_to_dict``.

    Why this exists
    ───────────────
    The ingest path holds a live ``BwAsciiReport`` and overlays it with
    ``apply_report_to_event``.  The backfill path no longer has the
    original ``.TXT`` but does have the ``bw_report`` block preserved in
    the sidecar; re-applying it keeps the peak columns aligned with what
    BW reported instead of letting codec output win by default.

    Fields written (only when the dict holds a concrete value):
      - peak_values.tran / .vert / .long   ← peaks.<ch>.ppv_ips
      - peak_values.peak_vector_sum        ← peaks.vector_sum.ips
      - peak_values.micl (psi)             ← mic.pspl_dbl, when > 0
      - sample_rate                        ← recording.sample_rate_sps (truthy)
      - rectime_seconds                    ← recording.record_time_s (not None)

    Unlike ``apply_report_to_event`` this does not touch ``project_info``.
    A ``None`` / empty *bw_report* is a no-op, and any missing sub-block
    or sub-field is simply skipped.
    """
    if not bw_report:
        return
    if event.peak_values is None:
        event.peak_values = PeakValues()
    target = event.peak_values

    peak_block = bw_report.get("peaks") or {}
    # Read all three channel readings first, then assign; the sidecar
    # keys and the PeakValues attribute names coincide.
    readings = {
        axis: (peak_block.get(axis) or {}).get("ppv_ips")
        for axis in ("tran", "vert", "long")
    }
    for axis, ppv in readings.items():
        if ppv is not None:
            setattr(target, axis, ppv)

    vector_sum = (peak_block.get("vector_sum") or {}).get("ips")
    if vector_sum is not None:
        target.peak_vector_sum = vector_sum

    mic_level = (bw_report.get("mic") or {}).get("pspl_dbl")
    if mic_level is not None and mic_level > 0:
        target.micl = _dbl_to_psi(mic_level)

    recording = bw_report.get("recording") or {}
    rate = recording.get("sample_rate_sps")
    if rate:
        event.sample_rate = rate
    duration = recording.get("record_time_s")
    if duration is not None:
        event.rectime_seconds = duration


def _project_info_to_dict(pi: Optional[ProjectInfo]) -> dict:
    """Project a ProjectInfo object into the sidecar's `project_info` block.

    A None input yields the same four keys, all mapped to None, so the
    sidecar shape does not depend on whether project info was recovered.
    """
    fields = ("project", "client", "operator", "sensor_location")
    if pi is None:
        return dict.fromkeys(fields)
    return {name: getattr(pi, name) for name in fields}


def event_to_sidecar_dict(
    event: Event,
    *,
    serial: str,
    blastware_filename: str,
    blastware_filesize: int,
    blastware_sha256: str,
    source_kind: str = "sfm-live",
    txt_filename: Optional[str] = None,
    a5_pickle_filename: Optional[str] = None,
    tool_version: str = _TOOL_VERSION_DEFAULT,
    captured_at: Optional[datetime.datetime] = None,
    review: Optional[dict] = None,
    extensions: Optional[dict] = None,
    bw_report: Optional[BwAsciiReport] = None,
) -> dict:
    """
    Assemble a schema-v1 sidecar dict from an Event and its file metadata.

    Pure helper — performs no file I/O and does not modify *event*.

    Raises ValueError when *source_kind* is not one of ``"sfm-live"``,
    ``"sfm-ach"``, ``"bw-import"``, ``"idf-import"``.

    *captured_at* defaults to the current UTC time; a naive datetime is
    stamped with a trailing ``"Z"``, an aware one is emitted as-is.
    *review* defaults to an un-reviewed block and *extensions* to ``{}``.

    When *bw_report* is supplied (e.g. by the ACH-forwarded import path,
    where Blastware writes a per-event ASCII report next to the binary),
    its values take precedence in the output dict:

      - a top-level ``bw_report`` block is added (see
        ``_bw_report_to_dict``);
      - ``peak_values`` takes the report's per-channel PPV, the Peak
        Vector Sum, and the mic peak (converted from dB(L) to psi);
      - ``project_info`` takes each non-empty report field;
      - ``event.timestamp`` in the sidecar takes the report's event
        date/time when the report has one;
      - ``event.sample_rate`` / ``rectime_seconds`` take the report's
        values, and ``total_samples`` / ``pretrig_samples`` are derived
        from sample rate × (record time + |pre-trigger|).
    """
    known_kinds = {"sfm-live", "sfm-ach", "bw-import", "idf-import"}
    if source_kind not in known_kinds:
        raise ValueError(f"unknown source_kind: {source_kind!r}")

    if not captured_at:
        captured_at = datetime.datetime.utcnow()

    # ── Timestamp: report value wins when present ───────────────────────────
    timestamp_iso = _ts_iso(event.timestamp)
    if bw_report and bw_report.event_datetime:
        timestamp_iso = bw_report.event_datetime.isoformat()

    # ── Peak values, optionally overlaid from the report ────────────────────
    # The report stores the mic peak as PSPL (dB(L)); it is converted to
    # psi to match the existing peak_values.mic_psi field.
    peak_dict = _peak_values_to_dict(event.peak_values)
    if bw_report:
        channel_stats = bw_report.channels
        axis_keys = (
            ("Tran", "transverse"),
            ("Vert", "vertical"),
            ("Long", "longitudinal"),
        )
        for channel_name, key in axis_keys:
            stats = channel_stats.get(channel_name)
            if stats and stats.ppv_ips is not None:
                peak_dict[key] = stats.ppv_ips
        if bw_report.peak_vector_sum_ips is not None:
            peak_dict["vector_sum"] = bw_report.peak_vector_sum_ips
        mic_level = bw_report.mic.pspl_dbl
        if mic_level is not None and mic_level > 0:
            peak_dict["mic_psi"] = _dbl_to_psi(mic_level)

    # ── Project info: non-empty report fields replace the Event's ───────────
    proj_dict = _project_info_to_dict(event.project_info)
    if bw_report:
        for field in ("project", "client", "operator", "sensor_location"):
            reported = getattr(bw_report, field)
            if reported:
                proj_dict[field] = reported

    # ── Event block ─────────────────────────────────────────────────────────
    key_bytes = event._waveform_key
    event_block = {
        "serial":           serial,
        "timestamp":        timestamp_iso,
        "waveform_key":     key_bytes.hex() if key_bytes else None,
        "record_type":      event.record_type,
        "sample_rate":      event.sample_rate,
        "rectime_seconds":  event.rectime_seconds,
        "total_samples":    event.total_samples,
        "pretrig_samples":  event.pretrig_samples,
    }
    if bw_report:
        # Report values are authoritative — they are the user-configured
        # values BW reads back, not STRT-derived guesses.  In particular
        # `read_blastware_file()` takes rectime_seconds from STRT[18],
        # which is actually the `0x46` record-type marker (= 70) rather
        # than the user's Record Time setting.
        rate = bw_report.sample_rate_sps
        duration = bw_report.record_time_s
        if rate:
            event_block["sample_rate"] = rate
        if duration is not None:
            event_block["rectime_seconds"] = duration
        # Sample counts follow from rate × times; this matches the row
        # count of the report's sample table (verified: event-c reports
        # 1024 sps × (1.0 + 0.25) = 1280 rows).
        if rate and duration is not None:
            if bw_report.pretrig_s is not None:
                pretrig_s = abs(bw_report.pretrig_s)
            else:
                pretrig_s = 0.0
            event_block["total_samples"]   = int(round(rate * (duration + pretrig_s)))
            event_block["pretrig_samples"] = int(round(rate * pretrig_s))

    # Naive datetimes are assumed to be UTC and get an explicit "Z".
    if captured_at.tzinfo is None:
        captured_stamp = captured_at.isoformat() + "Z"
    else:
        captured_stamp = captured_at.isoformat()

    if not review:
        review = {
            "false_trigger": False,
            "reviewer":      None,
            "reviewed_at":   None,
            "notes":         "",
        }

    sidecar = {
        "schema_version": SCHEMA_VERSION,
        "kind":           SIDECAR_KIND,

        "event":        event_block,
        "peak_values":  peak_dict,
        "project_info": proj_dict,

        "blastware": {
            "filename":  blastware_filename,
            "filesize":  blastware_filesize,
            "sha256":    blastware_sha256,
            "available": True,
        },

        "source": {
            "kind":               source_kind,
            "captured_at":        captured_stamp,
            "tool_version":       tool_version,
            "a5_pickle_filename": a5_pickle_filename,
            "txt_filename":       txt_filename,
        },

        "review": review,

        "extensions": extensions or {},
    }

    if bw_report:
        sidecar["bw_report"] = _bw_report_to_dict(bw_report)

    return sidecar


# ── Sidecar IO ────────────────────────────────────────────────────────────────


def write_sidecar(path: Union[str, Path], data: dict) -> None:
    """
    Atomic write of a sidecar dict to <path>.

    The JSON is written to ``<path>.tmp`` in the same directory, flushed
    and fsync'd, then moved over *path* with ``os.replace``.  If any step
    fails the temporary file is removed again, so a failed write never
    leaves a stray ``.tmp`` behind; the original exception is re-raised.

    Validates schema_version is supported before writing so we don't
    silently drop a future-format sidecar over the wire.

    Raises:
        ValueError: ``data["schema_version"]`` is missing, not an int
            (booleans are rejected too), or outside 1..SCHEMA_VERSION.
        OSError / TypeError / ValueError: propagated from the file write
            or JSON serialization.
    """
    path = Path(path)
    sv = data.get("schema_version")
    # bool is a subclass of int, so exclude it explicitly — otherwise
    # `True` would be accepted as schema version 1.
    if isinstance(sv, bool) or not isinstance(sv, int) or sv < 1 or sv > SCHEMA_VERSION:
        raise ValueError(
            f"write_sidecar: unsupported schema_version={sv!r} "
            f"(this build supports 1..{SCHEMA_VERSION})"
        )

    tmp = path.with_suffix(path.suffix + ".tmp")
    try:
        with tmp.open("w", encoding="utf-8") as f:
            # default=str: non-JSON-native values are stringified rather
            # than aborting the write.
            json.dump(data, f, indent=2, sort_keys=False, default=str)
            f.write("\n")
            f.flush()
            os.fsync(f.fileno())
        os.replace(tmp, path)
    except BaseException:
        # Don't leave a half-written temp file next to the real sidecar.
        try:
            tmp.unlink()
        except OSError:
            pass
        raise


def read_sidecar(path: Union[str, Path]) -> dict:
    """
    Load and validate a sidecar JSON file.

    Unknown keys at the top level are preserved in the returned dict
    (forward-compat).

    Raises:
        FileNotFoundError: *path* does not exist.
        ValueError: the file is not valid JSON (``json.JSONDecodeError``
            is a ValueError subclass), the top level is not an object,
            ``schema_version`` is missing / not a positive int (booleans
            are rejected) / newer than this build supports, or ``kind``
            is not the expected sidecar kind.
    """
    path = Path(path)
    with path.open("r", encoding="utf-8") as f:
        data = json.load(f)
    if not isinstance(data, dict):
        raise ValueError(f"sidecar at {path}: top-level is not a JSON object")
    sv = data.get("schema_version")
    # bool is a subclass of int; without the explicit check a JSON `true`
    # would be accepted as schema version 1.
    if isinstance(sv, bool) or not isinstance(sv, int) or sv < 1:
        raise ValueError(f"sidecar at {path}: missing or invalid schema_version")
    if sv > SCHEMA_VERSION:
        raise ValueError(
            f"sidecar at {path}: schema_version={sv} > supported {SCHEMA_VERSION}; "
            "upgrade seismo-relay to read this file"
        )
    if data.get("kind") != SIDECAR_KIND:
        raise ValueError(f"sidecar at {path}: unexpected kind={data.get('kind')!r}")
    return data


def patch_sidecar(
    path: Union[str, Path],
    *,
    review: Optional[dict] = None,
    extensions: Optional[dict] = None,
    reviewer_now: bool = True,
) -> dict:
    """
    Merge updates into a sidecar file's `review` and/or `extensions`
    blocks and write the file back via `write_sidecar()`.  Other
    top-level keys are untouched.

    `review`: keys are merged into the existing review block.  A None
    value only overwrites a key that already exists; it never adds a new
    key.

    `extensions`: keys are merged over the existing extensions block.

    `reviewer_now`: when True (default) and `review` is non-empty, stamps
    `review.reviewed_at` with the current UTC time so the review-time is
    auditable without the caller having to pass it.

    Returns the new full sidecar dict.  Raises whatever `read_sidecar()`
    / `write_sidecar()` raise.
    """
    path = Path(path)
    sidecar = read_sidecar(path)

    if review:
        current_review = dict(sidecar.get("review") or {})
        for field, value in review.items():
            if value is not None or field in current_review:
                current_review[field] = value
        if reviewer_now:
            stamp = datetime.datetime.utcnow().isoformat() + "Z"
            current_review["reviewed_at"] = stamp
        sidecar["review"] = current_review

    if extensions:
        current_ext = dict(sidecar.get("extensions") or {})
        current_ext.update(extensions)
        sidecar["extensions"] = current_ext

    write_sidecar(path, sidecar)
    return sidecar


def sidecar_path_for(blastware_path: Union[str, Path]) -> Path:
    """Return the sidecar path for a BW binary.

    Convention: the sidecar sits next to the binary and is named by
    appending ``.sfm.json`` to the binary's full filename (the original
    extension is kept).
    """
    bw_file = Path(blastware_path)
    sidecar_name = f"{bw_file.name}.sfm.json"
    return bw_file.with_name(sidecar_name)


def file_sha256(path: Union[str, Path], chunk_size: int = 65536) -> str:
    """Return the SHA-256 digest of the file at *path* as a hex string.

    The file is read in binary mode, *chunk_size* bytes at a time, so
    large files are hashed without being loaded into memory at once.
    """
    digest = hashlib.sha256()
    with open(path, "rb") as stream:
        while piece := stream.read(chunk_size):
            digest.update(piece)
    return digest.hexdigest()


# ── Blastware-file reader ─────────────────────────────────────────────────────
#
# Reverse of `blastware_file.write_blastware_file`.  Used by the BW-import
# flow to ingest files produced by Blastware's own ACH (where the source
# A5 frames are not available).
#
# File structure (recap):
#   [22B header] [21B STRT record] [body bytes] [26B footer]
#
# The body holds:
#   - 6B preamble (00 00 ff ff ff ff) immediately after the STRT
#   - 4-channel interleaved int16 LE samples
#   - Embedded ASCII metadata strings (Project: / Client: / User Name: /
#     Seis Loc: / Extended Notes) from the device's session-start config
#
# The 0C waveform record (per-event peaks, project name) is NOT in the
# BW file — those are computed by the device firmware and only carried
# in the live SUB 0C response.  read_blastware_file() therefore computes
# peaks from the raw samples assuming Normal-range (10 in/s full-scale)
# geophone sensitivity.  Imported events surface that assumption via the
# sidecar's `peak_values.computed_from_samples` flag.


# Geophone scale factor, in/s per ADC unit, for Normal range (10 in/s FS).
# Confirmed from CLAUDE.md (geo_hardware_constant = 6.206053 in/s per V,
# ADC full-scale = 1.61133 V Normal range = 10.0 in/s peak; per-count
# resolution ≈ 10.0 / 32768).
_GEO_NORMAL_FS_INS  = 10.0
_GEO_SENSITIVE_FS_INS = 1.250
_INT16_FS = 32768.0

# Microphone scale factor, psi per ADC count.  Approximate — exact factor
# depends on the geophone-vs-mic ADC scaling and the firmware reference.
# We mark mic_psi as "computed approximate" in the sidecar.
# NOTE(review): as written, a full-scale reading (32768 counts) maps to
# 0.0125 psi, which does not match the "~0.5 psi full-scale" wording in
# the trailing comment — confirm which figure is intended.
_MIC_FS_PSI = 0.0125 / _INT16_FS   # ~0.5 psi full-scale assumption


def _decode_strt(strt: bytes) -> dict:
    """
    Decode the 21-byte STRT record from a BW file.

    Returns a dict with:
      - waveform_key     hex string of bytes 6..9
      - total_samples    big-endian uint16 at offset 8
      - pretrig_samples  big-endian uint16 at offset 16
      - rectime_seconds  the single byte at offset 18

    Returns an empty dict when the record is shorter than 21 bytes or
    does not begin with the ``STRT`` tag.

    NOTE(review): the total_samples field (offsets 8..9) overlaps the
    last two bytes of the waveform key (offsets 6..9) — confirm the
    intended offsets against the file writer.
    """
    if len(strt) < 21:
        return {}
    if strt[0:4] != b"STRT":
        return {}

    def _u16_be(offset: int) -> int:
        # Unsigned 16-bit big-endian field starting at `offset`.
        return int.from_bytes(strt[offset:offset + 2], "big")

    return {
        "waveform_key":    strt[6:10].hex(),
        "total_samples":   _u16_be(8),
        "pretrig_samples": _u16_be(16),
        "rectime_seconds": strt[18],
    }


def _find_first_string(buf: bytes, label: bytes, max_len: int = 256) -> Optional[str]:
    """
    Return the text that follows the first occurrence of `label` in `buf`.

    The value runs from just after the label (e.g. b"Project:") up to the
    first NUL byte, or up to `max_len` bytes when no NUL is found inside
    that window.  It is decoded as ASCII (undecodable bytes become the
    replacement character) and stripped of surrounding whitespace.

    Returns None when the label is absent or the resulting text is empty.
    """
    label_at = buf.find(label)
    if label_at < 0:
        return None

    value_from = label_at + len(label)
    window_end = value_from + max_len
    nul_at = buf.find(b"\x00", value_from, window_end)
    value_to = window_end if nul_at < 0 else nul_at

    decoded = buf[value_from:value_to].decode("ascii", errors="replace").strip()
    return decoded if decoded else None


def _decode_samples_4ch_int16_le(stream: bytes) -> dict[str, list[int]]:
    """
    Split a 4-channel interleaved int16 little-endian byte stream into
    per-channel sample lists.

    Each 8-byte frame holds one sample per channel in the order
    Tran, Vert, Long, MicL.  Trailing bytes that do not complete a frame
    are ignored; a stream shorter than one frame yields four empty lists.
    """
    usable = len(stream) - len(stream) % 8
    if not usable:
        return {"Tran": [], "Vert": [], "Long": [], "MicL": []}

    # One (tran, vert, long, mic) tuple per frame, transposed into columns.
    frames = struct.iter_unpack("<4h", stream[:usable])
    tran, vert, long_, mic = (list(column) for column in zip(*frames))
    return {
        "Tran": tran,
        "Vert": vert,
        "Long": long_,
        "MicL": mic,
    }


def _peaks_from_samples(samples: dict[str, list[int]]) -> PeakValues:
    """
    Estimate peak values from raw int16 samples.

    Geophone channels are scaled assuming Normal-range sensitivity
    (full-scale int16 = `_GEO_NORMAL_FS_INS` in/s); the mic channel is
    scaled by `_MIC_FS_PSI` per count.  Used by the BW-importer when the
    0C waveform record (the device's authoritative peaks) is unavailable,
    so every value returned here is approximate.

    Missing or empty channels contribute a peak of 0.
    """
    def _peak_ins(counts: list[int]) -> float:
        # Largest absolute count on one geophone channel, in in/s.
        if not counts:
            return 0.0
        biggest = max(abs(int(c)) for c in counts)
        return biggest / _INT16_FS * _GEO_NORMAL_FS_INS

    tran_raw = samples.get("Tran", [])
    vert_raw = samples.get("Vert", [])
    long_raw = samples.get("Long", [])

    tran_peak = _peak_ins(tran_raw)
    vert_peak = _peak_ins(vert_raw)
    long_peak = _peak_ins(long_raw)

    # Mic in psi (approximate).
    mic_raw = samples.get("MicL", []) or []
    mic_peak = max((abs(int(c)) for c in mic_raw), default=0) * _MIC_FS_PSI

    # Peak vector sum: max over time of sqrt(T^2 + V^2 + L^2), taken over
    # the samples all three geophone channels have in common.
    per_count = _GEO_NORMAL_FS_INS / _INT16_FS
    vector_peak = 0.0
    for t_raw, v_raw, l_raw in zip(tran_raw, vert_raw, long_raw):
        t = t_raw * per_count
        v = v_raw * per_count
        l = l_raw * per_count
        magnitude = (t*t + v*v + l*l) ** 0.5
        if magnitude > vector_peak:
            vector_peak = magnitude

    return PeakValues(
        tran=tran_peak,
        vert=vert_peak,
        long=long_peak,
        peak_vector_sum=vector_peak,
        micl=mic_peak,
    )


# Type code (last character of the file extension, uppercased) → record_type.
_RECORD_TYPE_BY_EXT_SUFFIX = dict(
    H='Histogram',
    W='Waveform',
    M='Manual',
    E='Event',
    C='Combo',
)


def derive_record_type_from_filename(filename, default: str = "Waveform") -> str:
    """Derive a BW Event's record_type from its filename's extension suffix.

    V10.72+ MiniMate Plus firmware encodes the event type as the LAST
    character of the extension (the `T` in BW's `AB0T` scheme):

        ``M529LKIQ.G10H``  →  H  →  ``"Histogram"``
        ``T350L385.VY0W``  →  W  →  ``"Waveform"``
        ``...M``           →  M  →  ``"Manual"``
        ``...E``           →  E  →  ``"Event"``
        ``...C``           →  C  →  ``"Combo"``

    The lookup is case-insensitive and only considers the final path
    component.  Old S338 firmware uses 3-char extensions ending in ``0``
    whose encoding is not yet known — those fall through to ``default``.
    Micromate Series 4 names (observed: ``IDFH``, ``IDFW``) follow the
    same last-character convention, so they resolve here too.

    Returns ``default`` if filename is empty, cannot be interpreted as a
    path, has no extension, or the suffix char isn't a recognized type
    code.
    """
    if not filename:
        return default
    try:
        basename = Path(filename).name
    except (TypeError, ValueError):
        return default

    _stem, dot, extension = basename.rpartition('.')
    if not dot or not extension:
        return default

    type_code = extension[-1].upper()
    return _RECORD_TYPE_BY_EXT_SUFFIX.get(type_code, default)


def read_blastware_file(path: Union[str, Path]) -> Event:
    """
    Load a Blastware event file from disk and rebuild an Event from it.

    Reconstructed:
      - waveform_key, rectime_seconds, total_samples, pretrig_samples
        (from the 21-byte STRT record that follows the file header)
      - timestamp (from the start-time field of the 26-byte footer)
      - project_info (from labelled ASCII strings found in the body)
      - raw_samples (channel lists keyed Tran / Vert / Long / MicL)
      - peak_values (computed from raw_samples when at least one channel
        has data; approximate — see the notes on _peaks_from_samples
        about the Normal-range assumption; None when no samples decoded)
      - record_type (from the last character of the file's extension)

    NOT reconstructed: the source A5 frames, which a BW file does not
    contain.  `_a5_frames` is set to None on the returned Event to signal
    that the BW file cannot be regenerated byte-for-byte from the Event
    alone — the on-disk BW file IS the byte-for-byte source.

    Raises:
        ValueError: the file is shorter than header + STRT + footer, the
            header prefix does not match, the STRT record does not
            decode, or no footer position at/after the body start exists.
        Any exception raised by ``Path.read_bytes`` (e.g. OSError)
        propagates unchanged.
    """
    path = Path(path)
    raw = path.read_bytes()

    header_len = _bw._WAVEFORM_HEADER_SIZE
    strt_len   = 21
    footer_len = 26
    body_start = header_len + strt_len

    if len(raw) < body_start + footer_len:
        raise ValueError(f"{path}: file too short ({len(raw)} bytes) to be a BW event")

    # The header must open with the Blastware magic prefix.
    if not raw[:header_len].startswith(_bw._FILE_HEADER_PREFIX):
        raise ValueError(f"{path}: not a Blastware file (bad header prefix)")

    # The STRT record sits directly behind the header.
    strt_fields = _decode_strt(raw[header_len:body_start])
    if not strt_fields:
        raise ValueError(f"{path}: STRT record missing or malformed")

    # Footer: scan forward from the body start for the ``0e 08`` marker.
    # A candidate is accepted only when the big-endian 16-bit value at
    # marker+4 is a plausible year; the scan stops as soon as a candidate
    # could no longer hold a full footer before end-of-file.
    marker = b"\x0e\x08"
    footer_pos = -1
    candidate = raw.find(marker, body_start)
    while candidate >= 0 and candidate + footer_len <= len(raw):
        year = int.from_bytes(raw[candidate + 4 : candidate + 6], "big")
        if 2015 <= year <= 2050:
            footer_pos = candidate
            break
        candidate = raw.find(marker, candidate + 1)

    # No validated marker: treat the file's last 26 bytes as the footer.
    if footer_pos < 0 and len(raw) >= footer_len:
        footer_pos = len(raw) - footer_len
    if footer_pos < body_start:
        raise ValueError(f"{path}: footer not found")

    body   = raw[body_start:footer_pos]
    footer = raw[footer_pos : footer_pos + footer_len]

    # Footer layout:
    #   [0:2]   0e 08  marker
    #   [2:10]  ts1 (start) BE 8B
    #   [10:18] ts2 (stop)  BE 8B
    #   [18:24] 00 01 00 02 00 00
    #   [24:26] crc
    start_ts = _bw._decode_ts_be(footer[2:10])
    # The stop time is decoded here but not used further in this function.
    stop_ts = _bw._decode_ts_be(footer[10:18])

    # Body decoding.  Two body formats coexist:
    #
    #   1. Waveform-mode (.AB0W) — a 7-byte preamble
    #      ``00 02 00 [Tran[0] BE] [Tran[1] BE]`` followed by the
    #      tagged-block delta stream described in
    #      ``docs/waveform_codec_re_status.md`` and §7.6.1 of the
    #      protocol reference.  Handled by ``decode_waveform_v2``.
    #
    #   2. Histogram-mode (.AB0H) — consecutive 32-byte blocks, one per
    #      histogram interval, each holding per-channel peak +
    #      half-period values.  Handled by ``decode_histogram_body``.
    #
    # Both codecs return the same channel-grouped shape, so nothing
    # downstream needs to special-case the mode.  The waveform codec is
    # tried first; the histogram codec only runs when it returns None.
    #
    # History: the old ``_decode_samples_4ch_int16_le`` int16-LE reading
    # was retracted 2026-05-08 (see the retraction box in protocol-ref
    # §7.6.1) — it produced ±32K noise on every event.
    #
    # When both codecs return None (malformed file, truncated body,
    # unrecognised mode, synthetic test input) the channels are left
    # empty; timestamp, waveform_key and project strings are still
    # recoverable and useful.
    decoded = decode_waveform_v2(body)
    if decoded is None:
        decoded = decode_histogram_body(body)

    if decoded is not None:
        samples = decoded_to_adc_counts(decoded)
    else:
        log.warning(
            "%s: body codec failed to decode (body starts %s) — "
            "raw_samples will be empty", path, body[:8].hex(" "),
        )
        samples = {"Tran": [], "Vert": [], "Long": [], "MicL": []}

    # Project metadata: label-anchored string search across the body.
    info_fields = {
        "project":         _find_first_string(body, b"Project:"),
        "client":          _find_first_string(body, b"Client:"),
        "operator":        _find_first_string(body, b"User Name:"),
        "sensor_location": _find_first_string(body, b"Seis Loc:"),
    }

    # Assemble the Event.
    ev = Event(index=-1)
    key_hex = strt_fields.get("waveform_key")
    if key_hex:
        ev._waveform_key = bytes.fromhex(key_hex)

    # record_type comes from the extension's last character (H/W/M/E/C).
    # When the caller is save_imported_bw, `path` is a tmp file with a
    # ".bw" suffix, so this yields "Waveform" (the trailing "w" maps to
    # it) and the caller overrides ev.record_type from the original
    # filename — see waveform_store.save_imported_bw.
    ev.record_type     = derive_record_type_from_filename(path.name)
    ev.rectime_seconds = strt_fields.get("rectime_seconds")
    ev.total_samples   = strt_fields.get("total_samples")
    ev.pretrig_samples = strt_fields.get("pretrig_samples")

    if start_ts is not None:
        ev.timestamp = Timestamp(
            raw=footer[2:10],
            flag=0x10,
            year=start_ts.year, unknown_byte=0,
            month=start_ts.month, day=start_ts.day,
            hour=start_ts.hour, minute=start_ts.minute,
            second=start_ts.second,
        )

    ev.project_info = ProjectInfo(**info_fields)
    ev.raw_samples = samples

    # Peaks are computed only when at least one channel holds samples.
    # With every channel empty (neither body codec could decode),
    # ``_peaks_from_samples`` would return PeakValues(0, 0, 0, 0, 0),
    # which would then OVERWRITE existing good DB peak values (e.g. from
    # paired BW ASCII reports) during the backfill UPSERT path.
    # peak_values=None instead signals "we don't know" to downstream
    # consumers; the backfill script seeds from the DB row when it sees
    # None, and ``apply_report_to_event`` overlays from a paired ASCII
    # report when one is supplied.
    if any(samples.get(ch) for ch in ("Tran", "Vert", "Long", "MicL")):
        ev.peak_values = _peaks_from_samples(samples)
    else:
        ev.peak_values = None
    ev._a5_frames = None  # not recoverable from BW file

    return ev