seismo-relay/sfm/event_hdf5.py

"""
sfm/event_hdf5.py — HDF5 codec for the canonical "clean waveform" file.

Layout written to `<filename>.h5`:

  /
  ├─ samples/
  │   ├─ Tran  (float32, in/s)         shape: (N,)
  │   ├─ Vert  (float32, in/s)         shape: (N,)
  │   ├─ Long  (float32, in/s)         shape: (N,)
  │   └─ MicL  (float32, psi)          shape: (N,)
  ├─ samples_int16/ (optional)
  │   ├─ Tran  (int16, raw ADC counts) shape: (N,)
  │   └─ ... per channel (only when present in the source)
  └─ root attrs (event metadata):
       schema_version       int    = 1
       kind                 str    = "sfm.event.hdf5"
       serial               str
       waveform_key         str    (8-hex)
       timestamp            str    (ISO-8601)
       record_type          str
       sample_rate          int    (sps)
       pretrig_samples      int
       total_samples        int
       rectime_seconds      float
       geo_range            str    "normal" | "sensitive"
       geo_full_scale_ips   float  (10.0 or 1.250)
       project              str
       client               str
       operator             str
       sensor_location      str
       peak_tran_ips        float  (from 0C; authoritative)
       peak_vert_ips        float
       peak_long_ips        float
       peak_pvs_ips         float
       peak_mic_psi         float
       tool_version         str
       captured_at          str    (ISO-8601 UTC)
       source_kind          str    "sfm-live" | "sfm-ach" | "bw-import"

Why HDF5 and not just JSON for the canonical clean format:
  - Native float32 arrays (no base64 dance, no per-value JSON parsing).
  - Per-dataset gzip compression — sample arrays compress 3-5×.
  - Cross-language: h5py (Python), HDF5.jl (Julia), io.netcdf (R), etc.
    Analysis pipelines don't have to know anything about Blastware.
  - Self-describing via attributes; future fields don't break readers.

The plot-ready `sfm.plot.v1` JSON returned by the REST endpoints is
derived from this HDF5 (or computed on-the-fly when no .h5 exists yet).
"""

from __future__ import annotations

import datetime
import logging
from pathlib import Path
from typing import Optional, Union

import h5py
import numpy as np

from minimateplus.event_file_io import TOOL_VERSION as _DEFAULT_TOOL_VERSION
from minimateplus.models import Event

log = logging.getLogger(__name__)

SCHEMA_VERSION = 1
HDF5_KIND = "sfm.event.hdf5"

# Geophone full-scale velocity per range (in/s).  Confirmed in CLAUDE.md
# from 4-20-26 captures: Normal=0x00 → 10 in/s, Sensitive=0x01 → 1.25 in/s.
_GEO_FS_BY_RANGE = {
    "normal":    10.000,
    "sensitive": 1.2500,
    0:           10.000,
    1:           1.2500,
}
_INT16_FS = 32768.0

# Default mic conversion: ADC count → psi.  Approximate; exact factor
# depends on firmware reference voltage and mic sensitivity, neither of
# which is independently confirmed.  We try to refine it from the device-
# reported peak when available (peak_mic_psi / max_abs_int16).
_MIC_DEFAULT_FS_PSI = 0.0125    # ≈ 0.5 psi at full scale (rough)


def _resolve_geo_full_scale(geo_range) -> float:
    """Map a geo_range value (string or int from compliance config) to the
    full-scale velocity in in/s.  Defaults to Normal range (10.0) when the
    value is unknown — same default as Blastware itself."""
    if geo_range is None:
        return _GEO_FS_BY_RANGE["normal"]
    if isinstance(geo_range, str):
        return _GEO_FS_BY_RANGE.get(geo_range.lower(), _GEO_FS_BY_RANGE["normal"])
    return _GEO_FS_BY_RANGE.get(int(geo_range), _GEO_FS_BY_RANGE["normal"])


def _normalise_range(geo_range) -> str:
    """Return 'normal' or 'sensitive' (string) regardless of input form."""
    if isinstance(geo_range, str):
        v = geo_range.lower()
        if v in ("normal", "sensitive"):
            return v
        return "normal"
    if geo_range == 1:
        return "sensitive"
    return "normal"


def _ts_iso(ts) -> str:
    if ts is None:
        return ""
    try:
        return datetime.datetime(
            ts.year, ts.month, ts.day,
            ts.hour or 0, ts.minute or 0, ts.second or 0,
        ).isoformat()
    except Exception:
        return str(ts)


def _samples_to_float(
    samples_int16: list[int],
    full_scale: float,
) -> np.ndarray:
    """Convert int16 ADC counts → float32 physical units.

    Uses _INT16_FS=32768 (not 32767) so that a count of -32768 maps to
    exactly -full_scale and +32767 maps to ~+full_scale * 32767/32768.
    Matches the device firmware's documented mapping (see CLAUDE.md
    geo_hardware_constant rationale).
    """
    if not samples_int16:
        return np.array([], dtype=np.float32)
    arr = np.asarray(samples_int16, dtype=np.int32)  # int32 to avoid overflow during scale
    return (arr.astype(np.float32) * (full_scale / _INT16_FS)).astype(np.float32)


def _mic_scale_factor(
    samples_int16: list[int],
    peak_mic_psi: Optional[float],
) -> float:
    """Resolve the per-count psi factor for the microphone channel.

    When the device reports a peak mic value via the 0C record, we
    back-solve the per-count factor from `peak_psi / max(|samples|)` so
    the plotted waveform peaks land exactly at the device-reported value.
    Otherwise fall back to the rough _MIC_DEFAULT_FS_PSI estimate.
    """
    if peak_mic_psi is not None and peak_mic_psi > 0 and samples_int16:
        max_count = max(abs(int(v)) for v in samples_int16) or 1
        return float(peak_mic_psi) / float(max_count)
    return _MIC_DEFAULT_FS_PSI / _INT16_FS


def write_event_hdf5(
    path: Union[str, Path],
    event: Event,
    *,
    serial: str,
    geo_range = "normal",
    source_kind: str = "sfm-live",
    tool_version: Optional[str] = None,
    captured_at: Optional[datetime.datetime] = None,
    include_int16: bool = True,
) -> dict:
    """
    Persist a decoded Event as an HDF5 file with samples in physical units.

    Returns a small summary dict suitable for logging:
      {"path": Path, "n_samples": int, "geo_full_scale_ips": float}
    """
    path = Path(path)
    raw = event.raw_samples or {}
    pv  = event.peak_values
    pi  = event.project_info

    geo_fs = _resolve_geo_full_scale(geo_range)
    geo_range_str = _normalise_range(geo_range)
    captured_at = captured_at or datetime.datetime.utcnow()
    tool_version = tool_version or _DEFAULT_TOOL_VERSION

    # Per-channel float32 arrays in physical units.
    geo_arrays = {}
    for ch in ("Tran", "Vert", "Long"):
        geo_arrays[ch] = _samples_to_float(raw.get(ch, []), geo_fs)

    # Mic channel — the per-count factor is resolved from the device-reported
    # peak when available so the plot peaks the BW value exactly.
    mic_int16 = raw.get("MicL", [])
    mic_factor = _mic_scale_factor(
        mic_int16,
        getattr(pv, "micl", None) if pv else None,
    )
    if mic_int16:
        mic_arr = (np.asarray(mic_int16, dtype=np.int32).astype(np.float32) * mic_factor).astype(np.float32)
    else:
        mic_arr = np.array([], dtype=np.float32)

    n_samples = max(
        (len(geo_arrays[ch]) for ch in geo_arrays),
        default=0,
    )

    # Atomic write: temp file + os.replace.
    tmp = path.with_suffix(path.suffix + ".tmp")
    with h5py.File(tmp, "w") as f:
        # Root attrs — event-level metadata.
        attrs = f.attrs
        attrs["schema_version"]  = SCHEMA_VERSION
        attrs["kind"]            = HDF5_KIND
        attrs["serial"]          = serial or ""
        attrs["waveform_key"]    = event._waveform_key.hex() if event._waveform_key else ""
        attrs["timestamp"]       = _ts_iso(event.timestamp)
        attrs["record_type"]     = event.record_type or ""
        attrs["sample_rate"]     = int(event.sample_rate or 0)
        attrs["pretrig_samples"] = int(event.pretrig_samples or 0)
        attrs["total_samples"]   = int(event.total_samples or n_samples)
        attrs["rectime_seconds"] = float(event.rectime_seconds or 0.0)
        attrs["geo_range"]       = geo_range_str
        attrs["geo_full_scale_ips"] = float(geo_fs)
        attrs["project"]         = (pi.project         if pi else "") or ""
        attrs["client"]          = (pi.client          if pi else "") or ""
        attrs["operator"]        = (pi.operator        if pi else "") or ""
        attrs["sensor_location"] = (pi.sensor_location if pi else "") or ""
        attrs["peak_tran_ips"]   = float(pv.tran            if pv and pv.tran            is not None else 0.0)
        attrs["peak_vert_ips"]   = float(pv.vert            if pv and pv.vert            is not None else 0.0)
        attrs["peak_long_ips"]   = float(pv.long            if pv and pv.long            is not None else 0.0)
        attrs["peak_pvs_ips"]    = float(pv.peak_vector_sum if pv and pv.peak_vector_sum is not None else 0.0)
        attrs["peak_mic_psi"]    = float(pv.micl            if pv and pv.micl            is not None else 0.0)
        attrs["tool_version"]    = tool_version or ""
        attrs["captured_at"]     = captured_at.isoformat() + "Z" if captured_at.tzinfo is None else captured_at.isoformat()
        attrs["source_kind"]     = source_kind

        # /samples — physical-units float32 (the primary data).
        sgrp = f.create_group("samples")
        for ch, arr in geo_arrays.items():
            sgrp.create_dataset(
                ch, data=arr, dtype="float32",
                compression="gzip", compression_opts=4, shuffle=True,
            )
        sgrp.create_dataset(
            "MicL", data=mic_arr, dtype="float32",
            compression="gzip", compression_opts=4, shuffle=True,
        )

        # /samples_int16 — optional raw ADC counts (preserved for analysis
        # tools that want pre-conversion data).  Cheap to include.
        if include_int16:
            igrp = f.create_group("samples_int16")
            for ch in ("Tran", "Vert", "Long", "MicL"):
                vals = raw.get(ch, [])
                if vals:
                    igrp.create_dataset(
                        ch, data=np.asarray(vals, dtype=np.int16),
                        compression="gzip", compression_opts=4, shuffle=True,
                    )
            igrp.attrs["mic_psi_per_count"] = float(mic_factor)

    import os
    os.replace(tmp, path)

    log.info(
        "write_event_hdf5: %s n_samples=%d geo_fs=%.3f filesize=%d",
        path, n_samples, geo_fs, path.stat().st_size,
    )
    return {
        "path": path,
        "n_samples": n_samples,
        "geo_full_scale_ips": geo_fs,
    }


def read_event_hdf5(path: Union[str, Path]) -> dict:
    """
    Load an event HDF5 into a plain dict (no Event reconstruction —
    callers that want an Event can use the data directly).

    Returns:
        {
          "schema_version": int,
          "kind": str,
          "attrs": dict[str, …],          # all root attributes
          "samples": {                     # float32 lists in physical units
            "Tran": ndarray, "Vert": ndarray, "Long": ndarray, "MicL": ndarray,
          },
          "samples_int16": {…} or None,
          "mic_psi_per_count": float | None,
        }

    Raises FileNotFoundError if missing, ValueError on bad shape /
    unsupported schema_version.
    """
    path = Path(path)
    with h5py.File(path, "r") as f:
        attrs = {k: _h5_attr_value(v) for k, v in f.attrs.items()}
        sv = attrs.get("schema_version", 0)
        if not isinstance(sv, int) or sv < 1 or sv > SCHEMA_VERSION:
            raise ValueError(
                f"{path}: unsupported HDF5 schema_version={sv} "
                f"(this build supports 1..{SCHEMA_VERSION})"
            )
        if attrs.get("kind") != HDF5_KIND:
            raise ValueError(f"{path}: kind != {HDF5_KIND!r} (got {attrs.get('kind')!r})")

        samples = {}
        for ch in ("Tran", "Vert", "Long", "MicL"):
            ds = f.get(f"samples/{ch}")
            samples[ch] = np.asarray(ds[()]) if ds is not None else np.array([], dtype=np.float32)

        samples_int16 = None
        mic_psi = None
        igrp = f.get("samples_int16")
        if igrp is not None:
            samples_int16 = {}
            for ch in ("Tran", "Vert", "Long", "MicL"):
                ds = igrp.get(ch)
                if ds is not None:
                    samples_int16[ch] = np.asarray(ds[()])
            mic_attr = igrp.attrs.get("mic_psi_per_count")
            if mic_attr is not None:
                mic_psi = float(mic_attr)

    return {
        "schema_version":     sv,
        "kind":               attrs.get("kind"),
        "attrs":              attrs,
        "samples":            samples,
        "samples_int16":      samples_int16,
        "mic_psi_per_count":  mic_psi,
    }


def _h5_attr_value(v):
    """Convert an h5py attribute value to a plain Python type."""
    if isinstance(v, bytes):
        return v.decode("utf-8", errors="replace")
    if isinstance(v, np.generic):
        return v.item()
    return v


# ── Plot-ready JSON ──────────────────────────────────────────────────────────


def event_to_plot_json(
    event: Event,
    *,
    serial: str,
    geo_range = "normal",
    event_id: Optional[str] = None,
    index: Optional[int] = None,
) -> dict:
    """
    Build a `sfm.plot.v1` JSON dict directly from an Event (skipping HDF5).

    Used by:
      - `/device/event/{idx}/waveform`  (live device path)
      - The CLI / tests for in-memory conversion sanity-checks.

    Stored events go through `plot_json_from_hdf5()` so the wire format
    is identical regardless of whether the data came from the live device
    or the on-disk HDF5.
    """
    raw = event.raw_samples or {}
    pv  = event.peak_values
    geo_fs = _resolve_geo_full_scale(geo_range)
    geo_range_str = _normalise_range(geo_range)
    sr = int(event.sample_rate or 0) or 1024
    pretrig = int(event.pretrig_samples or 0)

    geo_arrays = {ch: _samples_to_float(raw.get(ch, []), geo_fs).tolist()
                  for ch in ("Tran", "Vert", "Long")}
    mic_int16 = raw.get("MicL", [])
    mic_factor = _mic_scale_factor(
        mic_int16,
        getattr(pv, "micl", None) if pv else None,
    )
    mic_arr = [float(v) * mic_factor for v in mic_int16] if mic_int16 else []

    n = max(
        (len(geo_arrays[ch]) for ch in geo_arrays),
        default=len(mic_arr),
    )
    return _build_plot_dict(
        n_samples=n,
        sample_rate=sr,
        pretrig_samples=pretrig,
        total_samples=int(event.total_samples or n),
        rectime_seconds=float(event.rectime_seconds or 0.0),
        timestamp_iso=_ts_iso(event.timestamp),
        serial=serial,
        record_type=event.record_type,
        waveform_key=event._waveform_key.hex() if event._waveform_key else None,
        geo_range=geo_range_str,
        geo_fs=geo_fs,
        channels_floats={
            "Tran": geo_arrays["Tran"],
            "Vert": geo_arrays["Vert"],
            "Long": geo_arrays["Long"],
            "MicL": mic_arr,
        },
        peaks_dict={
            "tran": getattr(pv, "tran",            None) if pv else None,
            "vert": getattr(pv, "vert",            None) if pv else None,
            "long": getattr(pv, "long",            None) if pv else None,
            "pvs":  getattr(pv, "peak_vector_sum", None) if pv else None,
            "mic":  getattr(pv, "micl",            None) if pv else None,
        },
        event_id=event_id,
        index=index if index is not None else event.index,
    )


def plot_json_from_hdf5(
    path: Union[str, Path],
    *,
    event_id: Optional[str] = None,
    index: Optional[int] = None,
) -> dict:
    """Build a `sfm.plot.v1` JSON dict from a stored .h5 file."""
    data = read_event_hdf5(path)
    a = data["attrs"]
    s = data["samples"]
    return _build_plot_dict(
        n_samples=len(s["Tran"]) if "Tran" in s else 0,
        sample_rate=int(a.get("sample_rate", 1024) or 1024),
        pretrig_samples=int(a.get("pretrig_samples", 0) or 0),
        total_samples=int(a.get("total_samples", 0) or 0),
        rectime_seconds=float(a.get("rectime_seconds", 0.0) or 0.0),
        timestamp_iso=a.get("timestamp", ""),
        serial=a.get("serial", ""),
        record_type=a.get("record_type", ""),
        waveform_key=a.get("waveform_key", "") or None,
        geo_range=a.get("geo_range", "normal"),
        geo_fs=float(a.get("geo_full_scale_ips", 10.0) or 10.0),
        channels_floats={
            "Tran": s.get("Tran", np.array([])).tolist(),
            "Vert": s.get("Vert", np.array([])).tolist(),
            "Long": s.get("Long", np.array([])).tolist(),
            "MicL": s.get("MicL", np.array([])).tolist(),
        },
        peaks_dict={
            "tran": float(a.get("peak_tran_ips", 0.0) or 0.0) or None,
            "vert": float(a.get("peak_vert_ips", 0.0) or 0.0) or None,
            "long": float(a.get("peak_long_ips", 0.0) or 0.0) or None,
            "pvs":  float(a.get("peak_pvs_ips",  0.0) or 0.0) or None,
            "mic":  float(a.get("peak_mic_psi",  0.0) or 0.0) or None,
        },
        event_id=event_id,
        index=index,
    )


def _build_plot_dict(
    *,
    n_samples: int,
    sample_rate: int,
    pretrig_samples: int,
    total_samples: int,
    rectime_seconds: float,
    timestamp_iso: str,
    serial: str,
    record_type: Optional[str],
    waveform_key: Optional[str],
    geo_range: str,
    geo_fs: float,
    channels_floats: dict[str, list[float]],
    peaks_dict: dict[str, Optional[float]],
    event_id: Optional[str],
    index: Optional[int] = None,
) -> dict:
    dt_ms = (1000.0 / sample_rate) if sample_rate > 0 else 0.0
    t0_ms = -pretrig_samples * dt_ms

    def _ch(unit: str, values: list[float], peak: Optional[float]) -> dict:
        # Locate the peak's time within the values array (max abs).
        if values:
            mags = [abs(v) for v in values]
            i = mags.index(max(mags))
            peak_t_ms = round(t0_ms + i * dt_ms, 4)
            peak_value = peak if peak is not None else values[i]
        else:
            peak_t_ms = None
            peak_value = peak
        return {
            "unit":       unit,
            "values":     values,
            "peak":       peak_value,
            "peak_t_ms":  peak_t_ms,
        }

    return {
        "schema":      "sfm.plot.v1",
        "event_id":    event_id,
        "index":       index,
        "serial":      serial,
        "timestamp":   timestamp_iso,
        "record_type": record_type,
        "waveform_key": waveform_key,

        "time_axis": {
            "sample_rate":     sample_rate,
            "pretrig_samples": pretrig_samples,
            "total_samples":   total_samples or n_samples,
            "n_samples":       n_samples,
            "t0_ms":           round(t0_ms, 4),
            "dt_ms":           round(dt_ms, 6),
            "rectime_seconds": rectime_seconds,
        },

        "geo_range":          geo_range,
        "geo_full_scale_ips": geo_fs,
        "trigger_ms":         0.0,

        "channels": {
            "Tran": _ch("in/s", channels_floats.get("Tran", []), peaks_dict.get("tran")),
            "Vert": _ch("in/s", channels_floats.get("Vert", []), peaks_dict.get("vert")),
            "Long": _ch("in/s", channels_floats.get("Long", []), peaks_dict.get("long")),
            "MicL": _ch("psi",  channels_floats.get("MicL", []), peaks_dict.get("mic")),
        },

        "peak_values": {
            "transverse":   peaks_dict.get("tran"),
            "vertical":     peaks_dict.get("vert"),
            "longitudinal": peaks_dict.get("long"),
            "vector_sum":   peaks_dict.get("pvs"),
            "mic_psi":      peaks_dict.get("mic"),
        },
    }