feat: v0.15.0

### Added - **Layered event storage architecture.** Each event now lands as four files in the per-serial waveform store, each with a clear role: - `<filename>` — the Blastware-readable binary (BW file). Untouched. - `<filename>.a5.pkl` — the raw 5A frames (regenerative source). - `<filename>.h5` — clean per-channel waveform arrays in physical units (in/s for geo, psi for mic) plus event metadata (HDF5 with gzip compression). This is the canonical format for downstream analysis tools. - `<filename>.sfm.json` — the modern review/metadata sidecar (peaks, project, source provenance, review state, extensions). SQLite (`seismo_relay.db`) is the searchable index over all four. - **Plot-ready waveform JSON (`sfm.plot.v1`).** The `/device/event/{idx}/waveform` and `/db/events/{id}/waveform.json` endpoints now return samples in physical units with explicit time-axis metadata, peak markers, and per-channel unit hints — no more guessing the ADC-to-velocity scale client-side. The webapp waveform viewer was rewritten to consume this shape. - **In-app waveform viewer accuracy fix.** The standalone SFM webapp viewer was scaling geophone amplitudes by `geoAdcScale / 32767` (≈ 6.206 / 32767), where `geoAdcScale = 6.206053` is the device's *in/s per V* hardware constant — not the ADC-counts-to-velocity factor. This silently scaled every plot ~38% too low for Normal-range geophones (the correct full-scale is 10.0 in/s, or 1.25 in/s for Sensitive). Conversion is now done server-side using the geo_range from compliance config; the client just plots. - New `sfm/event_hdf5.py` module: `write_event_hdf5()`, `read_event_hdf5()`, plus a plot-JSON helper. - Backfill script extended to also emit `.h5` for existing events. ### Dependencies - Added `h5py>=3.10` and `numpy>=1.24` for the HDF5 storage layer. - Added `python-multipart>=0.0.7` (required by FastAPI for the `/db/import/blastware_file` endpoint introduced in this release).
2026-05-08 04:39:51 +00:00
parent 9afa3484f4
commit c641d5fc10
14 changed files with 3511 additions and 177 deletions
@@ -0,0 +1,530 @@
+"""
+sfm/event_hdf5.py — HDF5 codec for the canonical "clean waveform" file.
+
+Layout written to `<filename>.h5`:
+
+  /
+  ├─ samples/
+  │   ├─ Tran  (float32, in/s)         shape: (N,)
+  │   ├─ Vert  (float32, in/s)         shape: (N,)
+  │   ├─ Long  (float32, in/s)         shape: (N,)
+  │   └─ MicL  (float32, psi)          shape: (N,)
+  ├─ samples_int16/ (optional)
+  │   ├─ Tran  (int16, raw ADC counts) shape: (N,)
+  │   └─ ... per channel (only when present in the source)
+  └─ root attrs (event metadata):
+       schema_version       int    = 1
+       kind                 str    = "sfm.event.hdf5"
+       serial               str
+       waveform_key         str    (8-hex)
+       timestamp            str    (ISO-8601)
+       record_type          str
+       sample_rate          int    (sps)
+       pretrig_samples      int
+       total_samples        int
+       rectime_seconds      float
+       geo_range            str    "normal" | "sensitive"
+       geo_full_scale_ips   float  (10.0 or 1.250)
+       project              str
+       client               str
+       operator             str
+       sensor_location      str
+       peak_tran_ips        float  (from 0C; authoritative)
+       peak_vert_ips        float
+       peak_long_ips        float
+       peak_pvs_ips         float
+       peak_mic_psi         float
+       tool_version         str
+       captured_at          str    (ISO-8601 UTC)
+       source_kind          str    "sfm-live" | "sfm-ach" | "bw-import"
+
+Why HDF5 and not just JSON for the canonical clean format:
+  - Native float32 arrays (no base64 dance, no per-value JSON parsing).
+  - Per-dataset gzip compression — sample arrays compress 3-5×.
+  - Cross-language: h5py (Python), HDF5.jl (Julia), rhdf5 / hdf5r (R), etc.
+    Analysis pipelines don't have to know anything about Blastware.
+  - Self-describing via attributes; future fields don't break readers.
+
+The plot-ready `sfm.plot.v1` JSON returned by the REST endpoints is
+derived from this HDF5 (or computed on-the-fly when no .h5 exists yet).
+"""
+
+from __future__ import annotations
+
+import datetime
+import logging
+from pathlib import Path
+from typing import Optional, Union
+
+import h5py
+import numpy as np
+
+from minimateplus.event_file_io import TOOL_VERSION as _DEFAULT_TOOL_VERSION
+from minimateplus.models import Event
+
+log = logging.getLogger(__name__)
+
+# Schema revision stamped into the root attrs by write_event_hdf5(); also the
+# newest revision read_event_hdf5() accepts.
+SCHEMA_VERSION = 1
+# Value of the root "kind" attr; read_event_hdf5() rejects files whose "kind"
+# differs from this.
+HDF5_KIND = "sfm.event.hdf5"
+
+# Geophone full-scale velocity per range (in/s).  Confirmed in CLAUDE.md
+# from 4-20-26 captures: Normal=0x00 → 10 in/s, Sensitive=0x01 → 1.25 in/s.
+# Keyed both by the string name and by the integer code so either form of
+# geo_range can be looked up directly.
+_GEO_FS_BY_RANGE = {
+    "normal":    10.000,
+    "sensitive": 1.2500,
+    0:           10.000,
+    1:           1.2500,
+}
+# Divisor that turns an int16 ADC count into a fraction of full scale
+# (2**15, so a count of -32768 maps to exactly -full_scale).
+_INT16_FS = 32768.0
+
+# Default mic conversion: ADC count → psi.  Approximate; exact factor
+# depends on firmware reference voltage and mic sensitivity, neither of
+# which is independently confirmed.  We try to refine it from the device-
+# reported peak when available (peak_mic_psi / max_abs_int16).
+# NOTE(review): the value is divided by _INT16_FS to get psi-per-count, i.e.
+# it is used as the full-scale psi (0.0125).  The trailing remark below says
+# "≈ 0.5 psi at full scale", which disagrees with the value — confirm which
+# one is intended.
+_MIC_DEFAULT_FS_PSI = 0.0125    # ≈ 0.5 psi at full scale (rough)
+
+
+def _resolve_geo_full_scale(geo_range) -> float:
+    """Map a geo_range value (string or int from compliance config) to the
+    full-scale velocity in in/s.
+
+    Accepts the string form ("normal" / "sensitive", case-insensitive) or
+    the integer code (0 / 1) used as keys of ``_GEO_FS_BY_RANGE``.
+
+    Defaults to Normal range (10.0) when the value is unknown — same default
+    as Blastware itself.  "Unknown" covers ``None``, an unrecognised string
+    or code, and any value that cannot be converted to an int (NaN,
+    infinity, arbitrary objects): those fall back to the default instead of
+    raising, so one malformed config value cannot abort a conversion.
+    """
+    default_fs = _GEO_FS_BY_RANGE["normal"]
+    if geo_range is None:
+        return default_fs
+    if isinstance(geo_range, str):
+        return _GEO_FS_BY_RANGE.get(geo_range.lower(), default_fs)
+    try:
+        code = int(geo_range)
+    except (TypeError, ValueError, OverflowError):
+        # int() rejects non-numeric objects (TypeError), NaN (ValueError)
+        # and infinities (OverflowError); all of them mean "unknown range".
+        return default_fs
+    return _GEO_FS_BY_RANGE.get(code, default_fs)
+
+
+def _normalise_range(geo_range) -> str:
+    """Collapse any geo_range spelling to the string 'normal' or 'sensitive'.
+
+    Strings are matched case-insensitively; anything that is not exactly
+    one of the two names becomes 'normal'.  Non-string values map to
+    'sensitive' only when they compare equal to 1, otherwise 'normal'.
+    """
+    if not isinstance(geo_range, str):
+        return "sensitive" if geo_range == 1 else "normal"
+    lowered = geo_range.lower()
+    return lowered if lowered in ("normal", "sensitive") else "normal"
+
+
+def _ts_iso(ts) -> str:
+    """Render a timestamp-like object as an ISO-8601 string.
+
+    ``None`` yields "".  Otherwise the year/month/day/hour/minute/second
+    attributes of ``ts`` are fed to ``datetime.datetime`` (missing or falsy
+    time-of-day fields count as 0; sub-second precision is not carried
+    over).  If that fails for any reason the plain ``str(ts)`` is returned
+    instead — this is a best-effort formatter, not a validator.
+    """
+    if ts is None:
+        return ""
+    try:
+        hour = ts.hour or 0
+        minute = ts.minute or 0
+        second = ts.second or 0
+        stamp = datetime.datetime(ts.year, ts.month, ts.day, hour, minute, second)
+        return stamp.isoformat()
+    except Exception:
+        return str(ts)
+
+
+def _samples_to_float(
+    samples_int16: Union[list[int], np.ndarray],
+    full_scale: float,
+) -> np.ndarray:
+    """Convert int16 ADC counts → float32 physical units.
+
+    Uses _INT16_FS=32768 (not 32767) so that a count of -32768 maps to
+    exactly -full_scale and +32767 maps to ~+full_scale * 32767/32768.
+    Matches the device firmware's documented mapping (see CLAUDE.md
+    geo_hardware_constant rationale).
+
+    Args:
+        samples_int16: ADC counts as a list or a NumPy array (for example
+            the int16 arrays handed back by ``read_event_hdf5``).  ``None``
+            or an empty sequence yields an empty array.
+        full_scale: Physical value corresponding to a count of 32768.
+
+    Returns:
+        A float32 array with one value per input count.
+    """
+    # An ndarray has no unambiguous truth value, so test its size explicitly;
+    # every other input keeps the plain falsy check (None, [], ()).
+    if isinstance(samples_int16, np.ndarray):
+        is_empty = samples_int16.size == 0
+    else:
+        is_empty = not samples_int16
+    if is_empty:
+        return np.array([], dtype=np.float32)
+    # Go through int32 so out-of-range Python ints fail loudly rather than
+    # wrapping; the scaling itself happens in float32.
+    counts = np.asarray(samples_int16, dtype=np.int32)
+    return (counts.astype(np.float32) * (full_scale / _INT16_FS)).astype(np.float32)
+
+
+def _mic_scale_factor(
+    samples_int16: list[int],
+    peak_mic_psi: Optional[float],
+) -> float:
+    """Return the psi-per-ADC-count factor for the microphone channel.
+
+    With a positive device-reported peak and at least one sample, the factor
+    is back-solved as ``peak_mic_psi / max(|sample|)`` so the converted
+    waveform tops out exactly at the reported peak (an all-zero trace uses a
+    divisor of 1).  In every other case the rough default
+    ``_MIC_DEFAULT_FS_PSI / _INT16_FS`` is used.
+    """
+    peak_usable = peak_mic_psi is not None and peak_mic_psi > 0
+    if not (peak_usable and samples_int16):
+        return _MIC_DEFAULT_FS_PSI / _INT16_FS
+    # Python ints, not int16: abs(-32768) must not wrap.
+    largest = max(abs(int(count)) for count in samples_int16)
+    divisor = largest or 1
+    return float(peak_mic_psi) / float(divisor)
+
+
+def write_event_hdf5(
+    path: Union[str, Path],
+    event: Event,
+    *,
+    serial: str,
+    geo_range = "normal",
+    source_kind: str = "sfm-live",
+    tool_version: Optional[str] = None,
+    captured_at: Optional[datetime.datetime] = None,
+    include_int16: bool = True,
+) -> dict:
+    """
+    Persist a decoded Event as an HDF5 file with samples in physical units.
+
+    The file is written to ``<path>.tmp`` first and then moved over ``path``
+    so readers never see a half-written file.  If anything fails before the
+    move, the temp file is removed and the exception propagates.
+
+    Args:
+        path: Destination ``.h5`` path.
+        event: Decoded event; ``raw_samples`` supplies per-channel ADC
+            counts, ``peak_values`` / ``project_info`` supply metadata.
+        serial: Device serial stored in the root attrs.
+        geo_range: "normal" / "sensitive" or the 0 / 1 code; selects the
+            geophone full scale used for the count → in/s conversion.
+        source_kind: Provenance tag stored verbatim in the root attrs.
+        tool_version: Overrides the default tool version string.
+        captured_at: Capture time; defaults to the current UTC time.  Naive
+            values are written with a trailing "Z".
+        include_int16: Also store the raw ADC counts under
+            ``/samples_int16``.
+
+    Returns a small summary dict suitable for logging:
+      {"path": Path, "n_samples": int, "geo_full_scale_ips": float}
+    where ``n_samples`` is the length of the longest channel (mic included).
+    """
+    path = Path(path)
+    raw = event.raw_samples or {}
+    pv  = event.peak_values
+    pi  = event.project_info
+
+    geo_fs = _resolve_geo_full_scale(geo_range)
+    geo_range_str = _normalise_range(geo_range)
+    if captured_at is None:
+        # Naive UTC "now" without the deprecated datetime.utcnow(); kept
+        # naive so the stored string keeps its "...Z" form.
+        captured_at = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
+    tool_version = tool_version or _DEFAULT_TOOL_VERSION
+
+    # Per-channel float32 arrays in physical units.
+    geo_arrays = {
+        ch: _samples_to_float(raw.get(ch, []), geo_fs)
+        for ch in ("Tran", "Vert", "Long")
+    }
+
+    # Mic channel — the per-count factor is resolved from the device-reported
+    # peak when available so the plot peaks the BW value exactly.
+    mic_int16 = raw.get("MicL", [])
+    mic_factor = _mic_scale_factor(
+        mic_int16,
+        getattr(pv, "micl", None) if pv else None,
+    )
+    if mic_int16:
+        mic_arr = (np.asarray(mic_int16, dtype=np.int32).astype(np.float32) * mic_factor).astype(np.float32)
+    else:
+        mic_arr = np.array([], dtype=np.float32)
+
+    # Longest channel wins, mic included, so a mic-only event does not
+    # report zero samples.
+    n_samples = max(len(arr) for arr in (*geo_arrays.values(), mic_arr))
+
+    # Atomic write: temp file + rename.  The temp file is cleaned up if
+    # anything goes wrong before the rename completes.
+    tmp = path.with_suffix(path.suffix + ".tmp")
+    try:
+        with h5py.File(tmp, "w") as f:
+            # Root attrs — event-level metadata.
+            attrs = f.attrs
+            attrs["schema_version"]  = SCHEMA_VERSION
+            attrs["kind"]            = HDF5_KIND
+            attrs["serial"]          = serial or ""
+            attrs["waveform_key"]    = event._waveform_key.hex() if event._waveform_key else ""
+            attrs["timestamp"]       = _ts_iso(event.timestamp)
+            attrs["record_type"]     = event.record_type or ""
+            attrs["sample_rate"]     = int(event.sample_rate or 0)
+            attrs["pretrig_samples"] = int(event.pretrig_samples or 0)
+            attrs["total_samples"]   = int(event.total_samples or n_samples)
+            attrs["rectime_seconds"] = float(event.rectime_seconds or 0.0)
+            attrs["geo_range"]       = geo_range_str
+            attrs["geo_full_scale_ips"] = float(geo_fs)
+            attrs["project"]         = (pi.project         if pi else "") or ""
+            attrs["client"]          = (pi.client          if pi else "") or ""
+            attrs["operator"]        = (pi.operator        if pi else "") or ""
+            attrs["sensor_location"] = (pi.sensor_location if pi else "") or ""
+            # Missing peaks are stored as 0.0 (HDF5 attrs have no None).
+            attrs["peak_tran_ips"]   = float(pv.tran            if pv and pv.tran            is not None else 0.0)
+            attrs["peak_vert_ips"]   = float(pv.vert            if pv and pv.vert            is not None else 0.0)
+            attrs["peak_long_ips"]   = float(pv.long            if pv and pv.long            is not None else 0.0)
+            attrs["peak_pvs_ips"]    = float(pv.peak_vector_sum if pv and pv.peak_vector_sum is not None else 0.0)
+            attrs["peak_mic_psi"]    = float(pv.micl            if pv and pv.micl            is not None else 0.0)
+            attrs["tool_version"]    = tool_version or ""
+            # Naive datetimes are taken to be UTC and tagged with "Z";
+            # aware ones already carry their offset in isoformat().
+            if captured_at.tzinfo is None:
+                attrs["captured_at"] = captured_at.isoformat() + "Z"
+            else:
+                attrs["captured_at"] = captured_at.isoformat()
+            attrs["source_kind"]     = source_kind
+
+            # /samples — physical-units float32 (the primary data).
+            sgrp = f.create_group("samples")
+            for ch, arr in geo_arrays.items():
+                sgrp.create_dataset(
+                    ch, data=arr, dtype="float32",
+                    compression="gzip", compression_opts=4, shuffle=True,
+                )
+            sgrp.create_dataset(
+                "MicL", data=mic_arr, dtype="float32",
+                compression="gzip", compression_opts=4, shuffle=True,
+            )
+
+            # /samples_int16 — optional raw ADC counts (preserved for analysis
+            # tools that want pre-conversion data).  Cheap to include.
+            if include_int16:
+                igrp = f.create_group("samples_int16")
+                for ch in ("Tran", "Vert", "Long", "MicL"):
+                    vals = raw.get(ch, [])
+                    if vals:
+                        igrp.create_dataset(
+                            ch, data=np.asarray(vals, dtype=np.int16),
+                            compression="gzip", compression_opts=4, shuffle=True,
+                        )
+                igrp.attrs["mic_psi_per_count"] = float(mic_factor)
+
+        # Path.replace wraps os.replace: overwrites an existing destination.
+        tmp.replace(path)
+    except BaseException:
+        # Do not leave a stale "<name>.h5.tmp" behind; the original error is
+        # what the caller needs to see, so cleanup failures are ignored.
+        try:
+            tmp.unlink(missing_ok=True)
+        except OSError:
+            pass
+        raise
+
+    log.info(
+        "write_event_hdf5: %s n_samples=%d geo_fs=%.3f filesize=%d",
+        path, n_samples, geo_fs, path.stat().st_size,
+    )
+    return {
+        "path": path,
+        "n_samples": n_samples,
+        "geo_full_scale_ips": geo_fs,
+    }
+
+
+def read_event_hdf5(path: Union[str, Path]) -> dict:
+    """
+    Read an event HDF5 file back into a plain dict (no Event is rebuilt).
+
+    Returns:
+        {
+          "schema_version": int,
+          "kind": str,
+          "attrs": dict[str, …],          # every root attribute, as plain Python values
+          "samples": {                     # ndarrays as stored under /samples
+            "Tran": ndarray, "Vert": ndarray, "Long": ndarray, "MicL": ndarray,
+          },
+          "samples_int16": {…} or None,   # only the channels present; None if the group is absent
+          "mic_psi_per_count": float | None,
+        }
+
+    A channel missing from /samples comes back as an empty float32 array.
+
+    Raises ValueError when schema_version is not an int in
+    1..SCHEMA_VERSION or when the root "kind" attr is not HDF5_KIND.
+    Errors from opening the file (e.g. a missing path) propagate from h5py.
+    """
+    path = Path(path)
+    channel_names = ("Tran", "Vert", "Long", "MicL")
+
+    with h5py.File(path, "r") as h5:
+        attrs = {}
+        for name, value in h5.attrs.items():
+            attrs[name] = _h5_attr_value(value)
+
+        sv = attrs.get("schema_version", 0)
+        version_ok = isinstance(sv, int) and 1 <= sv <= SCHEMA_VERSION
+        if not version_ok:
+            raise ValueError(
+                f"{path}: unsupported HDF5 schema_version={sv} "
+                f"(this build supports 1..{SCHEMA_VERSION})"
+            )
+        kind = attrs.get("kind")
+        if kind != HDF5_KIND:
+            raise ValueError(f"{path}: kind != {HDF5_KIND!r} (got {kind!r})")
+
+        # Physical-unit arrays; absent channels become empty float32 arrays.
+        samples = {}
+        for ch in channel_names:
+            node = h5.get(f"samples/{ch}")
+            if node is None:
+                samples[ch] = np.array([], dtype=np.float32)
+            else:
+                samples[ch] = np.asarray(node[()])
+
+        # Raw counts are optional: the whole group may be missing, and
+        # within it only channels that had data were written.
+        samples_int16 = None
+        mic_psi = None
+        raw_group = h5.get("samples_int16")
+        if raw_group is not None:
+            samples_int16 = {}
+            for ch in channel_names:
+                node = raw_group.get(ch)
+                if node is not None:
+                    samples_int16[ch] = np.asarray(node[()])
+            stored_factor = raw_group.attrs.get("mic_psi_per_count")
+            if stored_factor is not None:
+                mic_psi = float(stored_factor)
+
+    result = {
+        "schema_version":     sv,
+        "kind":               kind,
+        "attrs":              attrs,
+        "samples":            samples,
+        "samples_int16":      samples_int16,
+        "mic_psi_per_count":  mic_psi,
+    }
+    return result
+
+
+def _h5_attr_value(v):
+    """Turn an h5py attribute value into a plain Python value.
+
+    Byte strings are decoded as UTF-8 (undecodable bytes are replaced),
+    NumPy scalars are unwrapped with ``.item()``, and everything else is
+    returned unchanged.  The bytes check runs first, so a NumPy bytes
+    scalar is decoded rather than unwrapped.
+    """
+    if isinstance(v, bytes):
+        return v.decode("utf-8", errors="replace")
+    return v.item() if isinstance(v, np.generic) else v
+
+
+# ── Plot-ready JSON ──────────────────────────────────────────────────────────
+
+
+def event_to_plot_json(
+    event: Event,
+    *,
+    serial: str,
+    geo_range = "normal",
+    event_id: Optional[str] = None,
+    index: Optional[int] = None,
+) -> dict:
+    """
+    Build a `sfm.plot.v1` JSON dict directly from an Event (skipping HDF5).
+
+    Used by:
+      - `/device/event/{idx}/waveform`  (live device path)
+      - The CLI / tests for in-memory conversion sanity-checks.
+
+    Stored events go through `plot_json_from_hdf5()` so the wire format
+    is identical regardless of whether the data came from the live device
+    or the on-disk HDF5.
+
+    Args:
+        event: Decoded event supplying raw ADC counts, peaks and timing.
+        serial: Device serial echoed into the payload.
+        geo_range: "normal" / "sensitive" or the 0 / 1 code; selects the
+            geophone full scale used for the count → in/s conversion.
+        event_id: Optional identifier echoed into the payload.
+        index: Optional event index; falls back to ``event.index``.
+
+    A missing or zero sample rate is reported as 1024 sps.  ``n_samples``
+    is the length of the longest channel, mic included.
+    """
+    raw = event.raw_samples or {}
+    pv  = event.peak_values
+    geo_fs = _resolve_geo_full_scale(geo_range)
+    geo_range_str = _normalise_range(geo_range)
+    sr = int(event.sample_rate or 0) or 1024
+    pretrig = int(event.pretrig_samples or 0)
+
+    geo_arrays = {ch: _samples_to_float(raw.get(ch, []), geo_fs).tolist()
+                  for ch in ("Tran", "Vert", "Long")}
+    mic_int16 = raw.get("MicL", [])
+    mic_factor = _mic_scale_factor(
+        mic_int16,
+        getattr(pv, "micl", None) if pv else None,
+    )
+    mic_arr = [float(v) * mic_factor for v in mic_int16] if mic_int16 else []
+
+    # geo_arrays always holds all three geophone keys, so a max() over it
+    # alone can never fall back to the mic length; include the mic channel
+    # explicitly so a mic-only event does not report zero samples.
+    n = max(len(vals) for vals in (*geo_arrays.values(), mic_arr))
+
+    def _peak(name: str) -> Optional[float]:
+        # Peaks come from the device record; None when there is no record.
+        return getattr(pv, name, None) if pv else None
+
+    return _build_plot_dict(
+        n_samples=n,
+        sample_rate=sr,
+        pretrig_samples=pretrig,
+        total_samples=int(event.total_samples or n),
+        rectime_seconds=float(event.rectime_seconds or 0.0),
+        timestamp_iso=_ts_iso(event.timestamp),
+        serial=serial,
+        record_type=event.record_type,
+        waveform_key=event._waveform_key.hex() if event._waveform_key else None,
+        geo_range=geo_range_str,
+        geo_fs=geo_fs,
+        channels_floats={
+            "Tran": geo_arrays["Tran"],
+            "Vert": geo_arrays["Vert"],
+            "Long": geo_arrays["Long"],
+            "MicL": mic_arr,
+        },
+        peaks_dict={
+            "tran": _peak("tran"),
+            "vert": _peak("vert"),
+            "long": _peak("long"),
+            "pvs":  _peak("peak_vector_sum"),
+            "mic":  _peak("micl"),
+        },
+        event_id=event_id,
+        index=index if index is not None else event.index,
+    )
+
+
+def plot_json_from_hdf5(
+    path: Union[str, Path],
+    *,
+    event_id: Optional[str] = None,
+    index: Optional[int] = None,
+) -> dict:
+    """Build a `sfm.plot.v1` JSON dict from a stored .h5 file.
+
+    Args:
+        path: The ``.h5`` file to read (via ``read_event_hdf5``, whose
+            errors propagate unchanged).
+        event_id: Optional identifier echoed into the payload.
+        index: Optional event index echoed into the payload.
+
+    ``n_samples`` is the length of the longest stored channel rather than
+    the Tran channel alone, so a file whose Tran dataset is empty still
+    reports its real sample count.  Peak attributes stored as 0.0 (the
+    writer's placeholder for "no peak") are surfaced as ``None``.
+    """
+    data = read_event_hdf5(path)
+    a = data["attrs"]
+    s = data["samples"]
+    channel_names = ("Tran", "Vert", "Long", "MicL")
+
+    def _peak(attr_name: str) -> Optional[float]:
+        # 0.0 / missing both mean "no device-reported peak".
+        return float(a.get(attr_name, 0.0) or 0.0) or None
+
+    return _build_plot_dict(
+        n_samples=max((len(s[ch]) for ch in channel_names if ch in s), default=0),
+        sample_rate=int(a.get("sample_rate", 1024) or 1024),
+        pretrig_samples=int(a.get("pretrig_samples", 0) or 0),
+        total_samples=int(a.get("total_samples", 0) or 0),
+        rectime_seconds=float(a.get("rectime_seconds", 0.0) or 0.0),
+        timestamp_iso=a.get("timestamp", ""),
+        serial=a.get("serial", ""),
+        record_type=a.get("record_type", ""),
+        waveform_key=a.get("waveform_key", "") or None,
+        geo_range=a.get("geo_range", "normal"),
+        geo_fs=float(a.get("geo_full_scale_ips", 10.0) or 10.0),
+        channels_floats={
+            ch: s.get(ch, np.array([])).tolist() for ch in channel_names
+        },
+        peaks_dict={
+            "tran": _peak("peak_tran_ips"),
+            "vert": _peak("peak_vert_ips"),
+            "long": _peak("peak_long_ips"),
+            "pvs":  _peak("peak_pvs_ips"),
+            "mic":  _peak("peak_mic_psi"),
+        },
+        event_id=event_id,
+        index=index,
+    )
+
+
+def _build_plot_dict(
+    *,
+    n_samples: int,
+    sample_rate: int,
+    pretrig_samples: int,
+    total_samples: int,
+    rectime_seconds: float,
+    timestamp_iso: str,
+    serial: str,
+    record_type: Optional[str],
+    waveform_key: Optional[str],
+    geo_range: str,
+    geo_fs: float,
+    channels_floats: dict[str, list[float]],
+    peaks_dict: dict[str, Optional[float]],
+    event_id: Optional[str],
+    index: Optional[int] = None,
+) -> dict:
+    """Assemble the `sfm.plot.v1` payload from already-converted values.
+
+    The time axis places sample 0 at ``-pretrig_samples * dt_ms`` so the
+    trigger sits at 0 ms; a non-positive sample rate gives ``dt_ms == 0``.
+    Each channel entry carries its unit, values, a peak (the supplied peak
+    when not None, else the sample with the largest magnitude) and the time
+    of that largest-magnitude sample (None for an empty channel).
+    """
+    if sample_rate > 0:
+        dt_ms = 1000.0 / sample_rate
+    else:
+        dt_ms = 0.0
+    t0_ms = -pretrig_samples * dt_ms
+
+    def _channel_entry(unit: str, values: list[float], peak: Optional[float]) -> dict:
+        entry = {
+            "unit":       unit,
+            "values":     values,
+            "peak":       peak,
+            "peak_t_ms":  None,
+        }
+        if not values:
+            return entry
+        # First index holding the largest absolute value.
+        peak_idx = max(range(len(values)), key=lambda k: abs(values[k]))
+        entry["peak_t_ms"] = round(t0_ms + peak_idx * dt_ms, 4)
+        if peak is None:
+            entry["peak"] = values[peak_idx]
+        return entry
+
+    # (channel name, unit, key into peaks_dict) in output order.
+    channel_spec = (
+        ("Tran", "in/s", "tran"),
+        ("Vert", "in/s", "vert"),
+        ("Long", "in/s", "long"),
+        ("MicL", "psi",  "mic"),
+    )
+    channels = {
+        name: _channel_entry(unit, channels_floats.get(name, []), peaks_dict.get(peak_key))
+        for name, unit, peak_key in channel_spec
+    }
+
+    time_axis = {
+        "sample_rate":     sample_rate,
+        "pretrig_samples": pretrig_samples,
+        "total_samples":   total_samples or n_samples,
+        "n_samples":       n_samples,
+        "t0_ms":           round(t0_ms, 4),
+        "dt_ms":           round(dt_ms, 6),
+        "rectime_seconds": rectime_seconds,
+    }
+
+    peak_values = {
+        "transverse":   peaks_dict.get("tran"),
+        "vertical":     peaks_dict.get("vert"),
+        "longitudinal": peaks_dict.get("long"),
+        "vector_sum":   peaks_dict.get("pvs"),
+        "mic_psi":      peaks_dict.get("mic"),
+    }
+
+    return {
+        "schema":      "sfm.plot.v1",
+        "event_id":    event_id,
+        "index":       index,
+        "serial":      serial,
+        "timestamp":   timestamp_iso,
+        "record_type": record_type,
+        "waveform_key": waveform_key,
+
+        "time_axis": time_axis,
+
+        "geo_range":          geo_range,
+        "geo_full_scale_ips": geo_fs,
+        "trigger_ms":         0.0,
+
+        "channels": channels,
+
+        "peak_values": peak_values,
+    }