""" sfm/event_hdf5.py — HDF5 codec for the canonical "clean waveform" file. Layout written to `.h5`: / ├─ samples/ │ ├─ Tran (float32, in/s) shape: (N,) │ ├─ Vert (float32, in/s) shape: (N,) │ ├─ Long (float32, in/s) shape: (N,) │ └─ MicL (float32, psi) shape: (N,) ├─ samples_int16/ (optional) │ ├─ Tran (int16, raw ADC counts) shape: (N,) │ └─ ... per channel (only when present in the source) └─ root attrs (event metadata): schema_version int = 1 kind str = "sfm.event.hdf5" serial str waveform_key str (8-hex) timestamp str (ISO-8601) record_type str sample_rate int (sps) pretrig_samples int total_samples int rectime_seconds float geo_range str "normal" | "sensitive" geo_full_scale_ips float (10.0 or 1.250) project str client str operator str sensor_location str peak_tran_ips float (from 0C; authoritative) peak_vert_ips float peak_long_ips float peak_pvs_ips float peak_mic_psi float tool_version str captured_at str (ISO-8601 UTC) source_kind str "sfm-live" | "sfm-ach" | "bw-import" Why HDF5 and not just JSON for the canonical clean format: - Native float32 arrays (no base64 dance, no per-value JSON parsing). - Per-dataset gzip compression — sample arrays compress 3-5×. - Cross-language: h5py (Python), HDF5.jl (Julia), io.netcdf (R), etc. Analysis pipelines don't have to know anything about Blastware. - Self-describing via attributes; future fields don't break readers. The plot-ready `sfm.plot.v1` JSON returned by the REST endpoints is derived from this HDF5 (or computed on-the-fly when no .h5 exists yet). """ from __future__ import annotations import datetime import logging from pathlib import Path from typing import Optional, Union import h5py import numpy as np from minimateplus.event_file_io import TOOL_VERSION as _DEFAULT_TOOL_VERSION from minimateplus.models import Event log = logging.getLogger(__name__) SCHEMA_VERSION = 1 HDF5_KIND = "sfm.event.hdf5" # Geophone full-scale velocity per range (in/s). Confirmed in CLAUDE.md # from 4-20-26 captures: Normal=0x00 → 10 in/s, Sensitive=0x01 → 1.25 in/s. _GEO_FS_BY_RANGE = { "normal": 10.000, "sensitive": 1.2500, 0: 10.000, 1: 1.2500, } _INT16_FS = 32768.0 # Default mic conversion: ADC count → psi. Approximate; exact factor # depends on firmware reference voltage and mic sensitivity, neither of # which is independently confirmed. We try to refine it from the device- # reported peak when available (peak_mic_psi / max_abs_int16). _MIC_DEFAULT_FS_PSI = 0.0125 # ≈ 0.5 psi at full scale (rough) def _resolve_geo_full_scale(geo_range) -> float: """Map a geo_range value (string or int from compliance config) to the full-scale velocity in in/s. Defaults to Normal range (10.0) when the value is unknown — same default as Blastware itself.""" if geo_range is None: return _GEO_FS_BY_RANGE["normal"] if isinstance(geo_range, str): return _GEO_FS_BY_RANGE.get(geo_range.lower(), _GEO_FS_BY_RANGE["normal"]) return _GEO_FS_BY_RANGE.get(int(geo_range), _GEO_FS_BY_RANGE["normal"]) def _normalise_range(geo_range) -> str: """Return 'normal' or 'sensitive' (string) regardless of input form.""" if isinstance(geo_range, str): v = geo_range.lower() if v in ("normal", "sensitive"): return v return "normal" if geo_range == 1: return "sensitive" return "normal" def _ts_iso(ts) -> str: if ts is None: return "" try: return datetime.datetime( ts.year, ts.month, ts.day, ts.hour or 0, ts.minute or 0, ts.second or 0, ).isoformat() except Exception: return str(ts) def _samples_to_float( samples_int16: list[int], full_scale: float, ) -> np.ndarray: """Convert int16 ADC counts → float32 physical units. Uses _INT16_FS=32768 (not 32767) so that a count of -32768 maps to exactly -full_scale and +32767 maps to ~+full_scale * 32767/32768. Matches the device firmware's documented mapping (see CLAUDE.md geo_hardware_constant rationale). """ if not samples_int16: return np.array([], dtype=np.float32) arr = np.asarray(samples_int16, dtype=np.int32) # int32 to avoid overflow during scale return (arr.astype(np.float32) * (full_scale / _INT16_FS)).astype(np.float32) def _mic_scale_factor( samples_int16: list[int], peak_mic_psi: Optional[float], ) -> float: """Resolve the per-count psi factor for the microphone channel. When the device reports a peak mic value via the 0C record, we back-solve the per-count factor from `peak_psi / max(|samples|)` so the plotted waveform peaks land exactly at the device-reported value. Otherwise fall back to the rough _MIC_DEFAULT_FS_PSI estimate. """ if peak_mic_psi is not None and peak_mic_psi > 0 and samples_int16: max_count = max(abs(int(v)) for v in samples_int16) or 1 return float(peak_mic_psi) / float(max_count) return _MIC_DEFAULT_FS_PSI / _INT16_FS def write_event_hdf5( path: Union[str, Path], event: Event, *, serial: str, geo_range = "normal", source_kind: str = "sfm-live", tool_version: Optional[str] = None, captured_at: Optional[datetime.datetime] = None, include_int16: bool = True, ) -> dict: """ Persist a decoded Event as an HDF5 file with samples in physical units. Returns a small summary dict suitable for logging: {"path": Path, "n_samples": int, "geo_full_scale_ips": float} """ path = Path(path) raw = event.raw_samples or {} pv = event.peak_values pi = event.project_info geo_fs = _resolve_geo_full_scale(geo_range) geo_range_str = _normalise_range(geo_range) captured_at = captured_at or datetime.datetime.utcnow() tool_version = tool_version or _DEFAULT_TOOL_VERSION # Per-channel float32 arrays in physical units. geo_arrays = {} for ch in ("Tran", "Vert", "Long"): geo_arrays[ch] = _samples_to_float(raw.get(ch, []), geo_fs) # Mic channel — the per-count factor is resolved from the device-reported # peak when available so the plot peaks the BW value exactly. mic_int16 = raw.get("MicL", []) mic_factor = _mic_scale_factor( mic_int16, getattr(pv, "micl", None) if pv else None, ) if mic_int16: mic_arr = (np.asarray(mic_int16, dtype=np.int32).astype(np.float32) * mic_factor).astype(np.float32) else: mic_arr = np.array([], dtype=np.float32) n_samples = max( (len(geo_arrays[ch]) for ch in geo_arrays), default=0, ) # Atomic write: temp file + os.replace. tmp = path.with_suffix(path.suffix + ".tmp") with h5py.File(tmp, "w") as f: # Root attrs — event-level metadata. attrs = f.attrs attrs["schema_version"] = SCHEMA_VERSION attrs["kind"] = HDF5_KIND attrs["serial"] = serial or "" attrs["waveform_key"] = event._waveform_key.hex() if event._waveform_key else "" attrs["timestamp"] = _ts_iso(event.timestamp) attrs["record_type"] = event.record_type or "" attrs["sample_rate"] = int(event.sample_rate or 0) attrs["pretrig_samples"] = int(event.pretrig_samples or 0) attrs["total_samples"] = int(event.total_samples or n_samples) attrs["rectime_seconds"] = float(event.rectime_seconds or 0.0) attrs["geo_range"] = geo_range_str attrs["geo_full_scale_ips"] = float(geo_fs) attrs["project"] = (pi.project if pi else "") or "" attrs["client"] = (pi.client if pi else "") or "" attrs["operator"] = (pi.operator if pi else "") or "" attrs["sensor_location"] = (pi.sensor_location if pi else "") or "" attrs["peak_tran_ips"] = float(pv.tran if pv and pv.tran is not None else 0.0) attrs["peak_vert_ips"] = float(pv.vert if pv and pv.vert is not None else 0.0) attrs["peak_long_ips"] = float(pv.long if pv and pv.long is not None else 0.0) attrs["peak_pvs_ips"] = float(pv.peak_vector_sum if pv and pv.peak_vector_sum is not None else 0.0) attrs["peak_mic_psi"] = float(pv.micl if pv and pv.micl is not None else 0.0) attrs["tool_version"] = tool_version or "" attrs["captured_at"] = captured_at.isoformat() + "Z" if captured_at.tzinfo is None else captured_at.isoformat() attrs["source_kind"] = source_kind # /samples — physical-units float32 (the primary data). sgrp = f.create_group("samples") for ch, arr in geo_arrays.items(): sgrp.create_dataset( ch, data=arr, dtype="float32", compression="gzip", compression_opts=4, shuffle=True, ) sgrp.create_dataset( "MicL", data=mic_arr, dtype="float32", compression="gzip", compression_opts=4, shuffle=True, ) # /samples_int16 — optional raw ADC counts (preserved for analysis # tools that want pre-conversion data). Cheap to include. if include_int16: igrp = f.create_group("samples_int16") for ch in ("Tran", "Vert", "Long", "MicL"): vals = raw.get(ch, []) if vals: igrp.create_dataset( ch, data=np.asarray(vals, dtype=np.int16), compression="gzip", compression_opts=4, shuffle=True, ) igrp.attrs["mic_psi_per_count"] = float(mic_factor) import os os.replace(tmp, path) log.info( "write_event_hdf5: %s n_samples=%d geo_fs=%.3f filesize=%d", path, n_samples, geo_fs, path.stat().st_size, ) return { "path": path, "n_samples": n_samples, "geo_full_scale_ips": geo_fs, } def read_event_hdf5(path: Union[str, Path]) -> dict: """ Load an event HDF5 into a plain dict (no Event reconstruction — callers that want an Event can use the data directly). Returns: { "schema_version": int, "kind": str, "attrs": dict[str, …], # all root attributes "samples": { # float32 lists in physical units "Tran": ndarray, "Vert": ndarray, "Long": ndarray, "MicL": ndarray, }, "samples_int16": {…} or None, "mic_psi_per_count": float | None, } Raises FileNotFoundError if missing, ValueError on bad shape / unsupported schema_version. """ path = Path(path) with h5py.File(path, "r") as f: attrs = {k: _h5_attr_value(v) for k, v in f.attrs.items()} sv = attrs.get("schema_version", 0) if not isinstance(sv, int) or sv < 1 or sv > SCHEMA_VERSION: raise ValueError( f"{path}: unsupported HDF5 schema_version={sv} " f"(this build supports 1..{SCHEMA_VERSION})" ) if attrs.get("kind") != HDF5_KIND: raise ValueError(f"{path}: kind != {HDF5_KIND!r} (got {attrs.get('kind')!r})") samples = {} for ch in ("Tran", "Vert", "Long", "MicL"): ds = f.get(f"samples/{ch}") samples[ch] = np.asarray(ds[()]) if ds is not None else np.array([], dtype=np.float32) samples_int16 = None mic_psi = None igrp = f.get("samples_int16") if igrp is not None: samples_int16 = {} for ch in ("Tran", "Vert", "Long", "MicL"): ds = igrp.get(ch) if ds is not None: samples_int16[ch] = np.asarray(ds[()]) mic_attr = igrp.attrs.get("mic_psi_per_count") if mic_attr is not None: mic_psi = float(mic_attr) return { "schema_version": sv, "kind": attrs.get("kind"), "attrs": attrs, "samples": samples, "samples_int16": samples_int16, "mic_psi_per_count": mic_psi, } def _h5_attr_value(v): """Convert an h5py attribute value to a plain Python type.""" if isinstance(v, bytes): return v.decode("utf-8", errors="replace") if isinstance(v, np.generic): return v.item() return v # ── Plot-ready JSON ────────────────────────────────────────────────────────── def event_to_plot_json( event: Event, *, serial: str, geo_range = "normal", event_id: Optional[str] = None, index: Optional[int] = None, ) -> dict: """ Build a `sfm.plot.v1` JSON dict directly from an Event (skipping HDF5). Used by: - `/device/event/{idx}/waveform` (live device path) - The CLI / tests for in-memory conversion sanity-checks. Stored events go through `plot_json_from_hdf5()` so the wire format is identical regardless of whether the data came from the live device or the on-disk HDF5. """ raw = event.raw_samples or {} pv = event.peak_values geo_fs = _resolve_geo_full_scale(geo_range) geo_range_str = _normalise_range(geo_range) sr = int(event.sample_rate or 0) or 1024 pretrig = int(event.pretrig_samples or 0) geo_arrays = {ch: _samples_to_float(raw.get(ch, []), geo_fs).tolist() for ch in ("Tran", "Vert", "Long")} mic_int16 = raw.get("MicL", []) mic_factor = _mic_scale_factor( mic_int16, getattr(pv, "micl", None) if pv else None, ) mic_arr = [float(v) * mic_factor for v in mic_int16] if mic_int16 else [] n = max( (len(geo_arrays[ch]) for ch in geo_arrays), default=len(mic_arr), ) return _build_plot_dict( n_samples=n, sample_rate=sr, pretrig_samples=pretrig, total_samples=int(event.total_samples or n), rectime_seconds=float(event.rectime_seconds or 0.0), timestamp_iso=_ts_iso(event.timestamp), serial=serial, record_type=event.record_type, waveform_key=event._waveform_key.hex() if event._waveform_key else None, geo_range=geo_range_str, geo_fs=geo_fs, channels_floats={ "Tran": geo_arrays["Tran"], "Vert": geo_arrays["Vert"], "Long": geo_arrays["Long"], "MicL": mic_arr, }, peaks_dict={ "tran": getattr(pv, "tran", None) if pv else None, "vert": getattr(pv, "vert", None) if pv else None, "long": getattr(pv, "long", None) if pv else None, "pvs": getattr(pv, "peak_vector_sum", None) if pv else None, "mic": getattr(pv, "micl", None) if pv else None, }, event_id=event_id, index=index if index is not None else event.index, ) def plot_json_from_hdf5( path: Union[str, Path], *, event_id: Optional[str] = None, index: Optional[int] = None, ) -> dict: """Build a `sfm.plot.v1` JSON dict from a stored .h5 file.""" data = read_event_hdf5(path) a = data["attrs"] s = data["samples"] return _build_plot_dict( n_samples=len(s["Tran"]) if "Tran" in s else 0, sample_rate=int(a.get("sample_rate", 1024) or 1024), pretrig_samples=int(a.get("pretrig_samples", 0) or 0), total_samples=int(a.get("total_samples", 0) or 0), rectime_seconds=float(a.get("rectime_seconds", 0.0) or 0.0), timestamp_iso=a.get("timestamp", ""), serial=a.get("serial", ""), record_type=a.get("record_type", ""), waveform_key=a.get("waveform_key", "") or None, geo_range=a.get("geo_range", "normal"), geo_fs=float(a.get("geo_full_scale_ips", 10.0) or 10.0), channels_floats={ "Tran": s.get("Tran", np.array([])).tolist(), "Vert": s.get("Vert", np.array([])).tolist(), "Long": s.get("Long", np.array([])).tolist(), "MicL": s.get("MicL", np.array([])).tolist(), }, peaks_dict={ "tran": float(a.get("peak_tran_ips", 0.0) or 0.0) or None, "vert": float(a.get("peak_vert_ips", 0.0) or 0.0) or None, "long": float(a.get("peak_long_ips", 0.0) or 0.0) or None, "pvs": float(a.get("peak_pvs_ips", 0.0) or 0.0) or None, "mic": float(a.get("peak_mic_psi", 0.0) or 0.0) or None, }, event_id=event_id, index=index, ) def _build_plot_dict( *, n_samples: int, sample_rate: int, pretrig_samples: int, total_samples: int, rectime_seconds: float, timestamp_iso: str, serial: str, record_type: Optional[str], waveform_key: Optional[str], geo_range: str, geo_fs: float, channels_floats: dict[str, list[float]], peaks_dict: dict[str, Optional[float]], event_id: Optional[str], index: Optional[int] = None, ) -> dict: dt_ms = (1000.0 / sample_rate) if sample_rate > 0 else 0.0 t0_ms = -pretrig_samples * dt_ms def _ch(unit: str, values: list[float], peak: Optional[float]) -> dict: # Locate the peak's time within the values array (max abs). if values: mags = [abs(v) for v in values] i = mags.index(max(mags)) peak_t_ms = round(t0_ms + i * dt_ms, 4) peak_value = peak if peak is not None else values[i] else: peak_t_ms = None peak_value = peak return { "unit": unit, "values": values, "peak": peak_value, "peak_t_ms": peak_t_ms, } return { "schema": "sfm.plot.v1", "event_id": event_id, "index": index, "serial": serial, "timestamp": timestamp_iso, "record_type": record_type, "waveform_key": waveform_key, "time_axis": { "sample_rate": sample_rate, "pretrig_samples": pretrig_samples, "total_samples": total_samples or n_samples, "n_samples": n_samples, "t0_ms": round(t0_ms, 4), "dt_ms": round(dt_ms, 6), "rectime_seconds": rectime_seconds, }, "geo_range": geo_range, "geo_full_scale_ips": geo_fs, "trigger_ms": 0.0, "channels": { "Tran": _ch("in/s", channels_floats.get("Tran", []), peaks_dict.get("tran")), "Vert": _ch("in/s", channels_floats.get("Vert", []), peaks_dict.get("vert")), "Long": _ch("in/s", channels_floats.get("Long", []), peaks_dict.get("long")), "MicL": _ch("psi", channels_floats.get("MicL", []), peaks_dict.get("mic")), }, "peak_values": { "transverse": peaks_dict.get("tran"), "vertical": peaks_dict.get("vert"), "longitudinal": peaks_dict.get("long"), "vector_sum": peaks_dict.get("pvs"), "mic_psi": peaks_dict.get("mic"), }, }