Files
serversdown c641d5fc10 feat: v0.15.0
### Added

- **Layered event storage architecture.**  Each event now lands as four
  files in the per-serial waveform store, each with a clear role:

  - `<filename>` — the Blastware-readable binary (BW file).  Untouched.
  - `<filename>.a5.pkl` — the raw 5A frames (regenerative source).
  - `<filename>.h5` — clean per-channel waveform arrays in physical
    units (in/s for geo, psi for mic) plus event metadata (HDF5 with
    gzip compression).  This is the canonical format for downstream
    analysis tools.
  - `<filename>.sfm.json` — the modern review/metadata sidecar (peaks,
    project, source provenance, review state, extensions).

  SQLite (`seismo_relay.db`) is the searchable index over all four.

- **Plot-ready waveform JSON (`sfm.plot.v1`).**  The `/device/event/{idx}/waveform`
  and `/db/events/{id}/waveform.json` endpoints now return samples in
  physical units with explicit time-axis metadata, peak markers, and
  per-channel unit hints — no more guessing the ADC-to-velocity scale
  client-side.  The webapp waveform viewer was rewritten to consume
  this shape.

- **In-app waveform viewer accuracy fix.**  The standalone SFM webapp
  viewer was scaling geophone amplitudes by `geoAdcScale / 32767`
  (≈ 6.206 / 32767), where `geoAdcScale = 6.206053` is the device's
  *in/s per V* hardware constant — not the ADC-counts-to-velocity
  factor.  This silently scaled every plot ~38% too low for Normal-range
  geophones (the correct full-scale is 10.0 in/s, or 1.25 in/s for
  Sensitive).  Conversion is now done server-side using the geo_range
  from compliance config; the client just plots.

- New `sfm/event_hdf5.py` module: `write_event_hdf5()`,
  `read_event_hdf5()`, plus a plot-JSON helper.
- Backfill script extended to also emit `.h5` for existing events.

### Dependencies

- Added `h5py>=3.10` and `numpy>=1.24` for the HDF5 storage layer.
- Added `python-multipart>=0.0.7` (required by FastAPI for the
  `/db/import/blastware_file` endpoint introduced in this release).
2026-05-08 04:39:51 +00:00

531 lines
19 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
sfm/event_hdf5.py — HDF5 codec for the canonical "clean waveform" file.
Layout written to `<filename>.h5`:
/
├─ samples/
│ ├─ Tran (float32, in/s) shape: (N,)
│ ├─ Vert (float32, in/s) shape: (N,)
│ ├─ Long (float32, in/s) shape: (N,)
│ └─ MicL (float32, psi) shape: (N,)
├─ samples_int16/ (optional)
│ ├─ Tran (int16, raw ADC counts) shape: (N,)
│ └─ ... per channel (only when present in the source)
└─ root attrs (event metadata):
schema_version int = 1
kind str = "sfm.event.hdf5"
serial str
waveform_key str (8-hex)
timestamp str (ISO-8601)
record_type str
sample_rate int (sps)
pretrig_samples int
total_samples int
rectime_seconds float
geo_range str "normal" | "sensitive"
geo_full_scale_ips float (10.0 or 1.250)
project str
client str
operator str
sensor_location str
peak_tran_ips float (from 0C; authoritative)
peak_vert_ips float
peak_long_ips float
peak_pvs_ips float
peak_mic_psi float
tool_version str
captured_at str (ISO-8601 UTC)
source_kind str "sfm-live" | "sfm-ach" | "bw-import"
Why HDF5 and not just JSON for the canonical clean format:
- Native float32 arrays (no base64 dance, no per-value JSON parsing).
- Per-dataset gzip compression — sample arrays compress 3-5×.
- Cross-language: h5py (Python), HDF5.jl (Julia), io.netcdf (R), etc.
Analysis pipelines don't have to know anything about Blastware.
- Self-describing via attributes; future fields don't break readers.
The plot-ready `sfm.plot.v1` JSON returned by the REST endpoints is
derived from this HDF5 (or computed on-the-fly when no .h5 exists yet).
"""
from __future__ import annotations
import datetime
import logging
from pathlib import Path
from typing import Optional, Union
import h5py
import numpy as np
from minimateplus.event_file_io import TOOL_VERSION as _DEFAULT_TOOL_VERSION
from minimateplus.models import Event
log = logging.getLogger(__name__)
SCHEMA_VERSION = 1
HDF5_KIND = "sfm.event.hdf5"
# Geophone full-scale velocity per range (in/s). Confirmed in CLAUDE.md
# from 4-20-26 captures: Normal=0x00 → 10 in/s, Sensitive=0x01 → 1.25 in/s.
_GEO_FS_BY_RANGE = {
"normal": 10.000,
"sensitive": 1.2500,
0: 10.000,
1: 1.2500,
}
_INT16_FS = 32768.0
# Default mic conversion: ADC count → psi. Approximate; exact factor
# depends on firmware reference voltage and mic sensitivity, neither of
# which is independently confirmed. We try to refine it from the device-
# reported peak when available (peak_mic_psi / max_abs_int16).
_MIC_DEFAULT_FS_PSI = 0.0125 # ≈ 0.5 psi at full scale (rough)
def _resolve_geo_full_scale(geo_range) -> float:
"""Map a geo_range value (string or int from compliance config) to the
full-scale velocity in in/s. Defaults to Normal range (10.0) when the
value is unknown — same default as Blastware itself."""
if geo_range is None:
return _GEO_FS_BY_RANGE["normal"]
if isinstance(geo_range, str):
return _GEO_FS_BY_RANGE.get(geo_range.lower(), _GEO_FS_BY_RANGE["normal"])
return _GEO_FS_BY_RANGE.get(int(geo_range), _GEO_FS_BY_RANGE["normal"])
def _normalise_range(geo_range) -> str:
"""Return 'normal' or 'sensitive' (string) regardless of input form."""
if isinstance(geo_range, str):
v = geo_range.lower()
if v in ("normal", "sensitive"):
return v
return "normal"
if geo_range == 1:
return "sensitive"
return "normal"
def _ts_iso(ts) -> str:
if ts is None:
return ""
try:
return datetime.datetime(
ts.year, ts.month, ts.day,
ts.hour or 0, ts.minute or 0, ts.second or 0,
).isoformat()
except Exception:
return str(ts)
def _samples_to_float(
samples_int16: list[int],
full_scale: float,
) -> np.ndarray:
"""Convert int16 ADC counts → float32 physical units.
Uses _INT16_FS=32768 (not 32767) so that a count of -32768 maps to
exactly -full_scale and +32767 maps to ~+full_scale * 32767/32768.
Matches the device firmware's documented mapping (see CLAUDE.md
geo_hardware_constant rationale).
"""
if not samples_int16:
return np.array([], dtype=np.float32)
arr = np.asarray(samples_int16, dtype=np.int32) # int32 to avoid overflow during scale
return (arr.astype(np.float32) * (full_scale / _INT16_FS)).astype(np.float32)
def _mic_scale_factor(
samples_int16: list[int],
peak_mic_psi: Optional[float],
) -> float:
"""Resolve the per-count psi factor for the microphone channel.
When the device reports a peak mic value via the 0C record, we
back-solve the per-count factor from `peak_psi / max(|samples|)` so
the plotted waveform peaks land exactly at the device-reported value.
Otherwise fall back to the rough _MIC_DEFAULT_FS_PSI estimate.
"""
if peak_mic_psi is not None and peak_mic_psi > 0 and samples_int16:
max_count = max(abs(int(v)) for v in samples_int16) or 1
return float(peak_mic_psi) / float(max_count)
return _MIC_DEFAULT_FS_PSI / _INT16_FS
def write_event_hdf5(
path: Union[str, Path],
event: Event,
*,
serial: str,
geo_range = "normal",
source_kind: str = "sfm-live",
tool_version: Optional[str] = None,
captured_at: Optional[datetime.datetime] = None,
include_int16: bool = True,
) -> dict:
"""
Persist a decoded Event as an HDF5 file with samples in physical units.
Returns a small summary dict suitable for logging:
{"path": Path, "n_samples": int, "geo_full_scale_ips": float}
"""
path = Path(path)
raw = event.raw_samples or {}
pv = event.peak_values
pi = event.project_info
geo_fs = _resolve_geo_full_scale(geo_range)
geo_range_str = _normalise_range(geo_range)
captured_at = captured_at or datetime.datetime.utcnow()
tool_version = tool_version or _DEFAULT_TOOL_VERSION
# Per-channel float32 arrays in physical units.
geo_arrays = {}
for ch in ("Tran", "Vert", "Long"):
geo_arrays[ch] = _samples_to_float(raw.get(ch, []), geo_fs)
# Mic channel — the per-count factor is resolved from the device-reported
# peak when available so the plot peaks the BW value exactly.
mic_int16 = raw.get("MicL", [])
mic_factor = _mic_scale_factor(
mic_int16,
getattr(pv, "micl", None) if pv else None,
)
if mic_int16:
mic_arr = (np.asarray(mic_int16, dtype=np.int32).astype(np.float32) * mic_factor).astype(np.float32)
else:
mic_arr = np.array([], dtype=np.float32)
n_samples = max(
(len(geo_arrays[ch]) for ch in geo_arrays),
default=0,
)
# Atomic write: temp file + os.replace.
tmp = path.with_suffix(path.suffix + ".tmp")
with h5py.File(tmp, "w") as f:
# Root attrs — event-level metadata.
attrs = f.attrs
attrs["schema_version"] = SCHEMA_VERSION
attrs["kind"] = HDF5_KIND
attrs["serial"] = serial or ""
attrs["waveform_key"] = event._waveform_key.hex() if event._waveform_key else ""
attrs["timestamp"] = _ts_iso(event.timestamp)
attrs["record_type"] = event.record_type or ""
attrs["sample_rate"] = int(event.sample_rate or 0)
attrs["pretrig_samples"] = int(event.pretrig_samples or 0)
attrs["total_samples"] = int(event.total_samples or n_samples)
attrs["rectime_seconds"] = float(event.rectime_seconds or 0.0)
attrs["geo_range"] = geo_range_str
attrs["geo_full_scale_ips"] = float(geo_fs)
attrs["project"] = (pi.project if pi else "") or ""
attrs["client"] = (pi.client if pi else "") or ""
attrs["operator"] = (pi.operator if pi else "") or ""
attrs["sensor_location"] = (pi.sensor_location if pi else "") or ""
attrs["peak_tran_ips"] = float(pv.tran if pv and pv.tran is not None else 0.0)
attrs["peak_vert_ips"] = float(pv.vert if pv and pv.vert is not None else 0.0)
attrs["peak_long_ips"] = float(pv.long if pv and pv.long is not None else 0.0)
attrs["peak_pvs_ips"] = float(pv.peak_vector_sum if pv and pv.peak_vector_sum is not None else 0.0)
attrs["peak_mic_psi"] = float(pv.micl if pv and pv.micl is not None else 0.0)
attrs["tool_version"] = tool_version or ""
attrs["captured_at"] = captured_at.isoformat() + "Z" if captured_at.tzinfo is None else captured_at.isoformat()
attrs["source_kind"] = source_kind
# /samples — physical-units float32 (the primary data).
sgrp = f.create_group("samples")
for ch, arr in geo_arrays.items():
sgrp.create_dataset(
ch, data=arr, dtype="float32",
compression="gzip", compression_opts=4, shuffle=True,
)
sgrp.create_dataset(
"MicL", data=mic_arr, dtype="float32",
compression="gzip", compression_opts=4, shuffle=True,
)
# /samples_int16 — optional raw ADC counts (preserved for analysis
# tools that want pre-conversion data). Cheap to include.
if include_int16:
igrp = f.create_group("samples_int16")
for ch in ("Tran", "Vert", "Long", "MicL"):
vals = raw.get(ch, [])
if vals:
igrp.create_dataset(
ch, data=np.asarray(vals, dtype=np.int16),
compression="gzip", compression_opts=4, shuffle=True,
)
igrp.attrs["mic_psi_per_count"] = float(mic_factor)
import os
os.replace(tmp, path)
log.info(
"write_event_hdf5: %s n_samples=%d geo_fs=%.3f filesize=%d",
path, n_samples, geo_fs, path.stat().st_size,
)
return {
"path": path,
"n_samples": n_samples,
"geo_full_scale_ips": geo_fs,
}
def read_event_hdf5(path: Union[str, Path]) -> dict:
"""
Load an event HDF5 into a plain dict (no Event reconstruction —
callers that want an Event can use the data directly).
Returns:
{
"schema_version": int,
"kind": str,
"attrs": dict[str, …], # all root attributes
"samples": { # float32 lists in physical units
"Tran": ndarray, "Vert": ndarray, "Long": ndarray, "MicL": ndarray,
},
"samples_int16": {…} or None,
"mic_psi_per_count": float | None,
}
Raises FileNotFoundError if missing, ValueError on bad shape /
unsupported schema_version.
"""
path = Path(path)
with h5py.File(path, "r") as f:
attrs = {k: _h5_attr_value(v) for k, v in f.attrs.items()}
sv = attrs.get("schema_version", 0)
if not isinstance(sv, int) or sv < 1 or sv > SCHEMA_VERSION:
raise ValueError(
f"{path}: unsupported HDF5 schema_version={sv} "
f"(this build supports 1..{SCHEMA_VERSION})"
)
if attrs.get("kind") != HDF5_KIND:
raise ValueError(f"{path}: kind != {HDF5_KIND!r} (got {attrs.get('kind')!r})")
samples = {}
for ch in ("Tran", "Vert", "Long", "MicL"):
ds = f.get(f"samples/{ch}")
samples[ch] = np.asarray(ds[()]) if ds is not None else np.array([], dtype=np.float32)
samples_int16 = None
mic_psi = None
igrp = f.get("samples_int16")
if igrp is not None:
samples_int16 = {}
for ch in ("Tran", "Vert", "Long", "MicL"):
ds = igrp.get(ch)
if ds is not None:
samples_int16[ch] = np.asarray(ds[()])
mic_attr = igrp.attrs.get("mic_psi_per_count")
if mic_attr is not None:
mic_psi = float(mic_attr)
return {
"schema_version": sv,
"kind": attrs.get("kind"),
"attrs": attrs,
"samples": samples,
"samples_int16": samples_int16,
"mic_psi_per_count": mic_psi,
}
def _h5_attr_value(v):
"""Convert an h5py attribute value to a plain Python type."""
if isinstance(v, bytes):
return v.decode("utf-8", errors="replace")
if isinstance(v, np.generic):
return v.item()
return v
# ── Plot-ready JSON ──────────────────────────────────────────────────────────
def event_to_plot_json(
event: Event,
*,
serial: str,
geo_range = "normal",
event_id: Optional[str] = None,
index: Optional[int] = None,
) -> dict:
"""
Build a `sfm.plot.v1` JSON dict directly from an Event (skipping HDF5).
Used by:
- `/device/event/{idx}/waveform` (live device path)
- The CLI / tests for in-memory conversion sanity-checks.
Stored events go through `plot_json_from_hdf5()` so the wire format
is identical regardless of whether the data came from the live device
or the on-disk HDF5.
"""
raw = event.raw_samples or {}
pv = event.peak_values
geo_fs = _resolve_geo_full_scale(geo_range)
geo_range_str = _normalise_range(geo_range)
sr = int(event.sample_rate or 0) or 1024
pretrig = int(event.pretrig_samples or 0)
geo_arrays = {ch: _samples_to_float(raw.get(ch, []), geo_fs).tolist()
for ch in ("Tran", "Vert", "Long")}
mic_int16 = raw.get("MicL", [])
mic_factor = _mic_scale_factor(
mic_int16,
getattr(pv, "micl", None) if pv else None,
)
mic_arr = [float(v) * mic_factor for v in mic_int16] if mic_int16 else []
n = max(
(len(geo_arrays[ch]) for ch in geo_arrays),
default=len(mic_arr),
)
return _build_plot_dict(
n_samples=n,
sample_rate=sr,
pretrig_samples=pretrig,
total_samples=int(event.total_samples or n),
rectime_seconds=float(event.rectime_seconds or 0.0),
timestamp_iso=_ts_iso(event.timestamp),
serial=serial,
record_type=event.record_type,
waveform_key=event._waveform_key.hex() if event._waveform_key else None,
geo_range=geo_range_str,
geo_fs=geo_fs,
channels_floats={
"Tran": geo_arrays["Tran"],
"Vert": geo_arrays["Vert"],
"Long": geo_arrays["Long"],
"MicL": mic_arr,
},
peaks_dict={
"tran": getattr(pv, "tran", None) if pv else None,
"vert": getattr(pv, "vert", None) if pv else None,
"long": getattr(pv, "long", None) if pv else None,
"pvs": getattr(pv, "peak_vector_sum", None) if pv else None,
"mic": getattr(pv, "micl", None) if pv else None,
},
event_id=event_id,
index=index if index is not None else event.index,
)
def plot_json_from_hdf5(
path: Union[str, Path],
*,
event_id: Optional[str] = None,
index: Optional[int] = None,
) -> dict:
"""Build a `sfm.plot.v1` JSON dict from a stored .h5 file."""
data = read_event_hdf5(path)
a = data["attrs"]
s = data["samples"]
return _build_plot_dict(
n_samples=len(s["Tran"]) if "Tran" in s else 0,
sample_rate=int(a.get("sample_rate", 1024) or 1024),
pretrig_samples=int(a.get("pretrig_samples", 0) or 0),
total_samples=int(a.get("total_samples", 0) or 0),
rectime_seconds=float(a.get("rectime_seconds", 0.0) or 0.0),
timestamp_iso=a.get("timestamp", ""),
serial=a.get("serial", ""),
record_type=a.get("record_type", ""),
waveform_key=a.get("waveform_key", "") or None,
geo_range=a.get("geo_range", "normal"),
geo_fs=float(a.get("geo_full_scale_ips", 10.0) or 10.0),
channels_floats={
"Tran": s.get("Tran", np.array([])).tolist(),
"Vert": s.get("Vert", np.array([])).tolist(),
"Long": s.get("Long", np.array([])).tolist(),
"MicL": s.get("MicL", np.array([])).tolist(),
},
peaks_dict={
"tran": float(a.get("peak_tran_ips", 0.0) or 0.0) or None,
"vert": float(a.get("peak_vert_ips", 0.0) or 0.0) or None,
"long": float(a.get("peak_long_ips", 0.0) or 0.0) or None,
"pvs": float(a.get("peak_pvs_ips", 0.0) or 0.0) or None,
"mic": float(a.get("peak_mic_psi", 0.0) or 0.0) or None,
},
event_id=event_id,
index=index,
)
def _build_plot_dict(
*,
n_samples: int,
sample_rate: int,
pretrig_samples: int,
total_samples: int,
rectime_seconds: float,
timestamp_iso: str,
serial: str,
record_type: Optional[str],
waveform_key: Optional[str],
geo_range: str,
geo_fs: float,
channels_floats: dict[str, list[float]],
peaks_dict: dict[str, Optional[float]],
event_id: Optional[str],
index: Optional[int] = None,
) -> dict:
dt_ms = (1000.0 / sample_rate) if sample_rate > 0 else 0.0
t0_ms = -pretrig_samples * dt_ms
def _ch(unit: str, values: list[float], peak: Optional[float]) -> dict:
# Locate the peak's time within the values array (max abs).
if values:
mags = [abs(v) for v in values]
i = mags.index(max(mags))
peak_t_ms = round(t0_ms + i * dt_ms, 4)
peak_value = peak if peak is not None else values[i]
else:
peak_t_ms = None
peak_value = peak
return {
"unit": unit,
"values": values,
"peak": peak_value,
"peak_t_ms": peak_t_ms,
}
return {
"schema": "sfm.plot.v1",
"event_id": event_id,
"index": index,
"serial": serial,
"timestamp": timestamp_iso,
"record_type": record_type,
"waveform_key": waveform_key,
"time_axis": {
"sample_rate": sample_rate,
"pretrig_samples": pretrig_samples,
"total_samples": total_samples or n_samples,
"n_samples": n_samples,
"t0_ms": round(t0_ms, 4),
"dt_ms": round(dt_ms, 6),
"rectime_seconds": rectime_seconds,
},
"geo_range": geo_range,
"geo_full_scale_ips": geo_fs,
"trigger_ms": 0.0,
"channels": {
"Tran": _ch("in/s", channels_floats.get("Tran", []), peaks_dict.get("tran")),
"Vert": _ch("in/s", channels_floats.get("Vert", []), peaks_dict.get("vert")),
"Long": _ch("in/s", channels_floats.get("Long", []), peaks_dict.get("long")),
"MicL": _ch("psi", channels_floats.get("MicL", []), peaks_dict.get("mic")),
},
"peak_values": {
"transverse": peaks_dict.get("tran"),
"vertical": peaks_dict.get("vert"),
"longitudinal": peaks_dict.get("long"),
"vector_sum": peaks_dict.get("pvs"),
"mic_psi": peaks_dict.get("mic"),
},
}