c641d5fc10
### Added
- **Layered event storage architecture.** Each event now lands as four
  files in the per-serial waveform store, each with a clear role:
  - `<filename>` — the Blastware-readable binary (BW file). Untouched.
  - `<filename>.a5.pkl` — the raw 5A frames (regenerative source).
  - `<filename>.h5` — clean per-channel waveform arrays in physical
    units (in/s for geo, psi for mic) plus event metadata (HDF5 with
    gzip compression). This is the canonical format for downstream
    analysis tools.
  - `<filename>.sfm.json` — the modern review/metadata sidecar (peaks,
    project, source provenance, review state, extensions).

  SQLite (`seismo_relay.db`) is the searchable index over all four.
- **Plot-ready waveform JSON (`sfm.plot.v1`).** The `/device/event/{idx}/waveform`
and `/db/events/{id}/waveform.json` endpoints now return samples in
physical units with explicit time-axis metadata, peak markers, and
per-channel unit hints — no more guessing the ADC-to-velocity scale
client-side. The webapp waveform viewer was rewritten to consume
this shape.
- **In-app waveform viewer accuracy fix.** The standalone SFM webapp
viewer was scaling geophone amplitudes by `geoAdcScale / 32767`
(≈ 6.206 / 32767), where `geoAdcScale = 6.206053` is the device's
*in/s per V* hardware constant — not the ADC-counts-to-velocity
factor. This silently scaled every plot ~38% too low for Normal-range
geophones (the correct full-scale is 10.0 in/s, or 1.25 in/s for
Sensitive). Conversion is now done server-side using the geo_range
from compliance config; the client just plots.
- New `sfm/event_hdf5.py` module: `write_event_hdf5()`,
`read_event_hdf5()`, plus a plot-JSON helper.
- Backfill script extended to also emit `.h5` for existing events.
### Dependencies
- Added `h5py>=3.10` and `numpy>=1.24` for the HDF5 storage layer.
- Added `python-multipart>=0.0.7` (required by FastAPI for the
`/db/import/blastware_file` endpoint introduced in this release).
347 lines
15 KiB
Python
347 lines
15 KiB
Python
"""
|
|
scripts/backfill_sidecars.py — generate .sfm.json sidecars AND .h5
|
|
clean-waveform files for existing events already in the waveform store
|
|
that predate those features.
|
|
|
|
Walks `<store_root>/<serial>/<filename>` and for each BW event file:
|
|
|
|
Sidecar (.sfm.json):
|
|
- Skip only when an existing sidecar's blastware.sha256 matches the
  current BW file's sha256, its source.tool_version is not older than
  the current TOOL_VERSION, and --force is not given.
|
|
- Else regenerate: prefer .a5.pkl (full fidelity); fall back to
|
|
parsing the BW binary directly (peaks computed from samples).
|
|
|
|
Clean waveform (.h5):
|
|
- Skip when <filename>.h5 already exists (idempotent) unless --force is
  given; never written when --skip-hdf5 is given.
|
|
- Else write from .a5.pkl (preferred) or BW binary parse (fallback).
|
|
|
|
Usage:
|
|
python scripts/backfill_sidecars.py [--store-root PATH]
|
|
[--db-path PATH]
|
|
[--dry-run]
|
|
[--skip-hdf5]
[--force]
|
|
[-v]
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import logging
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Allow running from the repo root without installation.
|
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
from minimateplus import event_file_io
|
|
from sfm import event_hdf5
|
|
from sfm.waveform_store import WaveformStore, _frame_to_dict, _dict_to_frame # noqa: F401
|
|
from sfm.database import SeismoDb
|
|
|
|
log = logging.getLogger("backfill_sidecars")
|
|
|
|
|
|
def _looks_like_event_file(path: Path) -> bool:
    """Return True when *path* plausibly names a Blastware event file.

    The original note says this is the same heuristic as the importer CLI.
    A candidate must:

    * be a regular file,
    * not be one of the companion files (``.a5.pkl`` / ``.sfm.json``),
    * have a 3- or 4-character extension whose last character is ``W`` or
      ``H`` (case-insensitive) or ``0``,
    * be at least 70 bytes long.
    """
    if not path.is_file() or path.name.endswith((".a5.pkl", ".sfm.json")):
        return False

    extension = path.suffix.lstrip(".")
    if len(extension) not in (3, 4):
        return False

    final_char = extension[-1]
    if final_char.upper() not in {"W", "H"} and final_char != "0":
        return False

    # The file can vanish or become unreadable between is_file() and stat().
    try:
        size = path.stat().st_size
    except OSError:
        return False
    return size >= 70
|
|
|
|
|
|
def _build_arg_parser():
    """Return the command-line parser for the backfill script."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        "--db-path",
        default=str(
            Path(__file__).resolve().parent.parent
            / "bridges" / "captures" / "seismo_relay.db"
        ),
    )
    parser.add_argument("--store-root", default=None)
    parser.add_argument("--dry-run", action="store_true")
    parser.add_argument(
        "--skip-hdf5", action="store_true",
        help="Don't generate .h5 clean-waveform files (only sidecars).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help=(
            "Regenerate sidecars + .h5 even when an existing sidecar's "
            "blastware.sha256 matches the current BW file. Use this after "
            "upgrading seismo-relay to pull in decoder bug fixes (e.g. the "
            "STRT-rectime byte-offset fix in v0.15.x)."
        ),
    )
    parser.add_argument("-v", "--verbose", action="store_true")
    return parser


def _version_tuple(value):
    """Parse ``"X.Y.Z"`` into a 3-int tuple; ``(0, 0, 0)`` when unparseable.

    Short versions are zero-padded so ``"0.15"`` compares equal to
    ``"0.15.0"`` instead of sorting before it.
    """
    try:
        parts = [int(piece) for piece in str(value).split(".")[:3]]
    except ValueError:
        return (0, 0, 0)
    parts += [0] * (3 - len(parts))
    return tuple(parts)


def _sidecar_is_current(sidecar_path, bw_sha):
    """Return True when the sidecar at *sidecar_path* needs no regeneration.

    Two-part freshness check:
      1. ``blastware.sha256`` must equal *bw_sha* (the sidecar describes
         THIS file).
      2. ``source.tool_version`` must be >= ``event_file_io.TOOL_VERSION``
         (the sidecar was written by a build that includes any decoder
         fixes shipped since).
    An unreadable or malformed sidecar counts as stale.
    """
    try:
        existing = event_file_io.read_sidecar(sidecar_path)
        sha_ok = existing.get("blastware", {}).get("sha256") == bw_sha
        src_ver = existing.get("source", {}).get("tool_version", "")
        ver_ok = _version_tuple(src_ver) >= _version_tuple(event_file_io.TOOL_VERSION)
    except Exception as exc:
        log.debug("cannot read sidecar %s (%s); regenerating", sidecar_path.name, exc)
        return False
    if sha_ok and not ver_ok:
        log.info(
            "regenerating %s (sidecar tool_version=%s < current %s)",
            sidecar_path.name, src_ver or "(none)",
            event_file_io.TOOL_VERSION,
        )
    return sha_ok and ver_ok


def _load_event(store, serial, serial_dir, path):
    """Rebuild the event for *path*; return ``(event, source_kind, a5_filename)``.

    Prefers the ``.a5.pkl`` next to the BW file (replaying the A5 decoders)
    and falls back to parsing the BW binary directly.

    Raises:
        RuntimeError: the ``.a5.pkl`` exists but ``store.load_a5`` returned
            nothing.
    """
    a5_path = serial_dir / f"{path.name}.a5.pkl"
    if not a5_path.exists():
        return event_file_io.read_blastware_file(path), "bw-import", None

    frames = store.load_a5(serial, path.name)
    if not frames:
        raise RuntimeError("a5_pickle present but unreadable")

    # Per the original note: the .a5.pkl alone CANNOT recover timestamp /
    # record_type / waveform_key / per-channel peaks — those live in the 0C
    # record, which isn't saved separately. The caller seeds them from the
    # DB row and the existing sidecar.
    from minimateplus.client import (
        _decode_a5_metadata_into,
        _decode_a5_waveform,
    )
    from minimateplus.models import Event

    ev = Event(index=-1)
    _decode_a5_metadata_into(frames, ev)
    _decode_a5_waveform(frames, ev)
    return ev, "sfm-live", a5_path.name


def _fetch_event_row(db, serial, filename):
    """Return the ``events`` row for (*serial*, *filename*), or None.

    Best-effort: any lookup failure is logged at debug level and treated as
    "no row". ``contextlib.closing`` is used because a sqlite3 connection's
    own context manager only commits/rolls back and does not close it, which
    would otherwise leak one connection per event file.
    """
    import contextlib
    import sqlite3

    try:
        with contextlib.closing(sqlite3.connect(str(db.db_path))) as conn:
            conn.row_factory = sqlite3.Row
            return conn.execute(
                "SELECT * FROM events "
                "WHERE serial=? AND blastware_filename=? "
                "LIMIT 1",
                (serial, filename),
            ).fetchone()
    except Exception as exc:
        log.debug("DB lookup failed for %s: %s", filename, exc)
        return None


def _seed_event_from_row(ev, row):
    """Fill fields missing on *ev* from the SeismoDb ``events`` *row*.

    Only fields the decode left empty are touched, so values recovered from
    the A5 frames / BW file always win over the DB copy.
    """
    import datetime

    from minimateplus.models import PeakValues, ProjectInfo, Timestamp

    if ev.sample_rate is None and row["sample_rate"]:
        ev.sample_rate = int(row["sample_rate"])
    if not ev.record_type and row["record_type"]:
        ev.record_type = row["record_type"]

    if ev._waveform_key is None and row["waveform_key"]:
        try:
            ev._waveform_key = bytes.fromhex(row["waveform_key"])
        except (TypeError, ValueError) as exc:
            log.debug("ignoring bad waveform_key in DB row: %s", exc)

    # Timestamp from the ISO-8601 string in the DB row.
    if ev.timestamp is None and row["timestamp"]:
        try:
            when = datetime.datetime.fromisoformat(row["timestamp"])
            ev.timestamp = Timestamp(
                raw=b"", flag=0x10,
                year=when.year, unknown_byte=0,
                month=when.month, day=when.day,
                hour=when.hour, minute=when.minute, second=when.second,
            )
        except Exception as exc:
            log.debug("ignoring bad timestamp in DB row: %s", exc)

    # Peaks from the DB row when the decode didn't supply them.
    if ev.peak_values is None:
        ev.peak_values = PeakValues(
            tran=row["tran_ppv"],
            vert=row["vert_ppv"],
            long=row["long_ppv"],
            peak_vector_sum=row["peak_vector_sum"],
            micl=row["mic_ppv"],
        )

    # Project info from the DB row when the decode produced none, or only
    # empty fields.
    info = ev.project_info
    if info is None or all(
        value in (None, "")
        for value in (info.project, info.client, info.operator, info.sensor_location)
    ):
        ev.project_info = ProjectInfo(
            project=row["project"],
            client=row["client"],
            operator=row["operator"],
            sensor_location=row["sensor_location"],
        )


def _fix_total_samples(ev):
    """Replace an implausible ``total_samples`` with rectime * sample_rate.

    The decoder's STRT-derived value can be a buffer offset rather than a
    sample count; it is dropped when missing, more than 2x the derived
    count, or less than a quarter of it.
    """
    if not (ev.sample_rate and ev.rectime_seconds):
        return
    derived = int(round(ev.sample_rate * ev.rectime_seconds))
    if (
        ev.total_samples is None
        or ev.total_samples > derived * 2
        or ev.total_samples < derived // 4
    ):
        ev.total_samples = derived


def _read_preserved_fields(sidecar_path):
    """Return ``(review, extensions)`` from an existing sidecar, else Nones.

    Keeps user-edited review state and extensions (false_trigger flag,
    notes, etc.) so a backfill never wipes them out.
    """
    if not sidecar_path.exists():
        return None, None
    try:
        existing = event_file_io.read_sidecar(sidecar_path)
        return existing.get("review"), existing.get("extensions")
    except Exception as exc:
        log.debug("cannot preserve review state from %s: %s", sidecar_path.name, exc)
        return None, None


def main(argv=None) -> int:
    """Backfill ``.sfm.json`` sidecars and ``.h5`` files for stored events.

    Walks ``<store_root>/<serial>/<filename>`` and, for every file accepted
    by ``_looks_like_event_file``, regenerates the sidecar and (unless
    ``--skip-hdf5``) the clean-waveform ``.h5``.

    A file is skipped only when its sidecar is current (sha256 and tool
    version) AND its ``.h5`` is already present or not wanted. Previously a
    current sidecar alone caused the skip, so events with a fresh sidecar
    but no ``.h5`` were never backfilled without ``--force``.

    Args:
        argv: Argument list to parse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 when every file succeeded, 1 when at least one file failed,
        2 when the store root does not exist.
    """
    args = _build_arg_parser().parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)-7s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )

    db_path = Path(args.db_path).expanduser().resolve()
    if args.store_root:
        store_root = Path(args.store_root).expanduser().resolve()
    else:
        store_root = db_path.parent / "waveforms"
    if not store_root.exists():
        print(f"error: store root does not exist: {store_root}", file=sys.stderr)
        return 2

    store = WaveformStore(store_root)
    db = SeismoDb(db_path)
    want_hdf5 = not args.skip_hdf5

    written = skipped = errors = 0
    serial_dirs = sorted(entry for entry in store_root.iterdir() if entry.is_dir())
    for serial_dir in serial_dirs:
        serial = serial_dir.name
        for path in sorted(serial_dir.iterdir()):
            if not _looks_like_event_file(path):
                continue
            sidecar_path = store.sidecar_path_for(serial, path.name)
            hdf5_path = store.hdf5_path_for(serial, path.name)
            try:
                bw_sha = event_file_io.file_sha256(path)
            except Exception as exc:
                log.error("sha256 failed for %s: %s", path, exc)
                errors += 1
                continue

            # Skip only when nothing is left to do for this file.
            # --force bypasses the freshness check entirely.
            if (
                not args.force
                and sidecar_path.exists()
                and _sidecar_is_current(sidecar_path, bw_sha)
            ):
                if not want_hdf5 or hdf5_path.exists():
                    skipped += 1
                    continue
                log.info(
                    "sidecar %s is current but %s is missing; regenerating",
                    sidecar_path.name, hdf5_path.name,
                )

            try:
                # Decide path: A5-based (high-fidelity) or BW-only.
                ev, source_kind, a5_filename = _load_event(
                    store, serial, serial_dir, path
                )

                # Seed fields the decode could not recover from the DB row
                # that was populated at original save time.
                db_row = _fetch_event_row(db, serial, path.name)
                if db_row is not None:
                    _seed_event_from_row(ev, db_row)

                _fix_total_samples(ev)

                preserved_review, preserved_ext = _read_preserved_fields(sidecar_path)

                bw_size = path.stat().st_size
                sidecar = event_file_io.event_to_sidecar_dict(
                    ev,
                    serial=serial,
                    blastware_filename=path.name,
                    blastware_filesize=bw_size,
                    blastware_sha256=bw_sha,
                    source_kind=source_kind,
                    a5_pickle_filename=a5_filename,
                    review=preserved_review,
                    extensions=preserved_ext,
                )

                # Emit the .h5 clean-waveform file when missing OR when
                # --force was passed (so a re-backfill picks up decoder fixes).
                hdf5_action = "kept"
                if want_hdf5 and (args.force or not hdf5_path.exists()):
                    if args.dry_run:
                        hdf5_action = "would (re)write"
                    else:
                        # Sample existence BEFORE writing; checking afterwards
                        # always reported "rewrote".
                        existed_before = hdf5_path.exists()
                        try:
                            # NOTE(review): geo_range is hard-coded to
                            # "normal"; events recorded in another range
                            # would presumably be scaled wrongly — confirm
                            # whether the range can be recovered here.
                            event_hdf5.write_event_hdf5(
                                hdf5_path, ev,
                                serial=serial,
                                geo_range="normal",
                                source_kind=source_kind,
                            )
                            hdf5_action = "rewrote" if existed_before else "wrote"
                        except Exception as exc:
                            log.warning("HDF5 write failed for %s: %s", path.name, exc)
                            hdf5_action = "FAILED"

                if args.dry_run:
                    print(f" [DRY ] would write {sidecar_path.name} "
                          f"+ .h5 ({hdf5_action}) source={source_kind}")
                    written += 1
                    continue

                event_file_io.write_sidecar(sidecar_path, sidecar)

                # Best-effort: keep the SQL row's sidecar_filename in sync
                # by upserting via insert_events (it dedups on serial+ts).
                try:
                    db.insert_events(
                        [ev], serial=serial,
                        waveform_records=(
                            {ev._waveform_key.hex(): {
                                "filename": path.name,
                                "filesize": bw_size,
                                "a5_pickle_filename": a5_filename,
                                "sidecar_filename": sidecar_path.name,
                            }}
                            if ev._waveform_key else None
                        ),
                    )
                except Exception as exc:
                    log.warning("DB upsert failed for %s: %s", path.name, exc)

                print(f" [OK ] {path.name} → {sidecar_path.name} "
                      f"+ h5 ({hdf5_action}) source={source_kind}")
                written += 1

            except Exception as exc:
                # One bad file must not abort the whole backfill run.
                log.error("backfill failed for %s: %s", path, exc, exc_info=args.verbose)
                errors += 1

    print(f"\nDone. written={written} skipped(uptodate)={skipped} errors={errors}")
    return 0 if errors == 0 else 1
|
|
|
|
|
|
# Script entry point: run the backfill and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|