feat: v0.15.0
### Added
- **Layered event storage architecture.** Each event now lands as four
files in the per-serial waveform store, each with a clear role:
- `<filename>` — the Blastware-readable binary (BW file). Untouched.
- `<filename>.a5.pkl` — the raw 5A frames (regenerative source).
- `<filename>.h5` — clean per-channel waveform arrays in physical
units (in/s for geo, psi for mic) plus event metadata (HDF5 with
gzip compression). This is the canonical format for downstream
analysis tools.
- `<filename>.sfm.json` — the modern review/metadata sidecar (peaks,
project, source provenance, review state, extensions).
SQLite (`seismo_relay.db`) is the searchable index over all four.
- **Plot-ready waveform JSON (`sfm.plot.v1`).** The `/device/event/{idx}/waveform`
and `/db/events/{id}/waveform.json` endpoints now return samples in
physical units with explicit time-axis metadata, peak markers, and
per-channel unit hints — no more guessing the ADC-to-velocity scale
client-side. The webapp waveform viewer was rewritten to consume
this shape.
- **In-app waveform viewer accuracy fix.** The standalone SFM webapp
viewer was scaling geophone amplitudes by `geoAdcScale / 32767`
(≈ 6.206 / 32767), where `geoAdcScale = 6.206053` is the device's
*in/s per V* hardware constant — not the ADC-counts-to-velocity
factor. This silently scaled every plot ~38% too low for Normal-range
geophones (the correct full-scale is 10.0 in/s, or 1.25 in/s for
Sensitive). Conversion is now done server-side using the geo_range
from compliance config; the client just plots.
- New `sfm/event_hdf5.py` module: `write_event_hdf5()`,
`read_event_hdf5()`, plus a plot-JSON helper.
- Backfill script extended to also emit `.h5` for existing events.
### Dependencies
- Added `h5py>=3.10` and `numpy>=1.24` for the HDF5 storage layer.
- Added `python-multipart>=0.0.7` (required by FastAPI for the
`/db/import/blastware_file` endpoint introduced in this release).
This commit is contained in:
@@ -0,0 +1,346 @@
|
||||
"""
|
||||
scripts/backfill_sidecars.py — generate .sfm.json sidecars AND .h5
|
||||
clean-waveform files for existing events already in the waveform store
|
||||
that predate those features.
|
||||
|
||||
Walks `<store_root>/<serial>/<filename>` and for each BW event file:
|
||||
|
||||
Sidecar (.sfm.json):
|
||||
- Skip when an existing sidecar's blastware.sha256 matches the
|
||||
current BW file's sha256.
|
||||
- Else regenerate: prefer .a5.pkl (full fidelity); fall back to
|
||||
parsing the BW binary directly (peaks computed from samples).
|
||||
|
||||
Clean waveform (.h5):
|
||||
- Skip when <filename>.h5 already exists (idempotent).
|
||||
- Else write from .a5.pkl (preferred) or BW binary parse (fallback).
|
||||
|
||||
Usage:
|
||||
python scripts/backfill_sidecars.py [--store-root PATH]
|
||||
[--db-path PATH]
|
||||
[--dry-run]
|
||||
[--skip-hdf5]
[--force]
|
||||
[-v]
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import logging
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Allow running from the repo root without installation.
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from minimateplus import event_file_io
|
||||
from sfm import event_hdf5
|
||||
from sfm.waveform_store import WaveformStore, _frame_to_dict, _dict_to_frame # noqa: F401
|
||||
from sfm.database import SeismoDb
|
||||
|
||||
log = logging.getLogger("backfill_sidecars")
|
||||
|
||||
|
||||
def _looks_like_event_file(path: Path) -> bool:
|
||||
"""Same heuristic as the importer CLI."""
|
||||
if not path.is_file():
|
||||
return False
|
||||
if path.name.endswith((".a5.pkl", ".sfm.json")):
|
||||
return False
|
||||
ext = path.suffix.lstrip(".")
|
||||
if not (3 <= len(ext) <= 4):
|
||||
return False
|
||||
if not (ext[-1].upper() in {"W", "H"} or ext.endswith("0")):
|
||||
return False
|
||||
try:
|
||||
return path.stat().st_size >= 70
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _version_tuple(value) -> tuple:
    """Parse a dotted version string into a 3-int tuple; (0, 0, 0) if unparseable."""
    try:
        parts = tuple(int(piece) for piece in str(value).split(".")[:3])
    except ValueError:
        return (0, 0, 0)
    # Pad so "0.15" compares equal to "0.15.0" instead of sorting below it.
    return parts + (0,) * (3 - len(parts))


def _sidecar_is_fresh(sidecar_path: Path, bw_sha: str) -> bool:
    """Return True when the existing sidecar matches the BW file and tool version.

    Two-part freshness check:
      1. blastware.sha256 must match the current BW file (proves the
         sidecar describes THIS file).
      2. source.tool_version must be >= the current TOOL_VERSION (proves
         the sidecar was written by a build that includes any decoder
         fixes shipped since).
    An unreadable or malformed sidecar counts as stale.
    """
    try:
        existing = event_file_io.read_sidecar(sidecar_path)
        sha_ok = existing.get("blastware", {}).get("sha256") == bw_sha
        src_ver = existing.get("source", {}).get("tool_version", "")
    except Exception as exc:
        log.debug("sidecar %s unreadable (%s); regenerating", sidecar_path.name, exc)
        return False

    ver_ok = _version_tuple(src_ver) >= _version_tuple(event_file_io.TOOL_VERSION)
    if sha_ok and not ver_ok:
        log.info(
            "regenerating %s (sidecar tool_version=%s < current %s)",
            sidecar_path.name, src_ver or "(none)",
            event_file_io.TOOL_VERSION,
        )
    return sha_ok and ver_ok


def _load_event(store, serial: str, path: Path, a5_path: Path) -> tuple:
    """Rebuild an Event for *path*; returns ``(event, source_kind, a5_filename)``.

    Prefers replaying the saved A5 frames and falls back to parsing the BW
    binary when no ``.a5.pkl`` exists.
    """
    if not a5_path.exists():
        return event_file_io.read_blastware_file(path), "bw-import", None

    # NOTE(review): the ".a5.pkl" name suggests load_a5 unpickles the file;
    # only run this against a store you trust.
    frames = store.load_a5(serial, path.name)
    if not frames:
        raise RuntimeError("a5_pickle present but unreadable")

    # Build an Event by replaying the A5 decoders.  The .a5.pkl alone CANNOT
    # recover timestamp / record_type / waveform_key / per-channel peaks;
    # those live in the 0C record, which isn't saved separately.  The caller
    # seeds them from the DB row afterwards.
    from minimateplus.client import (
        _decode_a5_metadata_into,
        _decode_a5_waveform,
    )
    from minimateplus.models import Event

    ev = Event(index=-1)
    _decode_a5_metadata_into(frames, ev)
    _decode_a5_waveform(frames, ev)
    return ev, "sfm-live", a5_path.name


def _fetch_db_row(db, serial: str, filename: str):
    """Best-effort lookup of the events row for one file; None on miss or failure."""
    import sqlite3
    from contextlib import closing

    try:
        # sqlite3's own context manager only commits/rolls back; closing()
        # makes sure the per-event connection is actually released.
        with closing(sqlite3.connect(str(db.db_path))) as conn:
            conn.row_factory = sqlite3.Row
            return conn.execute(
                "SELECT * FROM events "
                "WHERE serial=? AND blastware_filename=? "
                "LIMIT 1",
                (serial, filename),
            ).fetchone()
    except Exception as exc:
        log.debug("DB lookup failed for %s: %s", filename, exc)
        return None


def _seed_event_from_db_row(ev, db_row) -> None:
    """Fill fields the decode left empty on *ev* from the stored events row.

    Only missing fields are filled, so the regenerated sidecar matches what
    was there before the backfill ran.
    """
    import datetime

    from minimateplus.models import PeakValues, ProjectInfo, Timestamp

    if ev.sample_rate is None and db_row["sample_rate"]:
        ev.sample_rate = int(db_row["sample_rate"])
    if not ev.record_type and db_row["record_type"]:
        ev.record_type = db_row["record_type"]
    if ev._waveform_key is None and db_row["waveform_key"]:
        try:
            ev._waveform_key = bytes.fromhex(db_row["waveform_key"])
        except (TypeError, ValueError) as exc:
            log.debug("ignoring bad waveform_key in DB row: %s", exc)

    # Timestamp from the ISO-8601 string in the DB row.
    if ev.timestamp is None and db_row["timestamp"]:
        try:
            when = datetime.datetime.fromisoformat(db_row["timestamp"])
            ev.timestamp = Timestamp(
                raw=b"", flag=0x10,
                year=when.year, unknown_byte=0,
                month=when.month, day=when.day,
                hour=when.hour, minute=when.minute, second=when.second,
            )
        except Exception as exc:
            log.debug("ignoring bad timestamp in DB row: %s", exc)

    # Peaks from the DB row when the decode didn't supply them.
    if ev.peak_values is None:
        ev.peak_values = PeakValues(
            tran=db_row["tran_ppv"],
            vert=db_row["vert_ppv"],
            long=db_row["long_ppv"],
            peak_vector_sum=db_row["peak_vector_sum"],
            micl=db_row["mic_ppv"],
        )

    # Project info from the DB row when the metadata decode left it blank.
    info = ev.project_info
    if info is None or all(
        value in (None, "")
        for value in (info.project, info.client, info.operator, info.sensor_location)
    ):
        ev.project_info = ProjectInfo(
            project=db_row["project"],
            client=db_row["client"],
            operator=db_row["operator"],
            sensor_location=db_row["sensor_location"],
        )


def _reconcile_total_samples(ev) -> None:
    """Replace an implausible total_samples with sample_rate * rectime_seconds.

    The decoder's STRT-derived value can be a buffer offset rather than a
    sample count; anything missing, more than 2x, or under a quarter of the
    derived count is overwritten.
    """
    if not (ev.sample_rate and ev.rectime_seconds):
        return
    derived = int(round(ev.sample_rate * ev.rectime_seconds))
    if (ev.total_samples is None
            or ev.total_samples > derived * 2
            or ev.total_samples < derived // 4):
        ev.total_samples = derived


def _preserved_sidecar_fields(sidecar_path: Path) -> tuple:
    """Return ``(review, extensions)`` from an existing sidecar, else ``(None, None)``.

    These carry user-edited state (false_trigger flag, notes, etc.) that a
    backfill should not wipe out.
    """
    if not sidecar_path.exists():
        return None, None
    try:
        existing = event_file_io.read_sidecar(sidecar_path)
        return existing.get("review"), existing.get("extensions")
    except Exception as exc:
        log.warning(
            "could not read existing sidecar %s (%s); review state and "
            "extensions will not be carried over",
            sidecar_path.name, exc,
        )
        return None, None


def main(argv=None) -> int:
    """Backfill ``.sfm.json`` sidecars and ``.h5`` files for stored events.

    Walks every ``<store_root>/<serial>/`` directory and, for each file that
    looks like a BW event, regenerates the sidecar when it is stale and
    writes the ``.h5`` when it is missing.  The two artefacts are decided
    independently, so an event with an up-to-date sidecar still gets its
    missing ``.h5``.  ``--force`` regenerates both.

    Args:
        argv: Argument list for argparse; ``None`` means ``sys.argv[1:]``.

    Returns:
        0 when every event was processed cleanly, 1 when at least one event
        failed, 2 when the store root does not exist.
    """
    parser = argparse.ArgumentParser(
        description=__doc__,
        # Keep the module docstring's usage layout intact in --help output.
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument(
        "--db-path",
        default=str(Path(__file__).resolve().parent.parent / "bridges" / "captures" / "seismo_relay.db"),
        help="Path to the SQLite index database.",
    )
    parser.add_argument(
        "--store-root", default=None,
        help="Waveform store root (default: 'waveforms' next to the database).",
    )
    parser.add_argument(
        "--dry-run", action="store_true",
        help="Report what would be written without writing anything.",
    )
    parser.add_argument(
        "--skip-hdf5", action="store_true",
        help="Don't generate .h5 clean-waveform files (only sidecars).",
    )
    parser.add_argument(
        "--force", action="store_true",
        help=(
            "Regenerate sidecars + .h5 even when an existing sidecar's "
            "blastware.sha256 matches the current BW file. Use this after "
            "upgrading seismo-relay to pull in decoder bug fixes (e.g. the "
            "STRT-rectime byte-offset fix in v0.15.x)."
        ),
    )
    parser.add_argument("-v", "--verbose", action="store_true", help="Enable debug logging.")
    args = parser.parse_args(argv)

    logging.basicConfig(
        level=logging.DEBUG if args.verbose else logging.INFO,
        format="%(asctime)s %(levelname)-7s %(name)s %(message)s",
        datefmt="%H:%M:%S",
    )

    db_path = Path(args.db_path).expanduser().resolve()
    store_root = (
        Path(args.store_root).expanduser().resolve()
        if args.store_root else db_path.parent / "waveforms"
    )
    if not store_root.exists():
        print(f"error: store root does not exist: {store_root}", file=sys.stderr)
        return 2

    store = WaveformStore(store_root)
    db = SeismoDb(db_path)

    written = skipped = errors = 0
    for serial_dir in sorted(d for d in store_root.iterdir() if d.is_dir()):
        serial = serial_dir.name
        for path in sorted(serial_dir.iterdir()):
            if not _looks_like_event_file(path):
                continue
            sidecar_path = store.sidecar_path_for(serial, path.name)
            hdf5_path = store.hdf5_path_for(serial, path.name)
            try:
                bw_sha = event_file_io.file_sha256(path)
            except Exception as exc:
                log.error("sha256 failed for %s: %s", path, exc)
                errors += 1
                continue

            # Decide each artefact on its own.  --force bypasses both checks.
            sidecar_fresh = (
                not args.force
                and sidecar_path.exists()
                and _sidecar_is_fresh(sidecar_path, bw_sha)
            )
            # Capture existence BEFORE any write so the wrote/rewrote label
            # reported below is truthful.
            h5_existed = hdf5_path.exists()
            need_h5 = not args.skip_hdf5 and (args.force or not h5_existed)
            if sidecar_fresh and not need_h5:
                skipped += 1
                continue

            try:
                # A5-based (high-fidelity) when available, BW-only otherwise.
                a5_path = serial_dir / f"{path.name}.a5.pkl"
                ev, source_kind, a5_filename = _load_event(store, serial, path, a5_path)

                # Seed fields the decode could not recover from the events
                # row written at original save time.
                db_row = _fetch_db_row(db, serial, path.name)
                if db_row is not None:
                    _seed_event_from_db_row(ev, db_row)
                _reconcile_total_samples(ev)

                # Only rebuild the sidecar when it is actually stale; a fresh
                # one is left untouched while its missing .h5 is produced.
                sidecar = None
                if not sidecar_fresh:
                    preserved_review, preserved_ext = _preserved_sidecar_fields(sidecar_path)
                    sidecar = event_file_io.event_to_sidecar_dict(
                        ev,
                        serial=serial,
                        blastware_filename=path.name,
                        blastware_filesize=path.stat().st_size,
                        blastware_sha256=bw_sha,
                        source_kind=source_kind,
                        a5_pickle_filename=a5_filename,
                        review=preserved_review,
                        extensions=preserved_ext,
                    )

                hdf5_action = "kept"
                if need_h5:
                    if args.dry_run:
                        hdf5_action = "would (re)write"
                    else:
                        try:
                            # NOTE(review): geo_range is fixed to "normal" here;
                            # events recorded on another range would need the
                            # real value - confirm where it can be sourced.
                            event_hdf5.write_event_hdf5(
                                hdf5_path, ev,
                                serial=serial,
                                geo_range="normal",
                                source_kind=source_kind,
                            )
                            hdf5_action = "rewrote" if h5_existed else "wrote"
                        except Exception as exc:
                            log.warning("HDF5 write failed for %s: %s", path.name, exc)
                            hdf5_action = "FAILED"

                if args.dry_run:
                    verb = "would write" if sidecar is not None else "would keep"
                    print(f" [DRY ] {verb} {sidecar_path.name} "
                          f"+ .h5 ({hdf5_action}) source={source_kind}")
                    written += 1
                    continue

                if sidecar is None:
                    # .h5-only pass: nothing else is written, so a failed
                    # HDF5 write means this event produced no output at all.
                    if hdf5_action == "FAILED":
                        errors += 1
                        continue
                    print(f" [OK ] {path.name} → {sidecar_path.name} (kept) "
                          f"+ h5 ({hdf5_action}) source={source_kind}")
                    written += 1
                    continue

                event_file_io.write_sidecar(sidecar_path, sidecar)

                # Best-effort: keep the SQL row's sidecar_filename in sync
                # by upserting via insert_events (it dedups on serial+ts).
                try:
                    db.insert_events(
                        [ev], serial=serial,
                        waveform_records=(
                            {ev._waveform_key.hex(): {
                                "filename": path.name,
                                "filesize": path.stat().st_size,
                                "a5_pickle_filename": a5_filename,
                                "sidecar_filename": sidecar_path.name,
                            }}
                            if ev._waveform_key else None
                        ),
                    )
                except Exception as exc:
                    log.warning("DB upsert failed for %s: %s", path.name, exc)

                print(f" [OK ] {path.name} → {sidecar_path.name} "
                      f"+ h5 ({hdf5_action}) source={source_kind}")
                written += 1

            except Exception as exc:
                log.error("backfill failed for %s: %s", path, exc, exc_info=args.verbose)
                errors += 1

    print(f"\nDone. written={written} skipped(uptodate)={skipped} errors={errors}")
    return 0 if errors == 0 else 1
|
||||
|
||||
|
||||
# Script entry point: run the backfill and use main()'s return value as
# the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
Reference in New Issue
Block a user