""" minimateplus/event_file_io.py — modern event-file (.sfm.json sidecar) IO. This module is the single home for event-file conversion code that doesn't fit cleanly inside `blastware_file.py` (which is the BW binary codec): - sidecar JSON read/write (the modern per-event metadata file) - read_blastware_file() — reverse of write_blastware_file, used by the BW-importer flow when SFM is ingesting files produced by Blastware's own ACH (where the source A5 frames aren't available). Sidecar schema v1 layout — see docs in the project plan or the schema declared in `event_to_sidecar_dict()`. """ from __future__ import annotations import base64 import datetime import hashlib import json import logging import os import struct from pathlib import Path from typing import Optional, Union from .models import Event, PeakValues, ProjectInfo, Timestamp from . import blastware_file as _bw # avoid circular reference at module load log = logging.getLogger(__name__) # Schema version for the sidecar JSON. Bump when fields change shape. # Older readers must reject anything > SCHEMA_VERSION; newer fields added # inside `extensions` are forward-compatible without a bump. SCHEMA_VERSION = 1 SIDECAR_KIND = "sfm.event" # Default tool_version stamp; callers can override. Hard-coded here # rather than read via importlib.metadata because the latter reflects the # *installed* dist-info, which doesn't update when pyproject.toml is # bumped without a `pip install` re-run — leading to confusing stale # version stamps in sidecars. Bump this constant and CHANGELOG.md # together at release time. TOOL_VERSION = "0.15.0" try: # Best-effort: prefer the installed metadata when it's NEWER than the # baked-in constant (e.g. a downstream packager bumped the wheel # without editing this file). Otherwise fall back to TOOL_VERSION. from importlib.metadata import version as _pkg_version _meta_v = _pkg_version("seismo-relay") def _vtuple(s): try: return tuple(int(p) for p in s.split(".")[:3]) except Exception: return (0, 0, 0) _TOOL_VERSION_DEFAULT = ( _meta_v if _vtuple(_meta_v) > _vtuple(TOOL_VERSION) else TOOL_VERSION ) except Exception: _TOOL_VERSION_DEFAULT = TOOL_VERSION # ── Sidecar dict construction ───────────────────────────────────────────────── def _ts_iso(ts: Optional[Timestamp]) -> Optional[str]: if ts is None: return None try: return datetime.datetime( ts.year, ts.month, ts.day, ts.hour or 0, ts.minute or 0, ts.second or 0, ).isoformat() except Exception: return str(ts) def _peak_values_to_dict(pv: Optional[PeakValues]) -> dict: if pv is None: return { "transverse": None, "vertical": None, "longitudinal": None, "vector_sum": None, "mic_psi": None, } return { "transverse": pv.tran, "vertical": pv.vert, "longitudinal": pv.long, "vector_sum": pv.peak_vector_sum, "mic_psi": pv.micl, } def _project_info_to_dict(pi: Optional[ProjectInfo]) -> dict: if pi is None: return { "project": None, "client": None, "operator": None, "sensor_location": None, } return { "project": pi.project, "client": pi.client, "operator": pi.operator, "sensor_location": pi.sensor_location, } def event_to_sidecar_dict( event: Event, *, serial: str, blastware_filename: str, blastware_filesize: int, blastware_sha256: str, source_kind: str = "sfm-live", a5_pickle_filename: Optional[str] = None, tool_version: str = _TOOL_VERSION_DEFAULT, captured_at: Optional[datetime.datetime] = None, review: Optional[dict] = None, extensions: Optional[dict] = None, ) -> dict: """ Build a v1 sidecar dict from an Event + the surrounding metadata. Pure helper — no file I/O. Callers stitch the result into a sidecar via `write_sidecar()` (or POST it back via the PATCH endpoint). """ if source_kind not in {"sfm-live", "sfm-ach", "bw-import"}: raise ValueError(f"unknown source_kind: {source_kind!r}") captured_at = captured_at or datetime.datetime.utcnow() # Stash raw 0C record bytes in `extensions.raw_records` so future # field-decoding work (Peak Acceleration, ZC Freq, Time of Peak, # sensor self-check results, etc.) can run offline against committed # sidecars without a live device. Cheap (~280 bytes base64) and # forward-compatible (older readers ignore unknown extensions keys). ext_dict: dict = dict(extensions) if extensions else {} raw_0c = getattr(event, "_raw_record", None) if raw_0c: rr = ext_dict.setdefault("raw_records", {}) # Don't clobber a raw_0c that callers explicitly passed in via # `extensions=...` (e.g. round-trip preservation in patch_sidecar). rr.setdefault("waveform_record_b64", base64.b64encode(raw_0c).decode("ascii")) rr.setdefault("waveform_record_len", len(raw_0c)) return { "schema_version": SCHEMA_VERSION, "kind": SIDECAR_KIND, "event": { "serial": serial, "timestamp": _ts_iso(event.timestamp), "waveform_key": event._waveform_key.hex() if event._waveform_key else None, "record_type": event.record_type, "sample_rate": event.sample_rate, "rectime_seconds": event.rectime_seconds, "total_samples": event.total_samples, "pretrig_samples": event.pretrig_samples, }, "peak_values": _peak_values_to_dict(event.peak_values), "project_info": _project_info_to_dict(event.project_info), "blastware": { "filename": blastware_filename, "filesize": blastware_filesize, "sha256": blastware_sha256, "available": True, }, "source": { "kind": source_kind, "captured_at": captured_at.isoformat() + "Z" if captured_at.tzinfo is None else captured_at.isoformat(), "tool_version": tool_version, "a5_pickle_filename": a5_pickle_filename, }, "review": review or { "false_trigger": False, "reviewer": None, "reviewed_at": None, "notes": "", }, "extensions": ext_dict, } # ── Sidecar IO ──────────────────────────────────────────────────────────────── def write_sidecar(path: Union[str, Path], data: dict) -> None: """ Atomic write of a sidecar dict to . Validates schema_version is supported before writing so we don't silently drop a future-format sidecar over the wire. """ path = Path(path) sv = data.get("schema_version") if not isinstance(sv, int) or sv < 1 or sv > SCHEMA_VERSION: raise ValueError( f"write_sidecar: unsupported schema_version={sv!r} " f"(this build supports 1..{SCHEMA_VERSION})" ) tmp = path.with_suffix(path.suffix + ".tmp") with tmp.open("w", encoding="utf-8") as f: json.dump(data, f, indent=2, sort_keys=False, default=str) f.write("\n") f.flush() os.fsync(f.fileno()) os.replace(tmp, path) def read_sidecar(path: Union[str, Path]) -> dict: """ Load a sidecar JSON file. Raises FileNotFoundError if missing, ValueError on bad shape / unsupported schema_version. Unknown keys at the top level are preserved in the returned dict (forward-compat). """ path = Path(path) with path.open("r", encoding="utf-8") as f: data = json.load(f) if not isinstance(data, dict): raise ValueError(f"sidecar at {path}: top-level is not a JSON object") sv = data.get("schema_version") if not isinstance(sv, int) or sv < 1: raise ValueError(f"sidecar at {path}: missing or invalid schema_version") if sv > SCHEMA_VERSION: raise ValueError( f"sidecar at {path}: schema_version={sv} > supported {SCHEMA_VERSION}; " "upgrade seismo-relay to read this file" ) if data.get("kind") != SIDECAR_KIND: raise ValueError(f"sidecar at {path}: unexpected kind={data.get('kind')!r}") return data def patch_sidecar( path: Union[str, Path], *, review: Optional[dict] = None, extensions: Optional[dict] = None, reviewer_now: bool = True, ) -> dict: """ Atomically apply a JSON-merge-patch to a sidecar file's `review` and/or `extensions` blocks. Other top-level keys are untouched. `review_now`: when True (default) and `review` is non-empty, stamps `review.reviewed_at` with the current UTC time so the review-time is auditable without the caller having to pass it. Returns the new full sidecar dict. """ path = Path(path) data = read_sidecar(path) if review: merged = dict(data.get("review") or {}) merged.update({k: v for k, v in review.items() if v is not None or k in merged}) if reviewer_now: merged["reviewed_at"] = datetime.datetime.utcnow().isoformat() + "Z" data["review"] = merged if extensions: merged_ext = dict(data.get("extensions") or {}) merged_ext.update(extensions) data["extensions"] = merged_ext write_sidecar(path, data) return data def sidecar_path_for(blastware_path: Union[str, Path]) -> Path: """Convention: .sfm.json sits next to the BW binary.""" p = Path(blastware_path) return p.with_name(p.name + ".sfm.json") def file_sha256(path: Union[str, Path], chunk_size: int = 65536) -> str: """Compute SHA-256 of a file as a hex string.""" h = hashlib.sha256() with open(path, "rb") as f: while True: chunk = f.read(chunk_size) if not chunk: break h.update(chunk) return h.hexdigest() # ── Blastware-file reader ───────────────────────────────────────────────────── # # Reverse of `blastware_file.write_blastware_file`. Used by the BW-import # flow to ingest files produced by Blastware's own ACH (where the source # A5 frames are not available). # # File structure (recap): # [22B header] [21B STRT record] [body bytes] [26B footer] # # The body holds: # - 6B preamble (00 00 ff ff ff ff) immediately after the STRT # - 4-channel interleaved int16 LE samples # - Embedded ASCII metadata strings (Project: / Client: / User Name: / # Seis Loc: / Extended Notes) from the device's session-start config # # The 0C waveform record (per-event peaks, project name) is NOT in the # BW file — those are computed by the device firmware and only carried # in the live SUB 0C response. read_blastware_file() therefore computes # peaks from the raw samples assuming Normal-range (10 in/s full-scale) # geophone sensitivity. Imported events surface that assumption via the # sidecar's `peak_values.computed_from_samples` flag. # Geophone scale factor, in/s per ADC unit, for Normal range (10 in/s FS). # Confirmed from CLAUDE.md (geo_hardware_constant = 6.206053 in/s per V, # ADC full-scale = 1.61133 V Normal range = 10.0 in/s peak; per-count # resolution ≈ 10.0 / 32768). _GEO_NORMAL_FS_INS = 10.0 _GEO_SENSITIVE_FS_INS = 1.250 _INT16_FS = 32768.0 # Microphone scale factor, psi per ADC count. Approximate — exact factor # depends on the geophone-vs-mic ADC scaling and the firmware reference. # We mark mic_psi as "computed approximate" in the sidecar. _MIC_FS_PSI = 0.0125 / _INT16_FS # ~0.5 psi full-scale assumption def _decode_strt(strt: bytes) -> dict: """ Decode the 21-byte STRT record from a BW file. Returns dict with waveform_key (4B), total_samples, pretrig_samples, rectime_seconds. Falls back to None on truncated/missing fields. """ if len(strt) < 21 or strt[0:4] != b"STRT": return {} return { "waveform_key": strt[6:10].hex(), "total_samples": struct.unpack_from(">H", strt, 8)[0], "pretrig_samples": struct.unpack_from(">H", strt, 16)[0], "rectime_seconds": strt[18], } def _find_first_string(buf: bytes, label: bytes, max_len: int = 256) -> Optional[str]: """ Search `buf` for `label` (e.g. b"Project:") and return the null-terminated ASCII string that follows, stripped. """ pos = buf.find(label) if pos < 0: return None start = pos + len(label) end = buf.find(b"\x00", start, start + max_len) if end < 0: end = start + max_len text = buf[start:end].decode("ascii", errors="replace").strip() return text or None def _decode_samples_4ch_int16_le(stream: bytes) -> dict[str, list[int]]: """ Decode a 4-channel interleaved int16 LE byte stream into per-channel lists. Channels are [Tran, Vert, Long, Mic] = [ch0, ch1, ch2, ch3]. Truncates to a multiple of 8 bytes (one full sample-set). """ n_complete = (len(stream) // 8) * 8 if n_complete == 0: return {"Tran": [], "Vert": [], "Long": [], "MicL": []} fmt = "<" + "h" * (n_complete // 2) flat = list(struct.unpack(fmt, stream[:n_complete])) return { "Tran": flat[0::4], "Vert": flat[1::4], "Long": flat[2::4], "MicL": flat[3::4], } def _peaks_from_samples(samples: dict[str, list[int]]) -> PeakValues: """ Compute approximate peaks from raw int16 samples assuming Normal-range geophone sensitivity. Used by the BW-importer when the 0C waveform record (the device's authoritative peaks) is unavailable. """ def _peak_ins(ch: list[int]) -> float: if not ch: return 0.0 m = max(abs(int(v)) for v in ch) return m / _INT16_FS * _GEO_NORMAL_FS_INS tran = _peak_ins(samples.get("Tran", [])) vert = _peak_ins(samples.get("Vert", [])) long_ = _peak_ins(samples.get("Long", [])) # Mic in psi (approximate) mic_ch = samples.get("MicL", []) or [] mic = max((abs(int(v)) for v in mic_ch), default=0) * _MIC_FS_PSI # Peak vector sum: max over time of sqrt(T^2 + V^2 + L^2) pvs = 0.0 n = min(len(samples.get("Tran", [])), len(samples.get("Vert", [])), len(samples.get("Long", []))) if n: scale = _GEO_NORMAL_FS_INS / _INT16_FS T = samples["Tran"]; V = samples["Vert"]; L = samples["Long"] for i in range(n): t = T[i] * scale v = V[i] * scale l = L[i] * scale mag = (t*t + v*v + l*l) ** 0.5 if mag > pvs: pvs = mag return PeakValues( tran=tran, vert=vert, long=long_, peak_vector_sum=pvs, micl=mic, ) def read_blastware_file(path: Union[str, Path]) -> Event: """ Parse a Blastware waveform file into an Event. Recovers: - waveform_key, rectime_seconds, total_samples, pretrig_samples (from the STRT record) - timestamp (from the footer's start-time field) - project_info (from ASCII labels embedded in the body) - raw_samples (Tran/Vert/Long/MicL int16 lists) - peak_values (computed from raw_samples; approximate — see notes on _peaks_from_samples about Normal-range assumption) Does NOT recover the source A5 frames (they aren't in the BW file). The returned Event has `_a5_frames = None`, signalling that byte-for-byte regeneration of the BW file from this Event alone is not possible — the on-disk BW file IS the byte-for-byte source. """ path = Path(path) raw = path.read_bytes() if len(raw) < _bw._WAVEFORM_HEADER_SIZE + 21 + 26: raise ValueError(f"{path}: file too short ({len(raw)} bytes) to be a BW event") # Header: validate magic prefix. header = raw[:_bw._WAVEFORM_HEADER_SIZE] if not header.startswith(_bw._FILE_HEADER_PREFIX): raise ValueError(f"{path}: not a Blastware file (bad header prefix)") # STRT record: 21 bytes immediately after the header. strt_raw = raw[_bw._WAVEFORM_HEADER_SIZE : _bw._WAVEFORM_HEADER_SIZE + 21] strt_fields = _decode_strt(strt_raw) if not strt_fields: raise ValueError(f"{path}: STRT record missing or malformed") # Footer: locate the 0e 08 marker, validating the year is in a sane range. body_start = _bw._WAVEFORM_HEADER_SIZE + 21 footer_pos = -1 pos = body_start while True: pos = raw.find(b"\x0e\x08", pos) if pos < 0 or pos + 26 > len(raw): break yr = (raw[pos + 4] << 8) | raw[pos + 5] if 2015 <= yr <= 2050: footer_pos = pos break pos += 1 if footer_pos < 0 and len(raw) >= 26: footer_pos = len(raw) - 26 if footer_pos < body_start: raise ValueError(f"{path}: footer not found") body = raw[body_start : footer_pos] footer = raw[footer_pos : footer_pos + 26] # Footer layout: # [0:2] 0e 08 marker # [2:10] ts1 (start) BE 8B # [10:18] ts2 (stop) BE 8B # [18:24] 00 01 00 02 00 00 # [24:26] crc ts1 = _bw._decode_ts_be(footer[2:10]) ts2 = _bw._decode_ts_be(footer[10:18]) # Body: first 6 bytes are the preamble (00 00 ff ff ff ff). Strip # them before decoding samples. Any trailing tail past the last # full sample-set is silently truncated by _decode_samples_4ch. sample_bytes = body[6:] if body[:6].hex() in ("0000ffffffff", "0000FFFFFFFF") else body samples = _decode_samples_4ch_int16_le(sample_bytes) # Metadata strings (label-anchored search across the body). project = _find_first_string(body, b"Project:") client = _find_first_string(body, b"Client:") user = _find_first_string(body, b"User Name:") seisloc = _find_first_string(body, b"Seis Loc:") # Build the Event. ev = Event(index=-1) if strt_fields.get("waveform_key"): ev._waveform_key = bytes.fromhex(strt_fields["waveform_key"]) ev.record_type = "Waveform" ev.rectime_seconds = strt_fields.get("rectime_seconds") ev.total_samples = strt_fields.get("total_samples") ev.pretrig_samples = strt_fields.get("pretrig_samples") if ts1 is not None: ev.timestamp = Timestamp( raw=footer[2:10], flag=0x10, year=ts1.year, unknown_byte=0, month=ts1.month, day=ts1.day, hour=ts1.hour, minute=ts1.minute, second=ts1.second, ) ev.project_info = ProjectInfo( project=project, client=client, operator=user, sensor_location=seisloc, ) ev.raw_samples = samples ev.peak_values = _peaks_from_samples(samples) ev._a5_frames = None # not recoverable from BW file return ev