feat: add thor/micromate compatibility v0.18.0

2026-05-19 04:32:43 +00:00
parent 512d82c720
commit cd20be2eff
7 changed files with 839 additions and 2 deletions
@@ -0,0 +1,291 @@
+"""
+sfm/idf_ascii_report.py — parse Thor (Micromate Series IV) IDF ASCII reports.
+
+Thor exports a `.IDFW.txt` or `.IDFH.txt` sidecar next to each `.IDFW`
+(waveform) or `.IDFH` (histogram) event binary.  Each sidecar is a
+plain-text file with `"Key : Value"` lines covering the full device-
+authoritative event metadata — PPV per channel, ZC Freq, Time of Peak,
+Peak Acceleration / Displacement, sensor self-check results, project
+strings, calibration date, battery level, etc. — followed by a raw
+waveform-samples block headed by the literal line "Waveform Data Channels".
+
+This is the Thor analogue of `minimateplus/bw_ascii_report.py` for the
+Blastware (Series III) report format.  The parser is intentionally
+permissive: we extract everything we recognise into a flat dict and
+silently ignore anything we don't.  Downstream callers parse units
+(`"0.2119 in/s"` → 0.2119) only on the fields they need.
+
+Example input (truncated):
+
+    "EventType : Full Waveform"
+    "SampleRate : 1024 sps"
+    "EventTime : 16:27:23"
+    "EventDate : 2023-12-19"
+    "TranPPV : 0.0251 in/s"
+    "VertPPV : 0.2119 in/s"
+    "LongPPV : 0.0282 in/s"
+    "PeakVectorSum : 0.2131 in/s"
+    "MicPSPL : 99.4 dB(L)"
+    "TranZCFreq : 6.5 Hz"
+    "SerialNumber : UM11719"
+    "Version : Micromate ISEE 11.0AK"
+    "FileName : UM11719_20231219162723.IDFW"
+    "BatteryLevel : 3.8 volts"
+    "Calibration : November 22, 2023 by Instantel"
+    "TranTestResults : Passed"
+    "TitleString1 : UPMC Presby-Loc 3-Level1-1R Elevator Rm"
+    Waveform Data Channels
+        Tran    Vert    Long    MicL
+        0.0003  -0.0003  0.0003  0.00013
+        ...
+"""
+
+from __future__ import annotations
+
+import datetime
+import re
+from typing import Any, Dict, Optional, Tuple, Union
+
+
+# Lines look like:  "Key : Value"   (quotes literal, single ":" separator)
+_LINE_RE = re.compile(r'^\s*"?([^":]+?)"?\s*:\s*"?(.*?)"?\s*$')
+
+# Marker that ends the metadata block — everything after is raw sample data.
+_WAVEFORM_BLOCK_MARKER = "waveform data channels"
+
+
+def _normalize_key(raw: str) -> str:
+    """Convert "TranPPV" / "PreTriggerLength" → snake_case."""
+    s = raw.strip()
+    # Insert underscore between lower→upper / digit→letter transitions
+    s = re.sub(r"(?<=[a-z0-9])(?=[A-Z])", "_", s)
+    s = re.sub(r"(?<=[A-Z])(?=[A-Z][a-z])", "_", s)
+    s = s.replace("-", "_").replace(" ", "_")
+    return s.lower()
+
+
+def _strip_unit_suffix(value: str) -> str:
+    """Return the numeric part of values like "0.2119 in/s" → "0.2119"."""
+    parts = value.strip().split()
+    return parts[0] if parts else value.strip()
+
+
+def _parse_float(value: str) -> Optional[float]:
+    try:
+        return float(_strip_unit_suffix(value))
+    except (ValueError, TypeError):
+        return None
+
+
+def _parse_int(value: str) -> Optional[int]:
+    try:
+        return int(float(_strip_unit_suffix(value)))
+    except (ValueError, TypeError):
+        return None
+
+
+def parse_idf_report(text: Union[str, bytes]) -> Dict[str, Any]:
+    """
+    Parse a Thor IDFW.txt / IDFH.txt sidecar.
+
+    Returns a flat dict with two kinds of entries:
+
+      - **Raw fields** — every `Key : Value` line, keyed by snake_case
+        of the original key, value as a string (unit suffix preserved).
+        Lets callers grab any field we haven't explicitly normalised.
+
+      - **Derived fields** — a curated set with parsed types:
+          * `serial_number`     str
+          * `event_type`        str  ("Full Waveform" / "Full Histogram")
+          * `event_datetime`    ISO-8601 string ("YYYY-MM-DDTHH:MM:SS") when
+                                 both EventDate and EventTime are present
+          * `sample_rate`       int  (samples/sec)
+          * `tran_ppv`,`vert_ppv`,`long_ppv` float (in/s)
+          * `mic_ppv`           float (dB or psi — same units as MicPSPL)
+          * `peak_vector_sum`   float (in/s)
+          * `tran_zc_freq`,`vert_zc_freq`,`long_zc_freq` float (Hz)
+          * `record_time_sec`   float (seconds)
+          * `pre_trigger_sec`   float (seconds)
+          * `project`           str  (from TitleString1 — Thor's location)
+          * `client`            str  (TitleString2)
+          * `operator`          str  (TitleString3 — company/operator)
+          * `notes`             str  (TitleString4)
+          * `setup`             str
+          * `version`           str  (firmware)
+          * `battery_volts`     float
+          * `calibration_text`  str  (e.g. "November 22, 2023 by Instantel")
+          * `tran_test_passed`, `vert_test_passed`, `long_test_passed`,
+            `mic_test_passed`  bool  ("Passed" → True; anything else → False)
+          * `filename`          str  (FileName line — useful sanity check)
+
+    Stops parsing at the literal "Waveform Data Channels" line; the
+    raw-samples block is left to whoever wants to decode the binary.
+
+    Input may be `str` or `bytes` (`utf-8`/`latin-1` tolerant).
+    """
+    if isinstance(text, bytes):
+        try:
+            text = text.decode("utf-8")
+        except UnicodeDecodeError:
+            text = text.decode("latin-1", errors="replace")
+
+    raw: Dict[str, str] = {}
+
+    for line in text.splitlines():
+        stripped = line.strip()
+        if not stripped:
+            continue
+        if stripped.lower().startswith(_WAVEFORM_BLOCK_MARKER):
+            break
+        m = _LINE_RE.match(stripped)
+        if not m:
+            continue
+        key = _normalize_key(m.group(1))
+        value = m.group(2).strip()
+        # Multi-value lines (Channel, Units, etc.) — coalesce by appending.
+        if key in raw:
+            raw[key] = raw[key] + "; " + value
+        else:
+            raw[key] = value
+
+    out: Dict[str, Any] = dict(raw)  # keep all raw fields
+
+    # ── Derived fields ───────────────────────────────────────────────────────
+
+    def _take(*candidates: str) -> Optional[str]:
+        for c in candidates:
+            if c in raw:
+                return raw[c]
+        return None
+
+    # Event identity
+    if "serial_number" in raw:
+        out["serial_number"] = raw["serial_number"]
+    if "event_type" in raw:
+        out["event_type"] = raw["event_type"]
+    if "file_name" in raw:
+        out["filename"] = raw["file_name"]
+
+    # Combined date+time.  Waveform sidecars use "EventDate" / "EventTime";
+    # histogram sidecars use "HistogramStartDate" / "HistogramStartTime".
+    # Prefer the event_* names when both are present.
+    ed = raw.get("event_date") or raw.get("histogram_start_date")
+    et = raw.get("event_time") or raw.get("histogram_start_time")
+    if ed and et:
+        try:
+            dt = datetime.datetime.strptime(f"{ed} {et}", "%Y-%m-%d %H:%M:%S")
+            out["event_datetime"] = dt.isoformat()
+        except ValueError:
+            pass
+
+    # Numeric scalars
+    for key in ("sample_rate",):
+        v = raw.get(key)
+        if v is not None:
+            iv = _parse_int(v)
+            if iv is not None:
+                out[key] = iv
+
+    for key in ("tran_ppv", "vert_ppv", "long_ppv", "peak_vector_sum",
+                "tran_zc_freq", "vert_zc_freq", "long_zc_freq",
+                "tran_peak_acceleration", "vert_peak_acceleration",
+                "long_peak_acceleration",
+                "tran_peak_displacement", "vert_peak_displacement",
+                "long_peak_displacement",
+                "tran_time_of_peak", "vert_time_of_peak", "long_time_of_peak",
+                "mic_time_of_peak", "mic_zc_freq"):
+        v = raw.get(key)
+        if v is not None:
+            fv = _parse_float(v)
+            if fv is not None:
+                out[key] = fv
+
+    # Microphone — Thor reports MicPSPL (dB(L)) which is the closest
+    # analogue to BW's mic_ppv.  Stored as a float; units are in the
+    # original raw field (`mic_pspl` raw entry preserves "99.4 dB(L)").
+    mic = raw.get("mic_pspl")
+    if mic is not None:
+        fv = _parse_float(mic)
+        if fv is not None:
+            out["mic_ppv"] = fv
+
+    # Record / pre-trigger duration
+    rt = raw.get("record_time")
+    if rt is not None:
+        fv = _parse_float(rt)
+        if fv is not None:
+            out["record_time_sec"] = fv
+    pt = raw.get("pre_trigger_length")
+    if pt is not None:
+        fv = _parse_float(pt)
+        if fv is not None:
+            out["pre_trigger_sec"] = fv
+
+    # Project / client / operator / location strings.  Thor's title
+    # strings are operator-defined; conventional mapping (per Thor's
+    # default TitleNote labels in the example data):
+    #   TitleString1 = Location  → project (sensor location identifier)
+    #   TitleString2 = Client    → client
+    #   TitleString3 = Company   → operator (the monitoring company)
+    #   TitleString4 = Notes     → notes
+    out["project"]  = _take("title_string1")
+    out["client"]   = _take("title_string2")
+    out["operator"] = _take("title_string3", "operator")
+    out["notes"]    = _take("title_string4", "post_event_note")
+
+    if "setup" in raw:
+        out["setup"] = raw["setup"]
+    if "version" in raw:
+        out["version"] = raw["version"]
+
+    # Battery (e.g. "3.8 volts" → 3.8)
+    bl = raw.get("battery_level")
+    if bl is not None:
+        fv = _parse_float(bl)
+        if fv is not None:
+            out["battery_volts"] = fv
+
+    # Calibration line is free-form (e.g. "November 22, 2023 by Instantel").
+    if "calibration" in raw:
+        out["calibration_text"] = raw["calibration"]
+
+    # Sensor self-check results — bool flags
+    for key, out_key in (
+        ("tran_test_results", "tran_test_passed"),
+        ("vert_test_results", "vert_test_passed"),
+        ("long_test_results", "long_test_passed"),
+        ("mic_test_results",  "mic_test_passed"),
+    ):
+        v = raw.get(key)
+        if v is not None:
+            out[out_key] = v.strip().lower() == "passed"
+
+    return out
+
+
+def serial_from_filename(name: str) -> Optional[str]:
+    """Convenience: pull the serial prefix from a Thor event filename.
+
+    Thor uses the literal serial as the filename prefix:
+      UM11719_20231219163444.IDFW  →  "UM11719"
+      BE9439_20200713124251.IDFH   →  "BE9439"
+    """
+    m = re.match(r"^([A-Z]{2}\d+)_\d{14}\.(IDFH|IDFW)(?:\.txt)?$",
+                 name, re.IGNORECASE)
+    return m.group(1).upper() if m else None
+
+
+def parse_event_filename(name: str) -> Optional[Tuple[str, datetime.datetime, str]]:
+    """Parse `<SERIAL>_<YYYYMMDDHHMMSS>.<KIND>` → (serial, datetime, kind).
+
+    `kind` is "IDFH" or "IDFW" (upper-case).  Returns None on no match.
+    """
+    m = re.match(r"^([A-Z]{2}\d+)_(\d{14})\.(IDFH|IDFW)$",
+                 name, re.IGNORECASE)
+    if not m:
+        return None
+    try:
+        ts = datetime.datetime.strptime(m.group(2), "%Y%m%d%H%M%S")
+    except ValueError:
+        return None
+    return m.group(1).upper(), ts, m.group(3).upper()
@@ -2472,6 +2472,123 @@ def _serial_from_event(ev) -> Optional[str]:
    return None


+# ── /db/import/idf_file — ingest Thor (Series IV) IDF event files ────────────
+
+
+@app.post("/db/import/idf_file")
+async def db_import_idf_file(
+    files:  list[UploadFile] = File(...),
+    serial: Optional[str]    = Query(None, description="Optional serial-number hint (e.g. UM11719); falls back to the IDF filename's literal prefix when omitted"),
+) -> dict:
+    """
+    Multipart upload of one or more Thor (Micromate Series IV) IDF event
+    file binaries (`.IDFH` histogram, `.IDFW` waveform), typically
+    forwarded by `thor-watcher`'s SFM forwarder.
+
+    For each file:
+
+      1. Pair the binary with its `<binary>.txt` ASCII report when one
+         is present in the same upload.
+      2. Parse the report via `sfm.idf_ascii_report.parse_idf_report`
+         and copy the binary into the persistent store via
+         `WaveformStore.save_imported_idf`, writing a `.sfm.json`
+         sidecar with `source.kind = "idf-import"`.
+      3. Upsert a row into `events` (dedup'd on serial+timestamp).
+
+    **Paired Thor TXT reports.**  Thor's TXT exporter writes a
+    per-event ASCII report next to each binary as `<binary>.txt`
+    (e.g. `UM11719_20231219163444.IDFW` + `UM11719_20231219163444.IDFW.txt`).
+    The thor-watcher forwarder ships both files in a single multipart
+    upload.  If the report is present, its decoded fields (Tran/Vert/Long
+    PPV, ZC Freq, Peak Vector Sum, Mic PSPL, calibration, sensor
+    self-check results, project strings) land in the sidecar's
+    `extensions.idf_report` block and the SFM `events` row's
+    device-authoritative columns.
+
+    Pairing is by exact filename match (case-insensitive): a binary
+    named `foo.IDFW` is paired with a report named `foo.IDFW.txt` or
+    `foo.IDFW.TXT`.
+
+    Response includes per-file outcomes so the watcher can see which
+    landed cleanly and which failed (e.g. malformed file, unknown
+    serial, etc.).
+    """
+    store = _get_store()
+    db    = _get_db()
+    results: list[dict] = []
+
+    binaries: list[tuple[str, bytes]] = []
+    reports:  dict[str, bytes]        = {}   # keyed by lower-cased binary filename
+    for upload in files:
+        name = upload.filename or ""
+        try:
+            content = await upload.read()
+        except Exception as exc:
+            results.append({
+                "filename": name or "<unnamed>", "status": "error",
+                "detail":   f"read failed: {exc}",
+            })
+            continue
+
+        if name.lower().endswith(".txt"):
+            # Thor convention: <binary>.txt — strip the trailing ".txt"
+            # to recover the binary's filename.
+            stripped = name[:-4]
+            reports[stripped.lower()] = content
+        else:
+            binaries.append((name, content))
+
+    for filename, content in binaries:
+        report_bytes = reports.get(filename.lower())
+        try:
+            ev, rec = store.save_imported_idf(
+                content,
+                source_path=Path(filename or "imported.idf"),
+                serial_hint=serial,
+                idf_report_text=report_bytes,
+            )
+            resolved_serial = (
+                serial
+                or rec.get("serial")
+                or "UNKNOWN"
+            )
+            inserted, skipped = db.insert_events(
+                [ev],
+                serial=resolved_serial,
+                waveform_records={
+                    ev._waveform_key.hex(): rec
+                } if ev._waveform_key else None,
+            )
+            results.append({
+                "filename":         filename,
+                "status":           "ok",
+                "stored_filename":  rec["filename"],
+                "filesize":         rec["filesize"],
+                "sha256":           rec["sha256"],
+                "serial":           resolved_serial,
+                "report_attached":  report_bytes is not None,
+                "inserted":         inserted,
+                "skipped":          skipped,
+            })
+        except Exception as exc:
+            log.error("idf import failed for %s: %s", filename, exc, exc_info=True)
+            results.append({
+                "filename": filename, "status": "error",
+                "detail":   str(exc),
+            })
+
+    # Surface unmatched .txt uploads so the daemon can detect mis-pairings.
+    used_report_keys = {fn.lower() for fn, _ in binaries}
+    for stem in reports.keys() - used_report_keys:
+        results.append({
+            "filename":  stem + ".txt",
+            "status":    "warning",
+            "detail":    "Thor TXT report supplied but no matching binary in this upload",
+        })
+
+    return {"count": len(results), "results": results}
+
+
@app.get("/db/units/{serial}/waveforms.zip")
 def db_unit_waveforms_zip(
    serial:  str,
@@ -413,6 +413,179 @@ class WaveformStore:
            "serial":             serial,
        }

+    def save_imported_idf(
+        self,
+        idf_bytes: bytes,
+        source_path: Path,
+        *,
+        serial_hint: Optional[str] = None,
+        idf_report_text: Optional[Union[str, bytes]] = None,
+    ) -> tuple[Optional["Event"], dict]:
+        """
+        Ingest a Thor (Micromate Series IV) IDF event file (`.IDFW` or
+        `.IDFH`) produced by Thor's TXT exporter.
+
+        Thor binaries are stored as opaque bytes — seismo-relay doesn't
+        decode the proprietary IDF binary format.  Device-authoritative
+        metadata comes from the paired `.IDFW.txt` / `.IDFH.txt` sidecar
+        when supplied; we parse that text and surface its fields onto
+        the returned Event so the SFM database row has real PPV/project
+        values instead of NULLs.
+
+        Workflow:
+          1. Parse the paired TXT report (when supplied) via
+             `sfm.idf_ascii_report.parse_idf_report`.
+          2. Build a minimal `Event` populated from the report fields
+             (timestamp, peaks, project info, sample_rate, record_type).
+          3. Resolve serial from filename prefix or `serial_hint`.
+          4. Copy bytes verbatim into <root>/<serial>/<filename>.
+          5. Write the `.sfm.json` sidecar with source.kind = "idf-import".
+
+        Returns (event, record_dict) so the endpoint can both insert
+        into SeismoDb and surface the parsed event.
+        """
+        from sfm.idf_ascii_report import (
+            parse_idf_report,
+            parse_event_filename,
+            serial_from_filename as _idf_serial_from_filename,
+        )
+        from minimateplus.models import (
+            Event, PeakValues, ProjectInfo, Timestamp,
+        )
+
+        # Parse the .txt sidecar (best-effort; non-fatal on failure).
+        report: dict = {}
+        if idf_report_text is not None:
+            try:
+                report = parse_idf_report(idf_report_text)
+            except Exception as exc:
+                log.warning(
+                    "save_imported_idf: report parse failed: %s — continuing without it",
+                    exc,
+                )
+
+        # Resolve serial: prefer the explicit hint, fall back to filename prefix.
+        serial = (
+            serial_hint
+            or report.get("serial_number")
+            or _idf_serial_from_filename(source_path.name)
+            or "UNKNOWN"
+        )
+
+        # Resolve event timestamp + kind from the filename (always present).
+        parsed_name = parse_event_filename(source_path.name)
+        kind = "Waveform"
+        ts_dt: Optional[datetime.datetime] = None
+        if parsed_name is not None:
+            _, ts_dt, kind_token = parsed_name
+            kind = "Histogram" if kind_token == "IDFH" else "Waveform"
+        # Report's event_datetime is the device-authoritative value; prefer it.
+        if "event_datetime" in report:
+            try:
+                ts_dt = datetime.datetime.fromisoformat(report["event_datetime"])
+            except (TypeError, ValueError):
+                pass
+
+        ts_obj: Optional[Timestamp] = None
+        if ts_dt is not None:
+            ts_obj = Timestamp(
+                raw=bytes(9),
+                flag=0,
+                year=ts_dt.year,
+                unknown_byte=0,
+                month=ts_dt.month,
+                day=ts_dt.day,
+                hour=ts_dt.hour,
+                minute=ts_dt.minute,
+                second=ts_dt.second,
+            )
+
+        # Build PeakValues from the report (fields are None when absent).
+        pv = PeakValues(
+            tran=report.get("tran_ppv"),
+            vert=report.get("vert_ppv"),
+            long=report.get("long_ppv"),
+            micl=report.get("mic_ppv"),
+            peak_vector_sum=report.get("peak_vector_sum"),
+        )
+
+        # Build ProjectInfo.  See idf_ascii_report — Thor's title strings
+        # carry project / client / company / notes in TitleString1..4.
+        pi = ProjectInfo(
+            setup_name=report.get("setup"),
+            project=report.get("project"),
+            client=report.get("client"),
+            operator=report.get("operator"),
+            sensor_location=None,  # Thor folds location into TitleString1 = project
+            notes=report.get("notes"),
+        )
+
+        # Filesystem write.
+        filename = source_path.name
+        bw_path = self._serial_dir(serial) / filename
+        bw_path.write_bytes(idf_bytes)
+        filesize = bw_path.stat().st_size
+        sha256   = event_file_io.file_sha256(bw_path)
+
+        # _waveform_key dedups (serial, timestamp) rows in the events
+        # table.  Use the binary's sha256 (first 16 bytes) as a stable
+        # surrogate — every distinct binary maps to a distinct row.
+        waveform_key = bytes.fromhex(sha256)[:16]
+
+        ev = Event(
+            index=0,
+            timestamp=ts_obj,
+            sample_rate=report.get("sample_rate"),
+            peak_values=pv,
+            project_info=pi,
+            record_type=kind,
+            rectime_seconds=report.get("record_time_sec"),
+        )
+        ev._waveform_key = waveform_key
+
+        # Write the sidecar.  Source kind "idf-import" was added to the
+        # allow-list in event_file_io.event_to_sidecar_dict for this.
+        sidecar_path = self.sidecar_path_for(serial, filename)
+        existing_review = None
+        if sidecar_path.exists():
+            try:
+                existing_review = event_file_io.read_sidecar(sidecar_path).get("review")
+            except Exception:
+                pass
+
+        sidecar = event_file_io.event_to_sidecar_dict(
+            ev,
+            serial=serial,
+            blastware_filename=filename,
+            blastware_filesize=filesize,
+            blastware_sha256=sha256,
+            source_kind="idf-import",
+            a5_pickle_filename=None,
+            review=existing_review,
+        )
+        # Stash the full parsed IDF report under extensions so downstream
+        # consumers can recover the rich derived fields that don't fit
+        # the BW-shaped event model (Peak Acceleration / Displacement,
+        # Time of Peak, sensor self-check, calibration, firmware).
+        if report:
+            sidecar["extensions"]["idf_report"] = report
+        event_file_io.write_sidecar(sidecar_path, sidecar)
+
+        log.info(
+            "WaveformStore.save_imported_idf serial=%s filename=%s filesize=%d "
+            "report_attached=%s",
+            serial, filename, filesize, bool(report),
+        )
+        return ev, {
+            "filename":           filename,
+            "filesize":           filesize,
+            "sha256":             sha256,
+            "a5_pickle_filename": None,
+            "hdf5_filename":      None,
+            "sidecar_filename":   sidecar_path.name,
+            "serial":             serial,
+        }
+
    def load_a5(self, serial: str, filename: str) -> Optional[list[S3Frame]]:
        """
        Re-hydrate the pickled A5 frame stream for a stored event.