feat(admin): SFM + SLMM diagnostic pages under Developer settings

New /admin/sfm page (linked from Settings → Developer):
- Health banner — green/red with version + last-checked timestamp
- Connection panel — shows the SFM_BASE_URL that terra-view is configured with
- 4 KPI tiles — known units, total events, stale monitor_log rows, stale ach_sessions rows (the deprecated tables from the paused Python-ACH experiment, useful for confirming nothing's growing them)
- Per-unit roll-up table — serial, last_seen, event count, stale per-unit counts, sourced from SFM's /db/units
- Recent events with forwarding latency — color-coded gap between the event's recorded timestamp and SFM ingest time, so operators can spot watchers that are forwarding stale files (e.g. after a jobsite outage)
- Raw API tester — text input + GET button against any /api/sfm/* path, response rendered as prettified JSON

New /admin/slmm page — same layout, stripped down to health + connection + raw API tester. For per-device SLM control the existing /sound-level-meters dashboard remains the right entry point.

Backend (backend/routers/admin_modules.py):
- GET /admin/sfm, GET /admin/slmm — HTML pages
- GET /api/admin/sfm/overview — single aggregated probe that returns health, units, last 25 events with computed latency, stale-table counts, cache stats. Tolerant of partial failures: any sub-fetch error is captured into errors{} so a flaky SFM endpoint doesn't break the whole page
- GET /api/admin/slmm/overview — health + connection info only for now

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-14 17:53:43 +00:00
parent 155f0b007a
commit 904ff04440
5 changed files with 641 additions and 1 deletions
@@ -106,6 +106,9 @@ app.include_router(settings.router)
 from backend.routers import watcher_manager
 app.include_router(watcher_manager.router)

+from backend.routers import admin_modules
+app.include_router(admin_modules.router)
+
 # Projects system routers
 app.include_router(projects.router)
 app.include_router(project_locations.router)
@@ -0,0 +1,209 @@
+"""
+Admin / diagnostic pages for the device modules (SFM, SLMM).
+
+These pages live under /admin/{module} and exist purely so an operator can
+peek under the hood and confirm the module is reachable, what data it's
+holding, and whether the proxy from terra-view is healthy.
+
+Routes:
+  GET /admin/sfm   — SFM diagnostic page
+  GET /admin/slmm  — SLMM diagnostic page
+
+API helpers (called by the HTML pages via fetch):
+  GET /api/admin/sfm/overview   — aggregated SFM health + db stats in one call
+  GET /api/admin/slmm/overview  — SLMM health + connection info (device list not populated yet)
+
+The pages are intentionally read-only.  Any actual administration of SFM
+or SLMM happens in those modules directly.
+"""
+
+import logging
+import os
+from datetime import datetime, timezone
+from typing import Any, Dict
+
+import httpx
+from fastapi import APIRouter, Depends, Request
+from fastapi.responses import HTMLResponse, JSONResponse
+from sqlalchemy.orm import Session
+
+from backend.database import get_db
+from backend.templates_config import templates
+
+# Module-level logger, named after this module's import path.
+log = logging.getLogger(__name__)
+
+# Router that carries every admin / diagnostic route declared in this module.
+router = APIRouter()
+
+# Base URLs of the device modules.  Each one is read from the environment
+# variable of the same name and falls back to a localhost default.
+SFM_BASE_URL = os.environ.get("SFM_BASE_URL", "http://localhost:8200")
+SLMM_BASE_URL = os.environ.get("SLMM_BASE_URL", "http://localhost:8100")
+
+
+# ── SFM ───────────────────────────────────────────────────────────────────────
+
+
+@router.get("/admin/sfm", response_class=HTMLResponse)
+def admin_sfm_page(request: Request):
+    """Render the SFM diagnostic page.
+
+    The template receives the incoming request plus the SFM base URL this
+    module is configured with.
+    """
+    context = {"request": request, "sfm_base_url": SFM_BASE_URL}
+    return templates.TemplateResponse("admin_sfm.html", context)
+
+
+# Row limit used when re-querying a deprecated table for its total.  SFM only
+# reports count == total when the limit covers every row, so the reported
+# figure tops out at this value.
+_STALE_COUNT_LIMIT = 100000
+
+
+def _forwarding_latency_seconds(ts_str: Any, ca_str: Any):
+    """Return ``created_at - timestamp`` in seconds, or None.
+
+    None is returned when either value is missing, is not a string, or is
+    not a parseable ISO-8601 timestamp.  Values without an offset are
+    treated as UTC so the subtraction never mixes naive and aware datetimes.
+    """
+    if not (ts_str and ca_str):
+        return None
+    try:
+        ts = datetime.fromisoformat(ts_str.replace("Z", "+00:00"))
+        ca = datetime.fromisoformat(ca_str.replace("Z", "+00:00"))
+    except (ValueError, TypeError, AttributeError):
+        # One malformed event must not discard the whole events section.
+        return None
+    if ts.tzinfo is None:
+        ts = ts.replace(tzinfo=timezone.utc)
+    if ca.tzinfo is None:
+        ca = ca.replace(tzinfo=timezone.utc)
+    return (ca - ts).total_seconds()
+
+
+def _sfm_totals(overview: Dict[str, Any]) -> Dict[str, Any]:
+    """Build the ``totals`` section from an overview dict.
+
+    Rows that are not dicts and missing / null ``total_events`` values count
+    as zero, so a malformed unit row cannot turn the response into a 500.
+    """
+    units = overview["units"]
+    return {
+        "units":               len(units),
+        "events_total":        sum(
+            (u.get("total_events") or 0) for u in units if isinstance(u, dict)
+        ),
+        "stale_monitor_log":   overview["stale"]["monitor_log"],
+        "stale_sessions":      overview["stale"]["sessions"],
+    }
+
+
+@router.get("/api/admin/sfm/overview")
+async def admin_sfm_overview() -> JSONResponse:
+    """Aggregated SFM diagnostic snapshot.
+
+    Probes SFM's /health first.  When that request fails, or its payload
+    does not report ``status == "ok"``, the remaining probes are skipped.
+    Otherwise it fetches /db/units, the 25 newest /db/events (each annotated
+    with ``forwarding_latency_seconds``), row counts for the deprecated
+    /db/monitor_log and /db/sessions tables, and /cache/stats.
+
+    Tolerant of partial failures: any individual sub-fetch error is stored
+    as a string under ``errors`` and that section keeps its default value,
+    so a flaky sub-endpoint doesn't break the whole page.
+
+    Returns:
+        JSONResponse with the keys ``sfm_base_url``, ``checked_at``,
+        ``health``, ``reachable``, ``units``, ``events``, ``stale``,
+        ``cache_stats``, ``errors`` and ``totals``.  ``totals`` is present
+        on every path, including the early return for an unreachable SFM.
+    """
+    overview: Dict[str, Any] = {
+        "sfm_base_url": SFM_BASE_URL,
+        "checked_at":   datetime.now(timezone.utc).isoformat(),
+        "health":       None,
+        "reachable":    False,
+        "units":        [],
+        "events":       [],
+        "stale": {
+            "monitor_log": None,
+            "sessions":    None,
+        },
+        "cache_stats":  None,
+        "errors":       {},
+    }
+
+    async with httpx.AsyncClient(timeout=5.0) as client:
+        # Health
+        try:
+            r = await client.get(f"{SFM_BASE_URL}/health")
+            r.raise_for_status()
+            overview["health"]    = r.json()
+            overview["reachable"] = overview["health"].get("status") == "ok"
+        except Exception as e:  # noqa: BLE001
+            overview["errors"]["health"] = str(e)
+            overview["reachable"] = False
+
+        # If SFM is down, no point hitting the rest.  Still attach totals so
+        # the response has the same shape on both paths.
+        if not overview["reachable"]:
+            overview["totals"] = _sfm_totals(overview)
+            return JSONResponse(overview)
+
+        # Units
+        try:
+            r = await client.get(f"{SFM_BASE_URL}/db/units")
+            r.raise_for_status()
+            units = r.json() or []
+            if not isinstance(units, list):
+                # Anything but a list would break the totals roll-up below;
+                # report it as a units error instead.
+                raise TypeError(
+                    f"expected a list from /db/units, got {type(units).__name__}"
+                )
+            overview["units"] = units
+        except Exception as e:  # noqa: BLE001
+            overview["errors"]["units"] = str(e)
+
+        # Recent events (newest 25 — bigger sample of the call-home stream)
+        try:
+            r = await client.get(f"{SFM_BASE_URL}/db/events", params={"limit": 25})
+            r.raise_for_status()
+            payload = r.json() or {}
+            events = payload.get("events", []) or []
+            for ev in events:
+                if not isinstance(ev, dict):
+                    continue
+                # Strip the waveform blob and pickle filename before the
+                # event is returned to the page.
+                ev.pop("waveform_blob", None)
+                ev.pop("a5_pickle_filename", None)
+                # Forwarding latency: created_at (SFM ingest) − timestamp (event).
+                ev["forwarding_latency_seconds"] = _forwarding_latency_seconds(
+                    ev.get("timestamp"), ev.get("created_at")
+                )
+            overview["events"] = events
+        except Exception as e:  # noqa: BLE001
+            overview["errors"]["events"] = str(e)
+
+        # Stale tables (deprecated by the watcher-forward pipeline but still
+        # present in SFM's SQLite).  Surface as counts only.
+        for key, path in (("monitor_log", "/db/monitor_log"),
+                          ("sessions",    "/db/sessions")):
+            try:
+                # Cheap probe first: limit=1 only tells us whether the
+                # endpoint reports a count at all.
+                r = await client.get(f"{SFM_BASE_URL}{path}", params={"limit": 1})
+                r.raise_for_status()
+                payload = r.json() or {}
+                if payload.get("count") is None:
+                    overview["stale"][key] = None
+                    continue
+                # Re-query with a high limit to get the true total.
+                r2 = await client.get(
+                    f"{SFM_BASE_URL}{path}", params={"limit": _STALE_COUNT_LIMIT}
+                )
+                r2.raise_for_status()
+                overview["stale"][key] = (r2.json() or {}).get("count")
+            except Exception as e:  # noqa: BLE001
+                overview["errors"][f"stale_{key}"] = str(e)
+
+        # Cache stats (in-memory device cache on SFM)
+        try:
+            r = await client.get(f"{SFM_BASE_URL}/cache/stats")
+            r.raise_for_status()
+            overview["cache_stats"] = r.json()
+        except Exception as e:  # noqa: BLE001
+            overview["errors"]["cache_stats"] = str(e)
+
+    # Aggregate counts the UI can render without re-walking arrays
+    overview["totals"] = _sfm_totals(overview)
+
+    return JSONResponse(overview)
+
+
+# ── SLMM ──────────────────────────────────────────────────────────────────────
+
+
+@router.get("/admin/slmm", response_class=HTMLResponse)
+def admin_slmm_page(request: Request):
+    """Render the SLMM diagnostic page.
+
+    The template receives the incoming request plus the SLMM base URL this
+    module is configured with.
+    """
+    context = {"request": request, "slmm_base_url": SLMM_BASE_URL}
+    return templates.TemplateResponse("admin_slmm.html", context)
+
+
+@router.get("/api/admin/slmm/overview")
+async def admin_slmm_overview() -> JSONResponse:
+    """Aggregated SLMM diagnostic snapshot.
+
+    Issues a single GET to SLMM's /health.  On success the parsed payload is
+    stored under ``health`` and ``reachable`` becomes True; on any failure
+    the error text is stored under ``errors["health"]`` and ``reachable``
+    stays False.  ``devices`` is always returned as an empty list.
+    """
+    snapshot: Dict[str, Any] = {
+        "slmm_base_url": SLMM_BASE_URL,
+        "checked_at":    datetime.now(timezone.utc).isoformat(),
+        "health":        None,
+        "reachable":     False,
+        "devices":       [],
+        "errors":        {},
+    }
+
+    async with httpx.AsyncClient(timeout=5.0) as http:
+        try:
+            health_resp = await http.get(f"{SLMM_BASE_URL}/health")
+            health_resp.raise_for_status()
+            snapshot["health"] = health_resp.json()
+            snapshot["reachable"] = True
+        except Exception as exc:  # noqa: BLE001
+            snapshot["errors"]["health"] = str(exc)
+
+    # No device roster is probed yet.  SLMM exposes per-unit config + status
+    # under /api/nl43/* but no "list all devices" endpoint, so a roster
+    # would have to come from terra-view's RosterUnit table.  For now only
+    # the health payload is surfaced; per-device details live on
+    # /sound-level-meters.
+    return JSONResponse(snapshot)