2026-06-22 18:07:37 -04:00
8 changed files with 1066 additions and 21 deletions
@@ -0,0 +1,244 @@
 """
 Threshold alert engine.
 Each unit can have any number of AlertRules. A rule is evaluated against the
 unit's live monitor snapshots via a small per-(unit, rule) state machine:
    IDLE  --(metric exceeds threshold for duration_s)-->  ACTIVE   (fire ONSET)
    ACTIVE --(metric recovers past hysteresis for duration_s)--> IDLE (fire CLEAR)
 duration_s debounces both edges; clear_margin_db adds hysteresis so a level
 hovering at the threshold doesn't flap. Onset and clear are distinct events.
 The state-machine logic (`_evaluate_step`) is intentionally pure — no DB, no
 real clock — so it can be unit-tested with a synthetic level series and a fake
 clock. The AlertEvaluator wraps it with rule loading, scheduling, persistence,
 and dispatch. Dispatch is a server log for now (POC); the seam to POST events to
 a Terra-View webhook (email/SMS) is _dispatch().
 """
 import asyncio
 import logging
 import os
 from dataclasses import dataclass
 from datetime import datetime, timedelta
 from typing import Dict, List, Optional, Tuple
 logger = logging.getLogger(__name__)
 # Local timezone offset for schedule windows (same env var services.py uses).
 # Parsed once at import time as a fixed number of hours added to UTC; a
 # non-numeric TIMEZONE_OFFSET value makes float() raise ValueError on import.
 # Being a single constant, it does not follow daylight-saving changes.
 _TZ_OFFSET_HOURS = float(os.getenv("TIMEZONE_OFFSET", "-5"))
 # How long to cache a unit's rules before re-querying the DB (rules change rarely).
 # Compared against the asyncio event-loop clock, in seconds.
 _RULE_CACHE_TTL_S = 15.0
@dataclass
 class RuleState:
    """In-memory runtime state for one (unit, rule).

    Mutated in place by `_evaluate_step`; the evaluator additionally stores the
    id of the open AlertEvent row in `event_id`.
    """
    # Current phase of the state machine.
    phase: str = "idle"                 # "idle" | "active"
    # Start of the pending transition (onset while idle, clear while active);
    # None when the edge condition is not currently met.
    edge_since: Optional[float] = None  # when the current edge condition began (clock time)
    # Most extreme value seen since onset (min for "below" rules, max otherwise).
    peak: float = 0.0
    event_id: Optional[int] = None      # the open AlertEvent row (for the clear update)
 def _exceeds(value: float, rule) -> bool:
    if rule.comparison == "below":
        return value < rule.threshold_db
    return value > rule.threshold_db
 def _recovered(value: float, rule) -> bool:
    margin = rule.clear_margin_db or 0.0
    if rule.comparison == "below":
        return value > rule.threshold_db + margin
    return value < rule.threshold_db - margin
 def _evaluate_step(state: RuleState, value: float, now: float, rule) -> Optional[str]:
    """Feed one reading into the per-(unit, rule) state machine.

    Mutates `state` in place and reports the transition, if any: 'onset' when
    an idle rule has been exceeded for `duration_s`, 'clear' when an active rule
    has stayed recovered for `duration_s`, otherwise None. The caller supplies
    `now`, so no real clock is read here.
    """
    hold_s = rule.duration_s or 0

    if state.phase == "idle":
        if not _exceeds(value, rule):
            # Condition broken: restart the onset debounce.
            state.edge_since = None
            return None
        if state.edge_since is None:
            state.edge_since = now
        if not (now - state.edge_since >= hold_s):
            return None
        state.phase = "active"
        state.edge_since = None
        state.peak = value
        return "onset"

    # Active: keep tracking the most extreme value in the rule's direction.
    pick = min if rule.comparison == "below" else max
    state.peak = pick(state.peak, value)

    if not _recovered(value, rule):
        # Still inside (or beyond) the hysteresis band: restart the clear debounce.
        state.edge_since = None
        return None
    if state.edge_since is None:
        state.edge_since = now
    if not (now - state.edge_since >= hold_s):
        return None
    state.phase = "idle"
    state.edge_since = None
    return "clear"
 def _in_window(now_minutes: int, start: str, end: str) -> bool:
    """Is now_minutes (minutes since local midnight) within [start, end)?
    Handles wraparound windows like 22:00–07:00."""
    def _m(s: str) -> int:
        h, m = s.split(":")
        return int(h) * 60 + int(m)
    s, e = _m(start), _m(end)
    if s == e:
        return True
    if s < e:
        return s <= now_minutes < e
    return now_minutes >= s or now_minutes < e  # wraparound
 class AlertEvaluator:
    """Rule-driven alert engine around the pure `_evaluate_step` state machine.

    Adds cached rule loading, schedule gating, AlertEvent persistence and
    dispatch. Runtime state lives in memory only, keyed by (unit_id, rule.id).
    """
    def __init__(self):
        self._states: Dict[Tuple[str, int], RuleState] = {}
        self._rule_cache: Dict[str, Tuple[float, list]] = {}  # unit_id -> (fetched_at, rules)
        logger.info("[ALERT] rule-based evaluator ready")
    async def evaluate(self, unit_id: str, snap) -> None:
        """Evaluate every enabled rule for this unit against one snapshot.

        Each rule is evaluated in isolation: if one rule raises (for example a
        malformed schedule string makes the schedule check raise ValueError),
        the failure is logged and the remaining rules are still evaluated.
        """
        rules = self._get_rules(unit_id)
        if not rules:
            return
        now = asyncio.get_running_loop().time()
        for rule in rules:
            try:
                await self._evaluate_rule(unit_id, rule, snap, now)
            except Exception as e:
                # One broken rule must not silence every other rule on the unit.
                logger.warning(
                    f"[ALERT] rule {getattr(rule, 'id', '?')} for {unit_id} failed: {e}"
                )
    async def _evaluate_rule(self, unit_id: str, rule, snap, now: float) -> None:
        """Run one rule against one snapshot and handle any onset/clear transition."""
        if not self._in_schedule(rule):
            return
        raw = getattr(snap, rule.metric, None)
        try:
            value = float(raw)
        except (TypeError, ValueError):
            return  # missing / non-numeric ("-.-")
        state = self._states.setdefault((unit_id, rule.id), RuleState())
        action = _evaluate_step(state, value, now, rule)
        if action == "onset":
            await self._on_onset(unit_id, rule, value, state)
        elif action == "clear":
            await self._on_clear(unit_id, rule, value, state)
    # -- rule loading (cached) ----------------------------------------------
    def _get_rules(self, unit_id: str) -> list:
        """Return the unit's enabled rules, re-querying after `_RULE_CACHE_TTL_S`.

        Must be called from a running event loop: the loop clock timestamps the
        cache entries.
        """
        loop_now = asyncio.get_running_loop().time()
        cached = self._rule_cache.get(unit_id)
        if cached and loop_now - cached[0] < _RULE_CACHE_TTL_S:
            return cached[1]
        rules = self._load_rules(unit_id)
        self._rule_cache[unit_id] = (loop_now, rules)
        return rules
    def _load_rules(self, unit_id: str) -> list:
        """Query the enabled AlertRule rows for `unit_id`; returns [] on any DB error."""
        from app.database import SessionLocal
        from app.models import AlertRule
        db = SessionLocal()
        try:
            return db.query(AlertRule).filter_by(unit_id=unit_id, enabled=True).all()
        except Exception as e:
            logger.warning(f"[ALERT] failed to load rules for {unit_id}: {e}")
            return []
        finally:
            db.close()
    def invalidate(self, unit_id: Optional[str] = None) -> None:
        """Drop cached rules so a change is picked up immediately.

        With no `unit_id` the whole cache is cleared; otherwise only that unit's
        entry is removed (a no-op if it is not cached).
        """
        if unit_id is None:
            self._rule_cache.clear()
        else:
            self._rule_cache.pop(unit_id, None)
    # -- scheduling ----------------------------------------------------------
    def _in_schedule(self, rule) -> bool:
        """Is the rule inside its configured day/time-of-day scope right now?

        Without both `schedule_start` and `schedule_end` only the weekday filter
        applies. Local time is UTC shifted by `_TZ_OFFSET_HOURS`.
        """
        if not rule.schedule_start or not rule.schedule_end:
            return self._day_ok(rule)
        local = datetime.utcnow() + timedelta(hours=_TZ_OFFSET_HOURS)
        if not self._day_ok(rule, local):
            return False
        return _in_window(local.hour * 60 + local.minute, rule.schedule_start, rule.schedule_end)
    @staticmethod
    def _day_ok(rule, local: Optional[datetime] = None) -> bool:
        """Is `local`'s weekday allowed by the rule's `schedule_days` CSV (Mon=0)?

        An empty/None `schedule_days` allows every day. `local` defaults to the
        current offset-adjusted time. Raises ValueError on a non-integer entry.
        """
        if not rule.schedule_days:
            return True
        if local is None:
            local = datetime.utcnow() + timedelta(hours=_TZ_OFFSET_HOURS)
        allowed = {int(d) for d in str(rule.schedule_days).split(",") if d.strip() != ""}
        return local.weekday() in allowed  # Mon=0
    # -- event persistence + dispatch ---------------------------------------
    async def _on_onset(self, unit_id: str, rule, value: float, state: RuleState) -> None:
        """Persist a new 'active' AlertEvent, remember its id, then dispatch ONSET.

        A persistence failure is logged and does not prevent the dispatch.
        """
        from app.database import SessionLocal
        from app.models import AlertEvent
        db = SessionLocal()
        try:
            evt = AlertEvent(
                rule_id=rule.id, unit_id=unit_id, rule_name=rule.name,
                metric=rule.metric, threshold_db=rule.threshold_db,
                onset_value=value, peak_value=value, status="active",
            )
            db.add(evt)
            db.commit()
            db.refresh(evt)
            state.event_id = evt.id
        except Exception as e:
            logger.warning(f"[ALERT] failed to record onset for {unit_id}: {e}")
        finally:
            db.close()
        await self._dispatch(
            "ONSET", unit_id, rule,
            f"{rule.metric.upper()}={value:.1f} dB "
            f"{'<' if rule.comparison == 'below' else '>'} {rule.threshold_db:.1f} dB"
            f"{f' for {rule.duration_s}s' if rule.duration_s else ''}",
        )
    async def _on_clear(self, unit_id: str, rule, value: float, state: RuleState) -> None:
        """Close the open AlertEvent (if one was recorded) and dispatch CLEAR.

        The event row gets its clear time, the peak tracked in `state`, and
        status 'cleared'. `state.event_id` is reset even if the update fails.
        """
        peak = state.peak
        from app.database import SessionLocal
        from app.models import AlertEvent
        db = SessionLocal()
        try:
            if state.event_id is not None:
                evt = db.query(AlertEvent).filter_by(id=state.event_id).first()
                if evt:
                    evt.clear_at = datetime.utcnow()
                    evt.peak_value = peak
                    evt.status = "cleared"
                    db.commit()
        except Exception as e:
            logger.warning(f"[ALERT] failed to record clear for {unit_id}: {e}")
        finally:
            db.close()
        state.event_id = None
        await self._dispatch(
            "CLEAR", unit_id, rule,
            f"recovered to {value:.1f} dB (peak {peak:.1f} dB)",
        )
    async def _dispatch(self, kind: str, unit_id: str, rule, detail: str) -> None:
        """POC dispatch: server log. Swap in a Terra-View webhook (email/SMS) here."""
        logger.warning(f"[ALERT:{kind}] {unit_id} '{rule.name}': {detail}")
 # Module-level singleton (the monitor calls alert_evaluator.evaluate per snapshot)
 # Built at import time; the constructor only creates empty in-memory maps and
 # logs one line, so importing this module performs no DB access.
 alert_evaluator = AlertEvaluator()
@@ -8,6 +8,7 @@ for fast API access without querying devices on every request.
 import asyncio
 import logging
 import os
 from datetime import datetime, timedelta
 from typing import Optional
@@ -20,6 +21,11 @@ from app.device_logger import log_device_event, cleanup_old_logs
 logger = logging.getLogger(__name__)
 # Global polling default. Set SLMM_POLLING_ENABLED=false to start an instance in
 # standby (running but not polling and not holding device connections) — e.g. a
 # dev box that must not latch onto a device that a prod instance owns.
 # Only the (case-insensitive) string "true" enables polling; any other value,
 # including "1" or "yes", starts the instance in standby.
 POLLING_ENABLED_DEFAULT = os.getenv("SLMM_POLLING_ENABLED", "true").lower() == "true"
 class BackgroundPoller:
    """
@@ -39,6 +45,7 @@ class BackgroundPoller:
        self._logger = logger
        self._last_cleanup = None  # Track last log cleanup time
        self._last_pool_log = None  # Track last connection pool heartbeat log
        self._active = POLLING_ENABLED_DEFAULT  # Global polling on/off (standby toggle)
    async def start(self):
        """Start the background polling task."""
@@ -71,15 +78,48 @@ class BackgroundPoller:
        self._logger.info("Background poller stopped")
    def is_active(self) -> bool:
        """Whether background polling is currently active (vs standby).

        Returns the runtime flag that `set_active` toggles.
        """
        return self._active
    async def set_active(self, active: bool):
        """Switch global polling on or off while the process keeps running.

        Turning polling off leaves the poll loop alive but idle and closes every
        pooled device connection, so this SLMM instance no longer occupies the
        devices' single connection slots (e.g. so a prod instance can take
        over). The flag is runtime-only: after a restart the instance is back
        at its SLMM_POLLING_ENABLED default.
        """
        self._active = active
        if not active:
            self._logger.info("[SYSTEM] Background polling DEACTIVATED (standby) — releasing connections")
            await self._release_all_connections()
            return
        self._logger.info("[SYSTEM] Background polling ACTIVATED")
    async def _release_all_connections(self):
        """Discard every connection currently listed by the pool (no-op if none)."""
        from app.services import _connection_pool
        pooled = _connection_pool.get_stats().get("connections", {})
        # Snapshot the keys first: discarding may change the pool while we iterate.
        for key in tuple(pooled):
            await _connection_pool.discard(key)
    async def _poll_loop(self):
        """Main polling loop that runs continuously."""
        self._logger.info("Background polling loop started")
        while self._running:
            if self._active:
                try:
                    await self._poll_all_devices()
                except Exception as e:
                    self._logger.error(f"Error in poll loop: {e}", exc_info=True)
            else:
                # Standby: poll nothing, and keep holding no device connection slots
                # so another SLMM instance (e.g. prod) can talk to the devices.
                try:
                    await self._release_all_connections()
                except Exception as e:
                    self._logger.warning(f"Standby connection release failed: {e}")
            # Run log cleanup once per hour
            try:
@@ -138,10 +178,19 @@ class BackgroundPoller:
            now = datetime.utcnow()
            polled_count = 0
            from app.monitor import monitor_manager
            for cfg in configs:
                if not self._running:
                    break
                # Skip units with an active live monitor: it polls them at ~1Hz and
                # keeps the status cache fresh, so a redundant background poll would just
                # add load/lock-contention on the device's single connection.
                if monitor_manager.is_active(cfg.unit_id):
                    self._logger.debug(f"Skipping {cfg.unit_id} — live monitor active")
                    continue
                # Get current status
                status = db.query(NL43Status).filter_by(unit_id=cfg.unit_id).first()
@@ -1,4 +1,4 @@
-from sqlalchemy import Column, String, DateTime, Boolean, Integer, Text, func
+from sqlalchemy import Column, String, DateTime, Boolean, Integer, Float, Text, func
 from app.database import Base
@@ -41,6 +41,8 @@ class NL43Status(Base):
    lmax = Column(String, nullable=True)  # Maximum level
    lmin = Column(String, nullable=True)  # Minimum level
    lpeak = Column(String, nullable=True)  # Peak level
    ln1 = Column(String, nullable=True)  # Percentile slot LN1 (configurable; device default L5, contract L1)
    ln2 = Column(String, nullable=True)  # Percentile slot LN2 (configurable; device default L10)
    battery_level = Column(String, nullable=True)
    power_source = Column(String, nullable=True)
    sd_remaining_mb = Column(String, nullable=True)
@@ -72,3 +74,53 @@ class DeviceLog(Base):
    level = Column(String, default="INFO")  # DEBUG, INFO, WARNING, ERROR
    category = Column(String, default="GENERAL")  # TCP, FTP, POLL, COMMAND, STATE, SYNC
    message = Column(Text, nullable=False)
 class AlertRule(Base):
    """A threshold-alert rule evaluated against a unit's live monitor feed.
    Source-agnostic: today it runs over the DOD monitor; the same rule transfers
    unchanged if a unit's feed is later sourced from FTP intervals.
    """
    __tablename__ = "alert_rules"
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Unit the rule belongs to (indexed; no foreign-key constraint is declared).
    unit_id = Column(String, index=True, nullable=False)
    name = Column(String, nullable=False, default="Alert")
    # Name of the snapshot attribute the rule reads.
    metric = Column(String, nullable=False, default="lp")  # lp/leq/lmax/lmin/lpeak/ln1/ln2
    comparison = Column(String, nullable=False, default="above")  # above | below
    threshold_db = Column(Float, nullable=False)
    duration_s = Column(Integer, nullable=False, default=0)       # sustained seconds (0 = instant)
    clear_margin_db = Column(Float, nullable=False, default=2.0)  # hysteresis band
    # NOTE(review): the evaluator code visible alongside this model does not
    # appear to read cooldown_s — confirm whether the cooldown is enforced.
    cooldown_s = Column(Integer, nullable=False, default=300)     # min seconds between onsets
    # Optional time-of-day scoping (local time). schedule_start/end as "HH:MM";
    # null = always active. schedule_days = CSV of 0-6 (Mon=0); null = every day.
    schedule_start = Column(String, nullable=True)
    schedule_end = Column(String, nullable=True)
    schedule_days = Column(String, nullable=True)
    channels = Column(String, nullable=False, default="log")  # CSV: log,email,sms
    recipients = Column(Text, nullable=True)                  # CSV of emails/phones
    # Disabled rules are kept in the table; nullable is not set here, so NULL is allowed.
    enabled = Column(Boolean, default=True)
    created_at = Column(DateTime, default=func.now())
 class AlertEvent(Base):
    """A fired alert (onset → clear), for history / inbox / acknowledgement."""
    __tablename__ = "alert_events"
    id = Column(Integer, primary_key=True, autoincrement=True)
    # Plain integer reference to the rule (no foreign-key constraint is declared).
    rule_id = Column(Integer, index=True, nullable=False)
    unit_id = Column(String, index=True, nullable=False)
    # rule_name, metric and threshold_db are stored on the event row itself
    # rather than being read back from the rule.
    rule_name = Column(String, nullable=True)
    metric = Column(String, nullable=False)
    threshold_db = Column(Float, nullable=False)
    # Defaults to the database's now() when the row is inserted.
    onset_at = Column(DateTime, default=func.now(), index=True)
    onset_value = Column(Float, nullable=True)
    peak_value = Column(Float, nullable=True)
    # Null while the alert is still open.
    clear_at = Column(DateTime, nullable=True)
    status = Column(String, default="active")  # active | cleared
    # Acknowledgement bookkeeping; all null until someone acknowledges the event.
    acknowledged_at = Column(DateTime, nullable=True)
    acknowledged_by = Column(String, nullable=True)
    notes = Column(Text, nullable=True)
@@ -0,0 +1,229 @@
 """
 Per-device live monitor (fan-out hub).
 ONE DOD poll loop per device, broadcast to many subscribers:
 - browser WebSocket clients (live view) — they no longer each open their own
  device stream, so the NL43's single-connection limit stops causing the
  "second viewer sees nothing" contention.
 - the alert evaluator (threshold alerts), which can keep a device's feed running
  even with no browser attached.
 - persistence (each snapshot is written to NL43Status, like the poller does).
 The device's one TCP connection is respected: every poll goes through the same
 per-device lock + connection pool in services.py, so the monitor, the background
 poller, and on-demand commands all serialize safely.
 """
 import asyncio
 import logging
 import os
 from datetime import datetime
 from typing import Dict, Optional, Set
 from app.database import SessionLocal
 from app.models import NL43Config, NL43Status
 from app.services import NL43Client, persist_snapshot
 from app.alerts import alert_evaluator
 logger = logging.getLogger(__name__)
 # Sleep between DOD polls. Note the 1s device rate-limit (and DOD?+Measure? per
 # poll) already paces the effective rate to a few seconds; this is the extra idle.
 # Seconds; parsed at import time, so a non-numeric value raises ValueError on import.
 MONITOR_POLL_INTERVAL = float(os.getenv("MONITOR_POLL_INTERVAL", "1.0"))
 # If nothing has been broadcast in this many seconds (e.g. device offline and
 # silent), send a keepalive frame so reverse proxies don't drop the idle WS.
 # Seconds; parsed at import time like the interval above.
 MONITOR_HEARTBEAT_S = float(os.getenv("MONITOR_HEARTBEAT_S", "25"))
 def _snapshot_payload(snap, unit_id: str, measurement_start_time) -> dict:
    """Build the broadcast payload — same shape as the DRD stream, but DOD-sourced
    so it carries ln1/ln2 (which DRD cannot)."""
    return {
        "unit_id": unit_id,
        "timestamp": datetime.utcnow().isoformat(),
        "measurement_state": snap.measurement_state,
        "measurement_start_time": measurement_start_time,
        "counter": snap.counter,
        "lp": snap.lp,
        "leq": snap.leq,
        "lmax": snap.lmax,
        "lmin": snap.lmin,
        "lpeak": snap.lpeak,
        "ln1": snap.ln1,
        "ln2": snap.ln2,
        "raw_payload": snap.raw_payload,
    }
 class DeviceMonitor:
    """Owns a single DOD poll loop for one device and fans each snapshot out to
    all subscribers. Runs while it has at least one browser subscriber OR the
    server-side keep-alive (alerting) flag is set.

    Each subscriber is a bounded asyncio.Queue; frames are dropped for a
    subscriber whose queue is full rather than blocking the poll loop.
    """
    def __init__(self, unit_id: str):
        self.unit_id = unit_id
        self._subscribers: Set[asyncio.Queue] = set()
        self._keepalive = False
        self._task: Optional[asyncio.Task] = None
        # Guards subscriber/keep-alive changes and the decision to start the task.
        self._lock = asyncio.Lock()
        self._last_payload: Optional[dict] = None  # replayed to new subscribers
        # Consecutive failed polls; reset to 0 on the next successful poll.
        self._consec_fail = 0
        self._reachable = True  # last broadcast reachability (for transition frames)
@property
    def running(self) -> bool:
        """True while the poll task exists and has not finished."""
        return self._task is not None and not self._task.done()
    def subscriber_count(self) -> int:
        """Number of currently attached subscriber queues."""
        return len(self._subscribers)
    def _has_demand(self) -> bool:
        """True if anything still wants the feed: a subscriber or the keep-alive flag."""
        return bool(self._subscribers) or self._keepalive
    def _ensure_task(self) -> None:
        """Start the poll task unless one is already running."""
        if self._task is None or self._task.done():
            self._task = asyncio.create_task(self._run())
    async def subscribe(self) -> asyncio.Queue:
        """Register a new subscriber and return its queue (at most 5 pending frames).

        Starts the poll task if it is not already running.
        """
        q: asyncio.Queue = asyncio.Queue(maxsize=5)
        async with self._lock:
            self._subscribers.add(q)
            # Replay the last frame so a client connecting mid-stream sees data
            # (or the current 'unreachable' state) immediately, not after a poll.
            if self._last_payload is not None:
                try:
                    q.put_nowait(self._last_payload)
                except asyncio.QueueFull:
                    pass
            self._ensure_task()
        return q
    async def unsubscribe(self, q: asyncio.Queue) -> None:
        """Detach a subscriber queue (no-op if it is not registered).

        The task is not cancelled here; the poll loop re-checks demand at the
        top of each iteration and exits on its own.
        """
        async with self._lock:
            self._subscribers.discard(q)
    async def set_keepalive(self, on: bool) -> None:
        """Set the keep-alive flag; turning it on also starts the feed if needed."""
        async with self._lock:
            self._keepalive = on
            if on:
                self._ensure_task()
    async def _run(self) -> None:
        """Poll loop: poll, broadcast, evaluate alerts, sleep — while demand exists."""
        logger.info(f"[MONITOR] {self.unit_id}: feed started")
        loop = asyncio.get_running_loop()
        # Loop-clock time of the most recent broadcast (drives the heartbeat).
        last_send = loop.time()
        try:
            while self._has_demand():
                snap, mst = await self._poll_once()
                if snap is not None:
                    self._consec_fail = 0
                    self._reachable = True
                    payload = _snapshot_payload(snap, self.unit_id, mst)
                    payload["feed_status"] = "ok"
                    self._broadcast(payload)
                    last_send = loop.time()
                    # Alert evaluation failures are logged and never stop the feed.
                    try:
                        await alert_evaluator.evaluate(self.unit_id, snap)
                    except Exception as e:
                        logger.warning(f"[MONITOR] {self.unit_id}: alert eval failed: {e}")
                else:
                    # Tell clients the device went offline — once, on transition, after a
                    # few failures so a momentary blip doesn't flap the UI.
                    self._consec_fail += 1
                    if self._reachable and self._consec_fail >= 3:
                        self._reachable = False
                        self._broadcast({
                            "unit_id": self.unit_id,
                            "timestamp": datetime.utcnow().isoformat(),
                            "feed_status": "unreachable",
                        })
                        last_send = loop.time()
                # Heartbeat: during quiet/offline stretches, send a keepalive so an
                # idle WS isn't dropped by a reverse proxy. Not cached (new subscribers
                # should still get the last real frame, not a heartbeat).
                if loop.time() - last_send >= MONITOR_HEARTBEAT_S:
                    self._broadcast({
                        "unit_id": self.unit_id,
                        "timestamp": datetime.utcnow().isoformat(),
                        "feed_status": "ok" if self._reachable else "unreachable",
                        "heartbeat": True,
                    }, cache=False)
                    last_send = loop.time()
                await asyncio.sleep(MONITOR_POLL_INTERVAL)
        finally:
            logger.info(f"[MONITOR] {self.unit_id}: feed stopped")
    async def _poll_once(self):
        """One DOD poll: read, persist, return (snapshot, measurement_start_iso).

        Returns (None, None) when the unit has no config, TCP is disabled for
        it, or anything in the poll raises (the error is logged).
        """
        db = SessionLocal()
        try:
            cfg = db.query(NL43Config).filter_by(unit_id=self.unit_id).first()
            if not cfg or not cfg.tcp_enabled:
                return None, None
            client = NL43Client(
                cfg.host, cfg.tcp_port,
                ftp_username=cfg.ftp_username, ftp_password=cfg.ftp_password,
                ftp_port=cfg.ftp_port or 21,
            )
            snap = await client.request_dod()
            snap.unit_id = self.unit_id
            persist_snapshot(snap, db)
            db.commit()
            # Read the status row back to pick up the measurement start time.
            status = db.query(NL43Status).filter_by(unit_id=self.unit_id).first()
            mst = (status.measurement_start_time.isoformat()
                   if status and status.measurement_start_time else None)
            return snap, mst
        except Exception as e:
            logger.warning(f"[MONITOR] {self.unit_id}: poll failed: {e}")
            return None, None
        finally:
            db.close()
    def _broadcast(self, payload: dict, cache: bool = True) -> None:
        """Queue `payload` for every subscriber without blocking.

        With `cache` true the payload also becomes the frame replayed to
        subscribers that attach later.
        """
        if cache:
            self._last_payload = payload  # replayed to new subscribers
        for q in list(self._subscribers):
            try:
                q.put_nowait(payload)
            except asyncio.QueueFull:
                # Slow consumer — drop this frame rather than stall the whole feed.
                pass
 class MonitorManager:
    """Keeps exactly one DeviceMonitor per unit_id and hands it out on request."""
    def __init__(self):
        self._monitors: Dict[str, DeviceMonitor] = {}
        self._lock = asyncio.Lock()
    async def get(self, unit_id: str) -> DeviceMonitor:
        """Return the unit's monitor, creating and registering it on first use."""
        async with self._lock:
            monitor = self._monitors.get(unit_id)
            if monitor is None:
                monitor = self._monitors[unit_id] = DeviceMonitor(unit_id)
            return monitor
    def is_active(self, unit_id: str) -> bool:
        """Report whether the unit has a monitor whose feed is currently running.

        The background poller uses this to skip units the monitor already
        polls more often.
        """
        monitor = self._monitors.get(unit_id)
        if monitor is None:
            return False
        return monitor.running
    def status(self) -> dict:
        """Summarise every registered monitor as unit_id -> running/subscribers/keepalive."""
        report = {}
        for uid, monitor in self._monitors.items():
            report[uid] = {
                "running": monitor.running,
                "subscribers": monitor.subscriber_count(),
                "keepalive": monitor._keepalive,
            }
        return report
 # Module-level singleton
 # Created at import time; other modules import this shared registry
 # (e.g. `from app.monitor import monitor_manager`).
 monitor_manager = MonitorManager()
@@ -11,7 +11,7 @@ import os
 import asyncio
 from app.database import get_db
-from app.models import NL43Config, NL43Status
+from app.models import NL43Config, NL43Status, AlertRule, AlertEvent
 from app.services import NL43Client, persist_snapshot
 logger = logging.getLogger(__name__)
@@ -121,6 +121,310 @@ async def flush_connection_pool():
    return {"status": "ok", "message": "All cached connections closed"}
@router.post("/{unit_id}/disconnect")
 async def disconnect_device(unit_id: str, db: Session = Depends(get_db)):
    """Cleanly close SLMM's persistent TCP connection to a single device.
    Gracefully closes (TCP FIN + wait_closed) the pooled connection for this
    device and removes it from the pool, freeing the NL43's single connection
    slot. Idempotent — a no-op if no connection is currently cached.
    Note: this releases the *idle* pooled connection. It does not interrupt an
    in-progress DRD stream or an in-flight command (those have the socket
    checked out of the pool) — close the stream WebSocket to end a live stream.
    """
    cfg = db.query(NL43Config).filter_by(unit_id=unit_id).first()
    if not cfg:
        raise HTTPException(status_code=404, detail="NL43 config not found")
    from app.services import _connection_pool
    device_key = f"{cfg.host}:{cfg.tcp_port}"
    had_conn = device_key in _connection_pool.get_stats().get("connections", {})
    await _connection_pool.discard(device_key)
    return {
        "status": "ok",
        "unit_id": unit_id,
        "device_key": device_key,
        "disconnected": had_conn,
        "message": "Connection closed" if had_conn else "No cached connection to close",
    }
@router.post("/{unit_id}/deactivate")
 async def deactivate_device(unit_id: str, db: Session = Depends(get_db)):
    """Make a single unit dormant: stop background polling for it AND drop its
    connection, freeing the device's connection slot. poll_enabled=False is
    persisted, so the unit stays dormant across restarts until /activate.
    """
    cfg = db.query(NL43Config).filter_by(unit_id=unit_id).first()
    if not cfg:
        raise HTTPException(status_code=404, detail="NL43 config not found")
    cfg.poll_enabled = False
    db.commit()
    from app.services import _connection_pool, _get_device_lock
    device_key = f"{cfg.host}:{cfg.tcp_port}"
    # Wait briefly for any in-flight poll/command to finish (so its connection is
    # back in the pool), then drop it. If a long-lived stream holds the lock we
    # don't block forever — discard the pooled connection regardless.
    lock = await _get_device_lock(device_key)
    acquired = False
    try:
        await asyncio.wait_for(lock.acquire(), timeout=10.0)
        acquired = True
    except asyncio.TimeoutError:
        acquired = False
    try:
        await _connection_pool.discard(device_key)
    finally:
        if acquired:
            lock.release()
    return {
        "status": "ok",
        "unit_id": unit_id,
        "poll_enabled": False,
        "message": "Polling disabled and connection closed for this unit",
    }
@router.post("/{unit_id}/activate")
 async def activate_device(unit_id: str, db: Session = Depends(get_db)):
    """Resume background polling for a unit previously deactivated."""
    cfg = db.query(NL43Config).filter_by(unit_id=unit_id).first()
    if not cfg:
        raise HTTPException(status_code=404, detail="NL43 config not found")
    cfg.poll_enabled = True
    db.commit()
    return {
        "status": "ok",
        "unit_id": unit_id,
        "poll_enabled": True,
        "message": "Polling enabled for this unit",
    }
@router.get("/_system/status")
 async def system_status():
    """Report whether this SLMM instance is actively polling or in standby."""
    from app.background_poller import poller
    from app.services import _connection_pool
    return {
        "status": "ok",
        "mode": "active" if poller.is_active() else "standby",
        "polling_active": poller.is_active(),
        "active_connections": _connection_pool.get_stats().get("active_connections", 0),
    }
@router.post("/_system/standby")
 async def system_standby():
    """Put this SLMM instance into standby: stop polling ALL devices and release
    every connection, so it stops occupying device slots (e.g. so a prod instance
    can take over). Runtime-only — on restart the instance returns to its
    SLMM_POLLING_ENABLED default.
    """
    from app.background_poller import poller
    await poller.set_active(False)
    return {"status": "ok", "mode": "standby",
            "message": "Polling stopped and all device connections released"}
@router.post("/_system/resume")
 async def system_resume():
    """Resume polling after standby (global)."""
    from app.background_poller import poller
    await poller.set_active(True)
    return {"status": "ok", "mode": "active", "message": "Polling resumed"}
 # ============================================================================
 # LIVE MONITOR (fan-out) — one DOD feed per device, broadcast to many clients
 # ============================================================================
@router.websocket("/{unit_id}/monitor")
 async def monitor_stream(websocket: WebSocket, unit_id: str):
    """Subscribe a browser to the device's shared 1 Hz DOD feed.
    Any number of clients can attach without each opening its own device
    connection (one poll loop per device, fanned out). Same JSON shape as the
    DRD stream, but DOD-sourced so it includes ln1/ln2 (L1/L10).
    """
    await websocket.accept()
    from app.monitor import monitor_manager
    monitor = await monitor_manager.get(unit_id)
    queue = await monitor.subscribe()
    logger.info(f"Monitor subscriber attached for {unit_id} ({monitor.subscriber_count()} total)")
    async def _watch_disconnect():
        # Completes when the client disconnects, so an idle feed (no data) still
        # detects the drop and we don't leak a subscription that keeps the device
        # feed (and its connection) alive.
        try:
            while True:
                msg = await websocket.receive()
                if msg.get("type") == "websocket.disconnect":
                    return
        except Exception:
            return
    gone = asyncio.ensure_future(_watch_disconnect())
    try:
        while not gone.done():
            try:
                payload = await asyncio.wait_for(queue.get(), timeout=1.0)
            except asyncio.TimeoutError:
                continue  # re-check gone.done()
            await websocket.send_json(payload)
    except WebSocketDisconnect:
        logger.info(f"Monitor subscriber disconnected for {unit_id}")
    except Exception as e:
        logger.warning(f"Monitor stream error for {unit_id}: {e}")
    finally:
        gone.cancel()
        await monitor.unsubscribe(queue)
@router.post("/{unit_id}/monitor/start")
async def monitor_start(unit_id: str):
    """Turn on the keep-alive flag for a unit's monitor feed.

    With keep-alive set the feed keeps running even when no browser is
    attached, so alerting evaluates continuously. The flag is runtime-only
    (it resets on restart).

    Returns:
        A status dict echoing the unit id, the monitor's ``running`` flag as
        read after the keep-alive was applied, and ``keepalive: True``.
    """
    from app.monitor import monitor_manager

    device_monitor = await monitor_manager.get(unit_id)
    await device_monitor.set_keepalive(True)

    # `running` is read only after set_keepalive() has completed.
    response = {"status": "ok", "unit_id": unit_id}
    response["running"] = device_monitor.running
    response["keepalive"] = True
    return response
@router.post("/{unit_id}/monitor/stop")
async def monitor_stop(unit_id: str):
    """Turn off the keep-alive flag for a unit's monitor feed.

    Once the flag is dropped the feed stops as soon as no browser subscribers
    remain.

    Returns:
        A status dict echoing the unit id with ``keepalive: False``.
    """
    from app.monitor import monitor_manager

    device_monitor = await monitor_manager.get(unit_id)
    await device_monitor.set_keepalive(False)

    response = {"status": "ok", "unit_id": unit_id}
    response["keepalive"] = False
    return response
@router.get("/_monitor/status")
async def monitor_status():
    """Report the state of every device monitor.

    Returns:
        ``{"status": "ok", "monitors": ...}`` where ``monitors`` is whatever
        ``monitor_manager.status()`` yields (running, subscriber count,
        keep-alive per monitor).
    """
    from app.monitor import monitor_manager

    overview = monitor_manager.status()
    return {"status": "ok", "monitors": overview}
 # ============================================================================
 # ALERTS — threshold rules + fired events
 # ============================================================================
 class AlertRulePayload(BaseModel):
    # Request body for creating or replacing an alert rule. The create handler
    # passes model_dump() straight into the AlertRule constructor and the update
    # handler setattr()s every dumped field onto the row, so each field name
    # here has to be a valid AlertRule attribute.
    name: str = "Alert"
    metric: str = "lp"            # lp/leq/lmax/lmin/lpeak/ln1/ln2
    comparison: str = "above"     # above | below
    threshold_db: float           # required: the only field without a default
    duration_s: int = 0           # sustained seconds before firing (0 = instant)
    clear_margin_db: float = 2.0  # hysteresis band
    # NOTE(review): presumably the minimum number of seconds between repeat
    # firings of the same rule — confirm against the evaluator.
    cooldown_s: int = 300
    schedule_start: str | None = None  # "HH:MM" local; null = always
    schedule_end: str | None = None    # NOTE(review): presumably "HH:MM" like schedule_start — confirm
    schedule_days: str | None = None   # CSV of 0-6 (Mon=0); null = every day
    # NOTE(review): channels/recipients look like dispatch targets; only the
    # "log" default is visible here — confirm accepted values in the dispatcher.
    channels: str = "log"
    recipients: str | None = None
    enabled: bool = True
 def _rule_dict(r: AlertRule) -> dict:
    return {
        "id": r.id, "unit_id": r.unit_id, "name": r.name, "metric": r.metric,
        "comparison": r.comparison, "threshold_db": r.threshold_db,
        "duration_s": r.duration_s, "clear_margin_db": r.clear_margin_db,
        "cooldown_s": r.cooldown_s, "schedule_start": r.schedule_start,
        "schedule_end": r.schedule_end, "schedule_days": r.schedule_days,
        "channels": r.channels, "recipients": r.recipients, "enabled": r.enabled,
    }
 def _event_dict(e: AlertEvent) -> dict:
    return {
        "id": e.id, "rule_id": e.rule_id, "unit_id": e.unit_id,
        "rule_name": e.rule_name, "metric": e.metric, "threshold_db": e.threshold_db,
        "onset_at": e.onset_at.isoformat() if e.onset_at else None,
        "onset_value": e.onset_value, "peak_value": e.peak_value,
        "clear_at": e.clear_at.isoformat() if e.clear_at else None,
        "status": e.status,
        "acknowledged_at": e.acknowledged_at.isoformat() if e.acknowledged_at else None,
        "acknowledged_by": e.acknowledged_by,
    }
@router.post("/{unit_id}/alerts/rules")
def create_alert_rule(unit_id: str, payload: AlertRulePayload, db: Session = Depends(get_db)):
    """Create a new alert rule for a unit.

    Every payload field becomes an attribute of the new AlertRule row. After
    the row is committed, the evaluator's cached state for the unit is
    invalidated.

    Returns:
        ``{"status": "ok", "rule": <serialized rule>}``.
    """
    rule_fields = payload.model_dump()
    created = AlertRule(unit_id=unit_id, **rule_fields)
    db.add(created)
    db.commit()
    # Reload so DB-generated values (such as the id) are populated.
    db.refresh(created)

    from app.alerts import alert_evaluator
    alert_evaluator.invalidate(unit_id)

    serialized = _rule_dict(created)
    return {"status": "ok", "rule": serialized}
@router.get("/{unit_id}/alerts/rules")
def list_alert_rules(unit_id: str, db: Session = Depends(get_db)):
    """List every alert rule stored for a unit.

    Returns:
        ``{"status": "ok", "rules": [<serialized rule>, ...]}``.
    """
    unit_rules = db.query(AlertRule).filter_by(unit_id=unit_id)
    serialized = [_rule_dict(row) for row in unit_rules.all()]
    return {"status": "ok", "rules": serialized}
@router.put("/{unit_id}/alerts/rules/{rule_id}")
def update_alert_rule(unit_id: str, rule_id: int, payload: AlertRulePayload, db: Session = Depends(get_db)):
    """Replace the fields of an existing alert rule.

    Every field of the payload (including defaulted ones) is written onto the
    row, the change is committed, and the evaluator's cached state for the
    unit is invalidated.

    Raises:
        HTTPException: 404 when no rule with this id exists for the unit.

    Returns:
        ``{"status": "ok", "rule": <serialized rule>}``.
    """
    lookup = db.query(AlertRule).filter_by(id=rule_id, unit_id=unit_id)
    existing = lookup.first()
    if not existing:
        raise HTTPException(status_code=404, detail="Alert rule not found")

    replacement = payload.model_dump()
    for attr in replacement:
        setattr(existing, attr, replacement[attr])
    db.commit()
    db.refresh(existing)

    from app.alerts import alert_evaluator
    alert_evaluator.invalidate(unit_id)

    serialized = _rule_dict(existing)
    return {"status": "ok", "rule": serialized}
@router.delete("/{unit_id}/alerts/rules/{rule_id}")
def delete_alert_rule(unit_id: str, rule_id: int, db: Session = Depends(get_db)):
    """Delete one alert rule belonging to a unit.

    After the deletion is committed, the evaluator's cached state for the
    unit is invalidated.

    Raises:
        HTTPException: 404 when no rule with this id exists for the unit.

    Returns:
        ``{"status": "ok", "deleted": rule_id}``.
    """
    lookup = db.query(AlertRule).filter_by(id=rule_id, unit_id=unit_id)
    doomed = lookup.first()
    if not doomed:
        raise HTTPException(status_code=404, detail="Alert rule not found")

    db.delete(doomed)
    db.commit()

    from app.alerts import alert_evaluator
    alert_evaluator.invalidate(unit_id)

    response = {"status": "ok"}
    response["deleted"] = rule_id
    return response
@router.get("/{unit_id}/alerts/events")
def list_alert_events(unit_id: str, limit: int = 50, db: Session = Depends(get_db)):
    """Return a unit's most recent alert events, newest onset first.

    Args:
        unit_id: Unit whose events are listed.
        limit: Maximum number of events to return (default 50). The value is
            clamped to the range 0..1000 before it reaches the query.
        db: Database session injected by ``get_db``.

    Returns:
        ``{"status": "ok", "events": [<serialized event>, ...]}``.
    """
    max_limit = 1000
    # `limit` comes straight from the query string. A negative LIMIT means
    # "no upper bound" on SQLite, so an unvalidated value such as -1 would
    # return the entire event history; an oversized value has the same effect.
    bounded_limit = max(0, min(limit, max_limit))
    events = (db.query(AlertEvent).filter_by(unit_id=unit_id)
              .order_by(AlertEvent.onset_at.desc()).limit(bounded_limit).all())
    return {"status": "ok", "events": [_event_dict(e) for e in events]}
@router.post("/{unit_id}/alerts/events/{event_id}/ack")
def ack_alert_event(unit_id: str, event_id: int, by: str | None = None, db: Session = Depends(get_db)):
    """Mark an alert event as acknowledged.

    Args:
        unit_id: Unit the event belongs to.
        event_id: Id of the event to acknowledge.
        by: Optional name of whoever acknowledged it; stored as given.
        db: Database session injected by ``get_db``.

    Raises:
        HTTPException: 404 when no event with this id exists for the unit.

    Returns:
        ``{"status": "ok", "acknowledged": event_id}``.
    """
    # Function-scope import so this block does not depend on the module's
    # import list already exposing `timezone`.
    from datetime import timezone

    evt = db.query(AlertEvent).filter_by(id=event_id, unit_id=unit_id).first()
    if not evt:
        raise HTTPException(status_code=404, detail="Alert event not found")
    # datetime.utcnow() is deprecated since Python 3.12. This yields the same
    # naive-UTC value, so the stored timestamp format is unchanged.
    evt.acknowledged_at = datetime.now(timezone.utc).replace(tzinfo=None)
    evt.acknowledged_by = by
    db.commit()
    return {"status": "ok", "acknowledged": event_id}
 # ============================================================================
 # GLOBAL POLLING STATUS ENDPOINT (must be before /{unit_id} routes)
 # ============================================================================
@@ -450,6 +754,8 @@ def get_status(unit_id: str, db: Session = Depends(get_db)):
            "lmax": status.lmax,
            "lmin": status.lmin,
            "lpeak": status.lpeak,
            "ln1": status.ln1,
            "ln2": status.ln2,
            "battery_level": status.battery_level,
            "power_source": status.power_source,
            "sd_remaining_mb": status.sd_remaining_mb,
@@ -472,6 +778,8 @@ class StatusPayload(BaseModel):
    lmax: str | None = None
    lmin: str | None = None
    lpeak: str | None = None
    ln1: str | None = None
    ln2: str | None = None
    battery_level: str | None = None
    power_source: str | None = None
    sd_remaining_mb: str | None = None
@@ -504,6 +812,8 @@ def upsert_status(unit_id: str, payload: StatusPayload, db: Session = Depends(ge
            "lmax": status.lmax,
            "lmin": status.lmin,
            "lpeak": status.lpeak,
            "ln1": status.ln1,
            "ln2": status.ln2,
            "battery_level": status.battery_level,
            "power_source": status.power_source,
            "sd_remaining_mb": status.sd_remaining_mb,
@@ -1205,6 +1515,8 @@ async def stream_live(websocket: WebSocket, unit_id: str):
                    "lmax": snap.lmax,  # Maximum level
                    "lmin": snap.lmin,  # Minimum level
                    "lpeak": snap.lpeak,  # Peak level
                    "ln1": snap.ln1,    # LN1 percentile (L1/L10 contract); null on DRD stream
                    "ln2": snap.ln2,    # LN2 percentile; null on DRD stream
                    "raw_payload": snap.raw_payload,
                })
            except Exception as e:
@@ -1876,6 +2188,8 @@ async def run_diagnostics(unit_id: str, db: Session = Depends(get_db)):
            "lmax": status.lmax,
            "lmin": status.lmin,
            "lpeak": status.lpeak,
            "ln1": status.ln1,
            "ln2": status.ln2,
            "battery_level": status.battery_level,
            "power_source": status.power_source,
            "sd_remaining_mb": status.sd_remaining_mb,
@@ -46,6 +46,8 @@ class NL43Snapshot:
    lmax: Optional[str] = None  # Maximum level
    lmin: Optional[str] = None  # Minimum level
    lpeak: Optional[str] = None  # Peak level
    ln1: Optional[str] = None  # Percentile slot LN1 (configurable; device default L5, contract L1)
    ln2: Optional[str] = None  # Percentile slot LN2 (configurable; device default L10)
    battery_level: Optional[str] = None
    power_source: Optional[str] = None
    sd_remaining_mb: Optional[str] = None
@@ -69,10 +71,27 @@ def persist_snapshot(s: NL43Snapshot, db: Session):
        logger.info(f"State transition check for {s.unit_id}: '{previous_state}' -> '{new_state}'")
-        # Device returns "Start" when measuring, "Stop" when stopped
+        # The device reports "Start" while measuring; the DOD path uses that string,
-        # Normalize to previous behavior for backward compatibility
+        # but the DRD stream path tags snapshots "Measure" (and the DOD fallback also
-        is_measuring = new_state == "Start"
+        # uses "Measure"). Treat ALL of these as "measuring" — otherwise opening and
-        was_measuring = previous_state == "Start"
+        # closing the live stream flips state "Start"->"Measure"->"Start", which the
        # old equality check misread as stop-then-start and reset measurement_start_time.
        #
        # Also: only act on RECOGNIZED states. A buffer desync on the shared connection
        # (e.g. right after a DRD/DOD test) can make a Measure? read return a stray,
        # garbled value; treating that as "not measuring" produced constant phantom
        # "STOPPED -> STARTED" log pairs and reset the timer. Ignore unknown reads.
        MEASURING_STATES = {"Start", "Measure"}
        STOPPED_STATES = {"Stop"}
        was_measuring = previous_state in MEASURING_STATES
        if new_state in MEASURING_STATES:
            is_measuring = True
        elif new_state in STOPPED_STATES:
            is_measuring = False
        else:
            logger.warning(f"Ignoring unrecognized measurement state for {s.unit_id}: {new_state!r}")
            is_measuring = was_measuring  # garbled/unknown read — no transition
        if not was_measuring and is_measuring:
            # Measurement just started - record the start time
@@ -95,6 +114,9 @@ def persist_snapshot(s: NL43Snapshot, db: Session):
            except Exception:
                pass
        # Only persist a recognized state so one garbled read can't poison the next
        # transition check (which would manufacture the phantom STOPPED/STARTED pair).
        if new_state in MEASURING_STATES or new_state in STOPPED_STATES:
            row.measurement_state = new_state
        row.counter = s.counter
        row.lp = s.lp
@@ -102,6 +124,8 @@ def persist_snapshot(s: NL43Snapshot, db: Session):
        row.lmax = s.lmax
        row.lmin = s.lmin
        row.lpeak = s.lpeak
        row.ln1 = s.ln1
        row.ln2 = s.ln2
        row.battery_level = s.battery_level
        row.power_source = s.power_source
        row.sd_remaining_mb = s.sd_remaining_mb
@@ -691,22 +715,29 @@ class NL43Client:
        snap = NL43Snapshot(unit_id="", raw_payload=resp, measurement_state=measurement_state)
-        # Parse known positions (based on NL43 communication guide - DRD format)
+        # Parse DOD positional fields. DOD's layout is DIFFERENT from DRD: it has NO
-        # DRD format: d0=counter, d1=Lp, d2=Leq, d3=Lmax, d4=Lmin, d5=Lpeak, d6=LIeq, ...
+        # leading counter and it includes LE plus LN1–LN5. The device returns 4 channels
        # of 16 fields each — [Lp, Leq, LE, Lmax, Lmin, LN1, LN2, LN3, LN4, LN5, Lpeak,
        # LIeq, Leq_mov, Ltm5, over, under] — and channel 1 (parts[0:16]) is the main
        # display. The previous code reused the DRD map (treating parts[0] as a counter),
        # which shifted everything: Lp was reported as the counter, Leq as Lp, LE as Leq,
        # and LN1 as Lpeak (you could spot it because "Lpeak" came out < Lmax).
        try:
            # DOD has no leading counter: parts[0] is Lp (channel 1), not a d0 counter
            if len(parts) >= 1:
-                snap.counter = parts[0]  # d0: Measurement interval counter (1-600)
+                snap.lp = parts[0]      # Lp: instantaneous sound pressure level
            if len(parts) >= 2:
-                snap.lp = parts[1]     # d1: Instantaneous sound pressure level
+                snap.leq = parts[1]     # Leq: equivalent continuous level
-            if len(parts) >= 3:
+            # parts[2] = LE (sound exposure level) — not currently surfaced
                snap.leq = parts[2]    # d2: Equivalent continuous sound level
            if len(parts) >= 4:
-                snap.lmax = parts[3]   # d3: Maximum level
+                snap.lmax = parts[3]    # Lmax
            if len(parts) >= 5:
-                snap.lmin = parts[4]   # d4: Minimum level
+                snap.lmin = parts[4]    # Lmin
            if len(parts) >= 11:
                snap.lpeak = parts[10]  # Lpeak  (parts[5] is LN1, NOT Lpeak)
            if len(parts) >= 6:
-                snap.lpeak = parts[5]  # d5: Peak level
+                snap.ln1 = parts[5]     # LN1 percentile slot (device default L5; contract L1)
            if len(parts) >= 7:
                snap.ln2 = parts[6]     # LN2 percentile slot (device default L10)
        except (IndexError, ValueError) as e:
            logger.warning(f"Error parsing DOD data points: {e}")
@@ -0,0 +1,58 @@
 #!/usr/bin/env python3
 """
 Migration script to add ln1 and ln2 percentile columns to the nl43_status table.
 The NL-43 DOD response carries percentile slots LN1-LN5; the live SLM display
 (Terra-View) shows two of them (default L1/L10). This adds storage for the two
 surfaced slots. Run once per database to update existing schema.
 """
 import sqlite3
 import sys
 from pathlib import Path
 DB_PATH = Path(__file__).parent / "data" / "slmm.db"
 def migrate(db_path=None):
    """Add the ln1 and ln2 columns to the nl43_status table.

    The migration is idempotent: columns that already exist are left alone,
    and only the missing ones are added.

    Args:
        db_path: Optional path to the SQLite database file. When omitted, the
            module-level ``DB_PATH`` is used, so existing ``migrate()`` calls
            behave as before.

    Returns:
        None. When the database file does not exist nothing is created.

    Raises:
        SystemExit: exit code 1 when the schema change fails; the open
            transaction is rolled back first.
    """
    # DB_PATH is only looked up when no explicit path is supplied.
    target = Path(DB_PATH if db_path is None else db_path)
    if not target.exists():
        print(f"Database not found at {target}")
        print("No migration needed - database will be created with new schema")
        return
    conn = sqlite3.connect(target)
    cursor = conn.cursor()
    try:
        cursor.execute("PRAGMA table_info(nl43_status)")
        # Column 1 of each table_info row is the column name.
        existing = {row[1] for row in cursor.fetchall()}
        missing = [name for name in ("ln1", "ln2") if name not in existing]
        if not missing:
            print("✓ ln1/ln2 columns already exist, no migration needed")
            return
        for name in missing:
            print(f"Adding {name} column...")
            # `name` only ever comes from the literal tuple above, so building
            # the statement with an f-string is safe here.
            cursor.execute(f"ALTER TABLE nl43_status ADD COLUMN {name} TEXT")
            print(f"✓ Added {name} column")
        conn.commit()
        print("\n✓ Migration completed successfully!")
    except Exception as e:
        conn.rollback()
        print(f"✗ Migration failed: {e}", file=sys.stderr)
        sys.exit(1)
    finally:
        conn.close()
 if __name__ == "__main__":
    migrate()
@@ -0,0 +1,68 @@
 """
 Synthetic unit test for the alert state machine — no DB, no device.
 Drives `_evaluate_step` with a fake clock + a level series and checks that
 onset/clear fire with the right debounce + hysteresis. Run:
    docker compose exec -T slmm python3 test_alert_evaluator.py
    # or, if app.alerts imports cleanly standalone:  python3 test_alert_evaluator.py
 """
 from types import SimpleNamespace
 from app.alerts import RuleState, _evaluate_step
 def rule(**kw):
    """Build a stand-in rule object for the state machine.

    Starts from the default test rule (85 dB threshold, 3 s duration, 2 dB
    clear margin, "above" comparison) and applies any keyword overrides.
    """
    defaults = {
        "threshold_db": 85.0,
        "duration_s": 3,
        "clear_margin_db": 2.0,
        "comparison": "above",
    }
    return SimpleNamespace(**{**defaults, **kw})
 def run(series, r):
    """Feed a level series through the state machine with a fresh state.

    ``series`` is an iterable of ``(value, now)`` pairs. Returns the list of
    ``(now, action)`` pairs for every step that produced a truthy action,
    together with the final RuleState.
    """
    state = RuleState()
    fired = []
    for level, tick in series:
        action = _evaluate_step(state, level, tick, r)
        if action:
            fired.append((tick, action))
    return fired, state
 def main():
    """Run the four state-machine scenarios and return how many failed.

    Prints one PASS/FAIL line per scenario, then a blank line and a summary.
    """
    failed = []

    def check(label, cond, detail=""):
        # Report one scenario and remember it when the condition did not hold.
        verdict = "PASS" if cond else "FAIL"
        print(verdict, label, detail)
        if not cond:
            failed.append(label)

    def fired_at(events, action):
        # Clock times at which the given action fired.
        return [when for when, what in events if what == action]

    # 1) sustained exceedance -> onset after duration; recovery -> clear after duration
    sustained = rule(threshold_db=85, duration_s=3, clear_margin_db=2)
    levels = (80, 86, 87, 88, 88, 88, 82, 82, 82, 82)
    fired, _ = run(list(zip(levels, range(len(levels)))), sustained)
    check("1 sustained onset@4 / clear@9",
          fired_at(fired, "onset") == [4] and fired_at(fired, "clear") == [9],
          str(fired))

    # 2) brief spike under duration -> no onset (debounce)
    spike = [(80, 0), (90, 1), (90, 2), (80, 3), (80, 4)]
    fired, _ = run(spike, rule(duration_s=3))
    check("2 brief spike debounced", fired == [], str(fired))

    # 3) hysteresis: a dip into the margin (below threshold, above threshold-margin)
    #    does NOT clear
    sticky = rule(threshold_db=85, duration_s=0, clear_margin_db=3)
    fired, state = run([(86, 0), (84, 1), (84, 2), (84, 3)], sticky)
    held = fired == [(0, "onset")] and state.phase == "active"
    check("3 hysteresis holds ACTIVE", held, f"{fired} phase={state.phase}")

    # 4) 'below' comparison (device too quiet) -> onset when value < threshold
    quiet = rule(threshold_db=20, duration_s=0, clear_margin_db=2, comparison="below")
    fired, _ = run([(30, 0), (15, 1)], quiet)
    check("4 below-comparison onset@1", fired == [(1, "onset")], str(fired))

    failures = len(failed)
    print()
    print("ALL PASS" if failures == 0 else f"{failures} FAILURE(S)")
    return failures
 if __name__ == "__main__":
    import sys
    # Non-zero exit status when any scenario failed.
    exit_code = 1 if main() else 0
    sys.exit(exit_code)