feat: alert engine stage 1 — rules, events, state machine, CRUD

Replaces the POC single-threshold check with a real per-rule engine over the live monitor feed.

- AlertRule / AlertEvent tables (auto-created via create_all; no migration). Rule = {metric, comparison, threshold_db, duration_s, clear_margin_db, schedule, channels, recipients}.
- alerts.py: per-(unit,rule) state machine IDLE->ACTIVE->IDLE with duration debounce (both edges) + clear_margin hysteresis; onset/clear are distinct events; optional nighttime schedule; rule cache w/ invalidation. The state-machine core (_evaluate_step) is pure (no DB/clock) for testing.
- Dispatch is a server log (POC); _dispatch() is the seam for a Terra-View webhook (email/SMS) later.
- CRUD: POST/GET/PUT/DELETE /{unit}/alerts/rules, GET /{unit}/alerts/events, POST /{unit}/alerts/events/{id}/ack.
- test_alert_evaluator.py: synthetic level series proves onset debounce, spike rejection, hysteresis hold, and below-comparison (4/4 pass, no device).

Source-agnostic: the same rules transfer unchanged if a unit's feed is later sourced from FTP intervals instead of the DOD monitor.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-09 01:04:03 +00:00
parent aa3e088b64
commit 9c43e68534
4 changed files with 444 additions and 48 deletions
@@ -1,71 +1,244 @@
 """
-Alert evaluation (POC).
+Threshold alert engine.

-Receives each monitor snapshot and fires an alert when a configured metric
-exceeds a threshold, with a cooldown so a sustained loud period doesn't spam.
+Each unit can have any number of AlertRules. A rule is evaluated against the
+unit's live monitor snapshots via a small per-(unit, rule) state machine:

-The RULE here is intentionally simple and swappable. Instantaneous Lp vs a
-sustained window vs L10 is still an open design decision — this evaluator is the
-single plug point for it. For the POC the rule is "instantaneous metric >
-threshold, rate-limited by a cooldown", and dispatch is just a server-side log.
-Wire email/SMS (likely via a Terra-View webhook) into _dispatch() later.
+    IDLE  --(metric exceeds threshold for duration_s)-->  ACTIVE   (fire ONSET)
+    ACTIVE --(metric recovers past hysteresis for duration_s)--> IDLE (fire CLEAR)

-Config via env:
- ALERT_ENABLED          (default true)
- ALERT_METRIC           which snapshot field to test: lp/leq/lmax/ln1/ln2 (default lp)
- ALERT_THRESHOLD_DB     numeric dB threshold (default 85)
- ALERT_COOLDOWN_SECONDS min seconds between alerts per unit (default 60)
+duration_s debounces both edges; clear_margin_db adds hysteresis so a level
+hovering at the threshold doesn't flap. Onset and clear are distinct events.
+
+The state-machine logic (`_evaluate_step`) is intentionally pure — no DB, no
+real clock — so it can be unit-tested with a synthetic level series and a fake
+clock. The AlertEvaluator wraps it with rule loading, scheduling, persistence,
+and dispatch. Dispatch is a server log for now (POC); the seam to POST events to
+a Terra-View webhook (email/SMS) is _dispatch().
 """

 import asyncio
 import logging
 import os
-from typing import Dict
+from dataclasses import dataclass
+from datetime import datetime, timedelta
+from typing import Dict, List, Optional, Tuple

 logger = logging.getLogger(__name__)

+# Local timezone offset in hours for schedule windows, read once at import
+# from TIMEZONE_OFFSET (default "-5"; same env var services.py uses). It is a
+# fixed offset added to UTC, so daylight-saving shifts are not applied.
+_TZ_OFFSET_HOURS = float(os.getenv("TIMEZONE_OFFSET", "-5"))
+
+# How long (seconds) to cache a unit's rules before re-querying the DB (rules
+# change rarely). AlertEvaluator.invalidate() drops cached entries early.
+_RULE_CACHE_TTL_S = 15.0
+
+
+@dataclass
+class RuleState:
+    """Mutable in-memory runtime state for a single (unit, rule) pair.
+
+    Attributes:
+        phase: "idle" or "active".
+        edge_since: clock time at which the current edge condition began, or
+            None while no edge is pending.
+        peak: most extreme reading tracked while the rule is active.
+        event_id: id of the open AlertEvent row (needed for the clear update),
+            or None when no event is open.
+    """
+
+    phase: str = "idle"
+    edge_since: Optional[float] = None
+    peak: float = 0.0
+    event_id: Optional[int] = None
+
+
+def _exceeds(value: float, rule) -> bool:
+    """Return True when `value` violates the rule's threshold.
+
+    A rule whose comparison is "below" trips on readings strictly under
+    threshold_db; any other comparison trips on readings strictly over it.
+    """
+    trips_low = rule.comparison == "below"
+    return value < rule.threshold_db if trips_low else value > rule.threshold_db
+
+
+def _recovered(value: float, rule) -> bool:
+    """Return True when `value` is back on the safe side of the threshold.
+
+    The reading must pass the threshold by more than clear_margin_db (treated
+    as 0 when unset): above it for a "below" rule, under it otherwise.
+    """
+    hysteresis = rule.clear_margin_db or 0.0
+    if rule.comparison != "below":
+        return value < rule.threshold_db - hysteresis
+    return value > rule.threshold_db + hysteresis
+
+
+def _evaluate_step(state: RuleState, value: float, now: float, rule) -> Optional[str]:
+    """Feed one reading into the (unit, rule) state machine.
+
+    Mutates `state` in place and touches nothing else; the clock value `now`
+    is supplied by the caller so tests can drive a fake clock.
+
+    Returns:
+        'onset' when the rule goes idle -> active, 'clear' when it goes
+        active -> idle, otherwise None.
+    """
+    hold = rule.duration_s or 0
+    was_idle = state.phase == "idle"
+
+    if not was_idle:
+        # While active, keep the most extreme reading in the alarm direction.
+        pick = min if rule.comparison == "below" else max
+        state.peak = pick(state.peak, value)
+
+    # The edge being waited for depends on the phase: a violation while idle,
+    # a recovery past the hysteresis margin while active.
+    edge_met = _exceeds(value, rule) if was_idle else _recovered(value, rule)
+    if not edge_met:
+        state.edge_since = None  # condition broken: restart the debounce
+        return None
+
+    if state.edge_since is None:
+        state.edge_since = now
+    if not (now - state.edge_since >= hold):
+        return None  # edge condition holds but not yet for long enough
+
+    state.edge_since = None
+    if was_idle:
+        state.phase = "active"
+        state.peak = value
+        return "onset"
+    state.phase = "idle"
+    return "clear"
+
+
+def _in_window(now_minutes: int, start: str, end: str) -> bool:
+    """Report whether now_minutes (minutes since local midnight) is in [start, end).
+
+    `start` and `end` are "HH:MM" strings. A window whose start is later than
+    its end wraps past midnight (e.g. 22:00 to 07:00); identical start and end
+    mean the window is always open.
+    """
+    bounds = []
+    for stamp in (start, end):
+        hours, minutes = stamp.split(":")
+        bounds.append(int(hours) * 60 + int(minutes))
+    opens, closes = bounds
+
+    if opens == closes:
+        return True
+    if opens > closes:
+        # Wraparound: late evening up to midnight, or midnight up to the close.
+        return now_minutes >= opens or now_minutes < closes
+    return opens <= now_minutes < closes
+

 class AlertEvaluator:
    def __init__(self):
-        self.enabled = os.getenv("ALERT_ENABLED", "true").lower() == "true"
-        self.metric = os.getenv("ALERT_METRIC", "lp").lower()
-        self.threshold_db = float(os.getenv("ALERT_THRESHOLD_DB", "85"))
-        self.cooldown_s = float(os.getenv("ALERT_COOLDOWN_SECONDS", "60"))
-        self._last_fired: Dict[str, float] = {}
-        logger.info(
-            f"[ALERT] evaluator ready: enabled={self.enabled} metric={self.metric} "
-            f"threshold={self.threshold_db}dB cooldown={self.cooldown_s}s"
-        )
+        """Create an evaluator with no per-rule state and an empty rule cache."""
+        # Runtime state per (unit_id, rule.id); entries are created on first use
+        # in evaluate().
+        self._states: Dict[Tuple[str, int], RuleState] = {}
+        self._rule_cache: Dict[str, Tuple[float, list]] = {}  # unit_id -> (fetched_at, rules)
+        logger.info("[ALERT] rule-based evaluator ready")

    async def evaluate(self, unit_id: str, snap) -> None:
-        """Evaluate one snapshot; fire (log) if the metric exceeds threshold."""
-        if not self.enabled:
+        """Evaluate every enabled rule for this unit against one snapshot.
+
+        For each rule that is inside its schedule, the rule's metric is read
+        from `snap`, the per-(unit, rule) state machine is advanced, and any
+        onset/clear transition is persisted and dispatched. A missing or
+        non-numeric reading (e.g. "-.-") is skipped for that rule.
+
+        Args:
+            unit_id: unit whose rules are evaluated.
+            snap: monitor snapshot; each rule reads the attribute named by
+                its `metric`.
+        """
+        rules = self._get_rules(unit_id)
+        if not rules:
            return
-
-        raw = getattr(snap, self.metric, None)
-        try:
-            level = float(raw)
-        except (TypeError, ValueError):
-            return  # missing / non-numeric (e.g. "-.-")
-
-        if level <= self.threshold_db:
-            return
-
-        # Cooldown — use the event loop clock (Math.random/Date.now-free).
        now = asyncio.get_running_loop().time()
-        if now - self._last_fired.get(unit_id, 0.0) < self.cooldown_s:
-            return
-        self._last_fired[unit_id] = now
+        for rule in rules:
+            key = (unit_id, rule.id)
+            if not self._in_schedule(rule):
+                # No readings are evaluated outside the schedule, so a pending
+                # debounce timer must not survive the gap: otherwise the first
+                # in-window reading would see a huge elapsed time and fire
+                # onset/clear without the edge having held for duration_s.
+                # The phase (and any open event) is deliberately left as is.
+                paused = self._states.get(key)
+                if paused is not None:
+                    paused.edge_since = None
+                continue
+            raw = getattr(snap, rule.metric, None)
+            try:
+                value = float(raw)
+            except (TypeError, ValueError):
+                continue  # missing / non-numeric ("-.-")
+            state = self._states.setdefault(key, RuleState())
+            action = _evaluate_step(state, value, now, rule)
+            if action == "onset":
+                await self._on_onset(unit_id, rule, value, state)
+            elif action == "clear":
+                await self._on_clear(unit_id, rule, value, state)

-        await self._dispatch(unit_id, level)
+    # -- rule loading (cached) ----------------------------------------------

-    async def _dispatch(self, unit_id: str, level: float) -> None:
-        """POC dispatch: server-side log. Swap in email/SMS here later."""
-        logger.warning(
-            f"[ALERT] {unit_id}: {self.metric.upper()}={level:.1f} dB exceeded "
-            f"threshold {self.threshold_db:.1f} dB"
+    def _get_rules(self, unit_id: str) -> list:
+        """Return the unit's enabled rules, reloading once the cache entry expires.
+
+        An entry is reused while it is younger than _RULE_CACHE_TTL_S on the
+        event-loop clock. Whatever _load_rules returns is cached, including
+        the empty list it returns when the query fails.
+        """
+        stamp = asyncio.get_running_loop().time()
+        entry = self._rule_cache.get(unit_id)
+        if not entry or not (stamp - entry[0] < _RULE_CACHE_TTL_S):
+            entry = (stamp, self._load_rules(unit_id))
+            self._rule_cache[unit_id] = entry
+        return entry[1]
+
+    def _load_rules(self, unit_id: str) -> list:
+        """Query the DB for this unit's enabled AlertRule rows.
+
+        Any exception is logged as a warning and an empty list is returned;
+        the session is closed on every path.
+        """
+        from app.database import SessionLocal
+        from app.models import AlertRule
+        session = SessionLocal()
+        try:
+            query = session.query(AlertRule).filter_by(unit_id=unit_id, enabled=True)
+            rows = query.all()
+        except Exception as exc:
+            logger.warning(f"[ALERT] failed to load rules for {unit_id}: {exc}")
+            rows = []
+        finally:
+            session.close()
+        return rows
+
+    def invalidate(self, unit_id: Optional[str] = None) -> None:
+        """Forget cached rules so the next evaluation reloads them.
+
+        Args:
+            unit_id: unit whose cache entry is dropped (no error if it has
+                none); None clears the cache for every unit.
+        """
+        if unit_id is not None:
+            self._rule_cache.pop(unit_id, None)
+            return
+        self._rule_cache.clear()
+
+    # -- scheduling ----------------------------------------------------------
+
+    def _in_schedule(self, rule) -> bool:
+        """Return True when the rule should be evaluated right now.
+
+        A rule without both schedule_start and schedule_end is limited only by
+        its allowed weekdays. Otherwise the current local time (UTC shifted by
+        _TZ_OFFSET_HOURS) must fall on an allowed weekday and inside the
+        [schedule_start, schedule_end) window.
+        """
+        # Function-scope import, like the DB imports elsewhere in this class.
+        from datetime import timezone
+
+        if not rule.schedule_start or not rule.schedule_end:
+            return self._day_ok(rule)
+        # datetime.utcnow() is deprecated since Python 3.12; this yields the
+        # same naive-UTC value.
+        utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
+        local = utc_now + timedelta(hours=_TZ_OFFSET_HOURS)
+        if not self._day_ok(rule, local):
+            return False
+        return _in_window(local.hour * 60 + local.minute, rule.schedule_start, rule.schedule_end)
+
+    @staticmethod
+    def _day_ok(rule, local: Optional[datetime] = None) -> bool:
+        """Return True when `local` falls on a weekday the rule allows.
+
+        Args:
+            rule: object whose schedule_days is a comma-separated list of
+                weekday numbers (Mon=0); empty/None allows every day.
+            local: local datetime to test; defaults to the current UTC time
+                shifted by _TZ_OFFSET_HOURS.
+        """
+        if not rule.schedule_days:
+            return True
+        if local is None:
+            # Function-scope import, like the DB imports elsewhere in this class.
+            from datetime import timezone
+
+            # datetime.utcnow() is deprecated since Python 3.12; this yields
+            # the same naive-UTC value.
+            utc_now = datetime.now(timezone.utc).replace(tzinfo=None)
+            local = utc_now + timedelta(hours=_TZ_OFFSET_HOURS)
+        allowed = {int(d) for d in str(rule.schedule_days).split(",") if d.strip() != ""}
+        return local.weekday() in allowed  # Mon=0
+
+    # -- event persistence + dispatch ---------------------------------------
+
+    async def _on_onset(self, unit_id: str, rule, value: float, state: RuleState) -> None:
+        """Record a new active AlertEvent for this onset, then dispatch it.
+
+        On a successful insert the new row's id is stored in state.event_id so
+        the later clear can update that row. A persistence failure is logged
+        as a warning and the ONSET is still dispatched.
+        """
+        from app.database import SessionLocal
+        from app.models import AlertEvent
+        session = SessionLocal()
+        try:
+            row = AlertEvent(
+                rule_id=rule.id,
+                unit_id=unit_id,
+                rule_name=rule.name,
+                metric=rule.metric,
+                threshold_db=rule.threshold_db,
+                onset_value=value,
+                peak_value=value,
+                status="active",
+            )
+            session.add(row)
+            session.commit()
+            session.refresh(row)
+            state.event_id = row.id
+        except Exception as exc:
+            logger.warning(f"[ALERT] failed to record onset for {unit_id}: {exc}")
+        finally:
+            session.close()
+        sign = "<" if rule.comparison == "below" else ">"
+        held = f" for {rule.duration_s}s" if rule.duration_s else ""
+        await self._dispatch(
+            "ONSET", unit_id, rule,
+            f"{rule.metric.upper()}={value:.1f} dB {sign} {rule.threshold_db:.1f} dB{held}",
        )

+    async def _on_clear(self, unit_id: str, rule, value: float, state: RuleState) -> None:
+        """Close the open AlertEvent (if any) for this rule, then dispatch CLEAR.
+
+        When state.event_id refers to an existing row, the row gets clear_at
+        (naive UTC), the peak tracked in `state`, and status "cleared". A
+        persistence failure is logged as a warning; state.event_id is reset
+        and the CLEAR is dispatched either way.
+        """
+        from datetime import timezone
+
+        from app.database import SessionLocal
+        from app.models import AlertEvent
+        peak = state.peak
+        db = SessionLocal()
+        try:
+            if state.event_id is not None:
+                evt = db.query(AlertEvent).filter_by(id=state.event_id).first()
+                if evt:
+                    # datetime.utcnow() is deprecated since Python 3.12; this
+                    # stores the same naive-UTC value.
+                    evt.clear_at = datetime.now(timezone.utc).replace(tzinfo=None)
+                    evt.peak_value = peak
+                    evt.status = "cleared"
+                    db.commit()
+        except Exception as e:
+            logger.warning(f"[ALERT] failed to record clear for {unit_id}: {e}")
+        finally:
+            db.close()
+        state.event_id = None
+        await self._dispatch(
+            "CLEAR", unit_id, rule,
+            f"recovered to {value:.1f} dB (peak {peak:.1f} dB)",
+        )

-# Module-level singleton
+    async def _dispatch(self, kind: str, unit_id: str, rule, detail: str) -> None:
+        """Emit one alert notification; for the POC this is a warning-level server log.
+
+        This is the place to swap in a Terra-View webhook (email/SMS).
+        """
+        message = f"[ALERT:{kind}] {unit_id} '{rule.name}': {detail}"
+        logger.warning(message)
+
+
+# Module-level singleton (the monitor calls alert_evaluator.evaluate per snapshot)
 alert_evaluator = AlertEvaluator()