9c43e68534
Replaces the POC single-threshold check with a real per-rule engine over
the live monitor feed.
- AlertRule / AlertEvent tables (auto-created via create_all; no migration).
Rule = {metric, comparison, threshold_db, duration_s, clear_margin_db,
schedule, channels, recipients}.
- alerts.py: per-(unit,rule) state machine IDLE->ACTIVE->IDLE with duration
debounce (both edges) + clear_margin hysteresis; onset/clear are distinct
events; optional nighttime schedule; rule cache w/ invalidation. The
state-machine core (_evaluate_step) is pure (no DB/clock) for testing.
- Dispatch is a server log (POC); _dispatch() is the seam for a Terra-View
webhook (email/SMS) later.
- CRUD: POST/GET/PUT/DELETE /{unit}/alerts/rules, GET /{unit}/alerts/events,
POST /{unit}/alerts/events/{id}/ack.
- test_alert_evaluator.py: synthetic level series proves onset debounce,
spike rejection, hysteresis hold, and below-comparison (4/4 pass, no device).
Source-agnostic: the same rules transfer unchanged if a unit's feed is later
sourced from FTP intervals instead of the DOD monitor.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
245 lines
9.0 KiB
Python
245 lines
9.0 KiB
Python
"""
|
||
Threshold alert engine.
|
||
|
||
Each unit can have any number of AlertRules. A rule is evaluated against the
|
||
unit's live monitor snapshots via a small per-(unit, rule) state machine:
|
||
|
||
IDLE --(metric exceeds threshold for duration_s)--> ACTIVE (fire ONSET)
|
||
ACTIVE --(metric recovers past hysteresis for duration_s)--> IDLE (fire CLEAR)
|
||
|
||
duration_s debounces both edges; clear_margin_db adds hysteresis so a level
|
||
hovering at the threshold doesn't flap. Onset and clear are distinct events.
|
||
|
||
The state-machine logic (`_evaluate_step`) is intentionally pure — no DB, no
|
||
real clock — so it can be unit-tested with a synthetic level series and a fake
|
||
clock. The AlertEvaluator wraps it with rule loading, scheduling, persistence,
|
||
and dispatch. Dispatch is a server log for now (POC); the seam to POST events to
|
||
a Terra-View webhook (email/SMS) is _dispatch().
|
||
"""
|
||
|
||
import asyncio
|
||
import logging
|
||
import os
|
||
from dataclasses import dataclass
|
||
from datetime import datetime, timedelta
|
||
from typing import Dict, List, Optional, Tuple
|
||
|
||
logger = logging.getLogger(__name__)

# Offset, in hours, added to UTC to obtain the local time used for schedule
# windows. Read once at import from TIMEZONE_OFFSET (the same env var
# services.py uses); defaults to -5 when the variable is unset.
_TZ_OFFSET_HOURS = float(os.environ.get("TIMEZONE_OFFSET", "-5"))

# Seconds a unit's rule list stays cached before the DB is queried again
# (rules change rarely).
_RULE_CACHE_TTL_S = 15.0
|
||
|
||
|
||
@dataclass
class RuleState:
    """Mutable runtime state kept in memory for a single (unit, rule) pair."""

    # Which side of the state machine we are on: "idle" or "active".
    phase: str = "idle"
    # Clock time at which the pending edge condition (onset while idle, recovery
    # while active) first held; None when no edge is currently pending.
    edge_since: Optional[float] = None
    # Extreme value seen since onset (seeded with the onset value).
    peak: float = 0.0
    # Primary key of the open AlertEvent row, so the clear can update it.
    event_id: Optional[int] = None
|
||
|
||
|
||
def _exceeds(value: float, rule) -> bool:
    """Return True when `value` violates the rule's threshold.

    A rule whose comparison is "below" is violated by values strictly under
    threshold_db; any other comparison is violated by values strictly over it.
    """
    limit = rule.threshold_db
    return value < limit if rule.comparison == "below" else value > limit
|
||
|
||
|
||
def _recovered(value: float, rule) -> bool:
    """Return True when `value` is back on the safe side of the threshold.

    The value must pass the threshold by more than clear_margin_db (treated as
    0 when unset) in the direction opposite to the violation, which gives the
    alert hysteresis.
    """
    hysteresis = rule.clear_margin_db or 0.0
    if rule.comparison != "below":
        return value < rule.threshold_db - hysteresis
    return value > rule.threshold_db + hysteresis
|
||
|
||
|
||
def _evaluate_step(state: RuleState, value: float, now: float, rule) -> Optional[str]:
    """Feed one reading into the per-rule state machine.

    Pure apart from mutating `state`: no DB access and no real clock. `now` is
    supplied by the caller so tests can drive a fake clock.

    Returns "onset" when the rule goes idle -> active, "clear" when it goes
    active -> idle, and None when this reading causes no transition.
    """
    hold = rule.duration_s or 0
    was_idle = state.phase == "idle"

    if not was_idle:
        # While active, keep tracking the extreme in the direction of the violation.
        pick = min if rule.comparison == "below" else max
        state.peak = pick(state.peak, value)

    # The edge being watched depends on the phase: onset while idle, recovery
    # while active.
    edge_holds = _exceeds(value, rule) if was_idle else _recovered(value, rule)
    if not edge_holds:
        # Condition broken: restart the debounce timer.
        state.edge_since = None
        return None

    if state.edge_since is None:
        state.edge_since = now

    if now - state.edge_since >= hold:
        state.edge_since = None
        if was_idle:
            state.phase = "active"
            state.peak = value
            return "onset"
        state.phase = "idle"
        return "clear"

    # Condition holds but has not lasted long enough yet.
    return None
|
||
|
||
|
||
def _in_window(now_minutes: int, start: str, end: str) -> bool:
    """Tell whether `now_minutes` (minutes since local midnight) is in [start, end).

    `start` and `end` are "HH:MM" strings. A window whose end is earlier than
    its start wraps past midnight (e.g. 22:00–07:00). Equal start and end
    cover the whole day.
    """
    def _to_minutes(hhmm: str) -> int:
        hours, minutes = (int(part) for part in hhmm.split(":"))
        return 60 * hours + minutes

    begin = _to_minutes(start)
    finish = _to_minutes(end)
    if begin == finish:
        return True

    at_or_after_begin = now_minutes >= begin
    before_finish = now_minutes < finish
    if begin < finish:
        return at_or_after_begin and before_finish
    # Wraparound: late evening or early morning.
    return at_or_after_begin or before_finish
|
||
|
||
|
||
class AlertEvaluator:
    """Evaluates each unit's enabled AlertRules against live monitor snapshots.

    Keeps one in-memory RuleState per (unit_id, rule.id) plus a short-lived
    per-unit rule cache. State transitions are decided by `_evaluate_step`;
    this class adds rule loading, schedule gating, event persistence and
    dispatch around it.
    """

    def __init__(self):
        self._states: Dict[Tuple[str, int], "RuleState"] = {}
        self._rule_cache: Dict[str, Tuple[float, list]] = {}  # unit_id -> (fetched_at, rules)
        logger.info("[ALERT] rule-based evaluator ready")

    async def evaluate(self, unit_id: str, snap) -> None:
        """Evaluate every enabled rule for this unit against one snapshot.

        Args:
            unit_id: Unit the snapshot belongs to.
            snap: Snapshot object; each rule reads the attribute named by its
                `metric`. Missing or non-numeric values are skipped.
        """
        rules = self._get_rules(unit_id)
        if not rules:
            return
        now = asyncio.get_running_loop().time()
        for rule in rules:
            key = (unit_id, rule.id)

            if not self._in_schedule(rule):
                # Readings outside the schedule are not evaluated, so a debounce
                # timer started before the window closed must not survive until
                # it reopens: otherwise `now - edge_since` is already larger than
                # duration_s and the first matching sample fires without any
                # debounce. The phase itself is left untouched.
                paused = self._states.get(key)
                if paused is not None:
                    paused.edge_since = None
                continue

            raw = getattr(snap, rule.metric, None)
            try:
                value = float(raw)
            except (TypeError, ValueError):
                continue  # missing / non-numeric ("-.-")

            state = self._states.get(key)
            if state is None:
                state = self._states[key] = RuleState()

            action = _evaluate_step(state, value, now, rule)
            if action == "onset":
                await self._on_onset(unit_id, rule, value, state)
            elif action == "clear":
                await self._on_clear(unit_id, rule, value, state)

    # -- rule loading (cached) ----------------------------------------------

    def _get_rules(self, unit_id: str) -> list:
        """Return the unit's enabled rules, re-querying at most every _RULE_CACHE_TTL_S.

        Must be called from within a running event loop: the loop clock is used
        to age cache entries.
        """
        loop_now = asyncio.get_running_loop().time()
        cached = self._rule_cache.get(unit_id)
        if cached and loop_now - cached[0] < _RULE_CACHE_TTL_S:
            return cached[1]
        rules = self._load_rules(unit_id)
        self._rule_cache[unit_id] = (loop_now, rules)
        return rules

    def _load_rules(self, unit_id: str) -> list:
        """Query the enabled AlertRule rows for `unit_id`.

        Best-effort: on any error a warning is logged and an empty list is
        returned (which the caller then caches like any other result).
        """
        # Imported here rather than at module top — presumably to avoid an
        # import cycle with the app package; confirm before hoisting.
        from app.database import SessionLocal
        from app.models import AlertRule

        db = SessionLocal()
        try:
            return db.query(AlertRule).filter_by(unit_id=unit_id, enabled=True).all()
        except Exception as e:
            logger.warning(f"[ALERT] failed to load rules for {unit_id}: {e}")
            return []
        finally:
            db.close()

    def invalidate(self, unit_id: Optional[str] = None) -> None:
        """Drop cached rules so a change is picked up immediately.

        Args:
            unit_id: Unit whose cache entry to drop; None drops every entry.
        """
        if unit_id is None:
            self._rule_cache.clear()
        else:
            self._rule_cache.pop(unit_id, None)

    # -- scheduling ----------------------------------------------------------

    def _in_schedule(self, rule) -> bool:
        """Return True when the rule should be evaluated right now.

        Local time is UTC shifted by _TZ_OFFSET_HOURS. The weekday filter always
        applies; the time-of-day window applies only when both schedule_start
        and schedule_end are set.
        """
        local = datetime.utcnow() + timedelta(hours=_TZ_OFFSET_HOURS)
        if not self._day_ok(rule, local):
            return False
        if not rule.schedule_start or not rule.schedule_end:
            return True
        return _in_window(local.hour * 60 + local.minute, rule.schedule_start, rule.schedule_end)

    @staticmethod
    def _day_ok(rule, local: Optional[datetime] = None) -> bool:
        """Return True when `local`'s weekday is allowed by rule.schedule_days.

        schedule_days is a comma-separated list of weekday numbers (Mon=0); an
        empty/unset value allows every day. When `local` is omitted, the current
        local time (UTC + _TZ_OFFSET_HOURS) is used.
        """
        if not rule.schedule_days:
            return True
        if local is None:
            local = datetime.utcnow() + timedelta(hours=_TZ_OFFSET_HOURS)
        allowed = {int(d) for d in str(rule.schedule_days).split(",") if d.strip() != ""}
        return local.weekday() in allowed  # Mon=0

    # -- event persistence + dispatch ---------------------------------------

    async def _on_onset(self, unit_id: str, rule, value: float, state: "RuleState") -> None:
        """Record a new active AlertEvent and dispatch the ONSET notification.

        Persistence is best-effort: a DB failure is logged and the notification
        is still dispatched. On success the new row's id is remembered in
        `state.event_id` so the clear can update it.
        """
        # NOTE(review): the session calls below run directly inside a coroutine;
        # if SessionLocal is a synchronous session they block the event loop
        # for the duration of the commit — confirm this is acceptable.
        from app.database import SessionLocal
        from app.models import AlertEvent

        db = SessionLocal()
        try:
            evt = AlertEvent(
                rule_id=rule.id, unit_id=unit_id, rule_name=rule.name,
                metric=rule.metric, threshold_db=rule.threshold_db,
                onset_value=value, peak_value=value, status="active",
            )
            db.add(evt)
            db.commit()
            db.refresh(evt)
            state.event_id = evt.id
        except Exception as e:
            logger.warning(f"[ALERT] failed to record onset for {unit_id}: {e}")
        finally:
            db.close()

        await self._dispatch(
            "ONSET", unit_id, rule,
            f"{rule.metric.upper()}={value:.1f} dB "
            f"{'<' if rule.comparison == 'below' else '>'} {rule.threshold_db:.1f} dB"
            f"{f' for {rule.duration_s}s' if rule.duration_s else ''}",
        )

    async def _on_clear(self, unit_id: str, rule, value: float, state: "RuleState") -> None:
        """Close the open AlertEvent (if any) and dispatch the CLEAR notification.

        Persistence is best-effort: a DB failure is logged and the notification
        is still dispatched. `state.event_id` is always reset afterwards.
        """
        peak = state.peak

        from app.database import SessionLocal
        from app.models import AlertEvent

        db = SessionLocal()
        try:
            if state.event_id is not None:
                evt = db.query(AlertEvent).filter_by(id=state.event_id).first()
                if evt:
                    evt.clear_at = datetime.utcnow()
                    evt.peak_value = peak
                    evt.status = "cleared"
                    db.commit()
        except Exception as e:
            logger.warning(f"[ALERT] failed to record clear for {unit_id}: {e}")
        finally:
            db.close()

        state.event_id = None
        await self._dispatch(
            "CLEAR", unit_id, rule,
            f"recovered to {value:.1f} dB (peak {peak:.1f} dB)",
        )

    async def _dispatch(self, kind: str, unit_id: str, rule, detail: str) -> None:
        """POC dispatch: server log. Swap in a Terra-View webhook (email/SMS) here."""
        logger.warning(f"[ALERT:{kind}] {unit_id} '{rule.name}': {detail}")
|
||
|
||
|
||
# Shared module-level instance: the monitor calls alert_evaluator.evaluate()
# once per snapshot.
alert_evaluator = AlertEvaluator()
|