From ad6071b79031d237c39cfaeed32573ee1b131554 Mon Sep 17 00:00:00 2001
From: serversdown <brian@serversdown.net>
Date: Thu, 11 Jun 2026 23:40:52 +0000
Subject: [PATCH] fix(alerts): reset rule state + close open event on rule
 edit/delete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

invalidate() only dropped the rule cache, not the per-(unit, rule) state machine.
Editing a rule's metric/threshold therefore left a stale 'active' phase that
mis-evaluated against the new config (spurious clear, or suppressed onset), and
deleting an in-alarm rule left an open AlertEvent that kept the client portal stuck
"in alarm" forever. update/delete now call _reset_rule_runtime, which uses
forget_rule() to drop the state machine and closes any open event for that rule.

Verified: existing evaluator tests + cooldown scenario still pass; compiles.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 app/alerts.py  |  6 ++++++
 app/routers.py | 15 +++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/app/alerts.py b/app/alerts.py
index cb89e2d..062730c 100644
--- a/app/alerts.py
+++ b/app/alerts.py
@@ -175,6 +175,12 @@ class AlertEvaluator:
         else:
             self._rule_cache.pop(unit_id, None)
 
+    def forget_rule(self, unit_id: str, rule_id: int) -> None:
+        """Discard the (unit_id, rule_id) state entry after a rule edit/delete so a stale
+        'active' phase or open event_id can't carry over; no-op if the key is absent."""
+        state_key = (unit_id, rule_id)
+        self._states.pop(state_key, None)
+
     # -- scheduling ----------------------------------------------------------
 
     def _in_schedule(self, rule) -> bool:
diff --git a/app/routers.py b/app/routers.py
index ac9d72b..7764b48 100644
--- a/app/routers.py
+++ b/app/routers.py
@@ -428,6 +428,19 @@ async def _sync_keepalive_to_rules(unit_id: str, db: Session):
     await m.set_keepalive(True)
 
 
+def _reset_rule_runtime(unit_id: str, rule_id: int, db: Session):
+    """Reset a rule's runtime after an edit/delete: forget its evaluator state and mark
+    its 'active' AlertEvent rows 'cleared' so nothing stays in alarm on a stale rule."""
+    from app.alerts import alert_evaluator
+    alert_evaluator.forget_rule(unit_id, rule_id)
+    cleared_at = datetime.utcnow()
+    open_events = db.query(AlertEvent).filter_by(unit_id=unit_id, rule_id=rule_id, status="active")
+    for open_event in open_events.all():
+        open_event.clear_at = cleared_at
+        open_event.status = "cleared"
+    db.commit()
+
+
 @router.post("/{unit_id}/alerts/rules")
 async def create_alert_rule(unit_id: str, payload: AlertRulePayload, db: Session = Depends(get_db)):
     rule = AlertRule(unit_id=unit_id, **payload.model_dump())
@@ -457,6 +470,7 @@ async def update_alert_rule(unit_id: str, rule_id: int, payload: AlertRulePayloa
     db.refresh(rule)
     from app.alerts import alert_evaluator
     alert_evaluator.invalidate(unit_id)
+    _reset_rule_runtime(unit_id, rule_id, db)
     await _sync_keepalive_to_rules(unit_id, db)
     return {"status": "ok", "rule": _rule_dict(rule)}
 
@@ -470,6 +484,7 @@ async def delete_alert_rule(unit_id: str, rule_id: int, db: Session = Depends(ge
     db.commit()
     from app.alerts import alert_evaluator
     alert_evaluator.invalidate(unit_id)
+    _reset_rule_runtime(unit_id, rule_id, db)   # close its open event so the portal doesn't stay red
     await _sync_keepalive_to_rules(unit_id, db)  # no-op if no enabled rules remain
     return {"status": "ok", "deleted": rule_id}