From ad6071b79031d237c39cfaeed32573ee1b131554 Mon Sep 17 00:00:00 2001
From: serversdown
Date: Thu, 11 Jun 2026 23:40:52 +0000
Subject: [PATCH] fix(alerts): reset rule state + close open event on rule edit/delete
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

invalidate() only dropped the rule cache, not the per-(unit,rule) state
machine — so editing a rule's metric/threshold left a stale 'active' phase
that mis-evaluated against the new config (spurious clear, or suppressed
onset), and deleting an in-alarm rule left an open AlertEvent that kept the
client portal stuck "in alarm" forever.

update/delete now call _reset_rule_runtime: forget_rule() drops the state
machine and any open event for that rule is closed.

Verified: existing evaluator tests + cooldown scenario still pass; compiles.

Co-Authored-By: Claude Opus 4.8
---
 app/alerts.py  |  6 ++++++
 app/routers.py | 15 +++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/app/alerts.py b/app/alerts.py
index cb89e2d..062730c 100644
--- a/app/alerts.py
+++ b/app/alerts.py
@@ -175,6 +175,12 @@ class AlertEvaluator:
         else:
             self._rule_cache.pop(unit_id, None)
 
+    def forget_rule(self, unit_id: str, rule_id: int) -> None:
+        """Drop a rule's per-(unit, rule) state machine after the rule is edited or
+        deleted, so a stale 'active' phase / open event_id from the old config
+        doesn't bleed into the new one (mis-firing a clear or suppressing an onset)."""
+        self._states.pop((unit_id, rule_id), None)
+
     # -- scheduling ----------------------------------------------------------
 
     def _in_schedule(self, rule) -> bool:
diff --git a/app/routers.py b/app/routers.py
index ac9d72b..7764b48 100644
--- a/app/routers.py
+++ b/app/routers.py
@@ -428,6 +428,19 @@ async def _sync_keepalive_to_rules(unit_id: str, db: Session):
         await m.set_keepalive(True)
 
 
+def _reset_rule_runtime(unit_id: str, rule_id: int, db: Session):
+    """After a rule edit/delete: drop its evaluator state machine and close any open
+    event, so a stale 'active' phase doesn't mis-evaluate against the new config and
+    the client portal doesn't stay 'in alarm' on a rule that changed or is gone."""
+    from app.alerts import alert_evaluator
+    alert_evaluator.forget_rule(unit_id, rule_id)
+    now = datetime.utcnow()
+    for evt in db.query(AlertEvent).filter_by(unit_id=unit_id, rule_id=rule_id, status="active").all():
+        evt.clear_at = now
+        evt.status = "cleared"
+    db.commit()
+
+
 @router.post("/{unit_id}/alerts/rules")
 async def create_alert_rule(unit_id: str, payload: AlertRulePayload, db: Session = Depends(get_db)):
     rule = AlertRule(unit_id=unit_id, **payload.model_dump())
@@ -457,6 +470,7 @@ async def update_alert_rule(unit_id: str, rule_id: int, payload: AlertRulePayloa
     db.refresh(rule)
     from app.alerts import alert_evaluator
     alert_evaluator.invalidate(unit_id)
+    _reset_rule_runtime(unit_id, rule_id, db)
     await _sync_keepalive_to_rules(unit_id, db)
     return {"status": "ok", "rule": _rule_dict(rule)}
 
@@ -470,6 +484,7 @@ async def delete_alert_rule(unit_id: str, rule_id: int, db: Session = Depends(ge
     db.commit()
     from app.alerts import alert_evaluator
     alert_evaluator.invalidate(unit_id)
+    _reset_rule_runtime(unit_id, rule_id, db)  # close its open event so the portal doesn't stay red
     await _sync_keepalive_to_rules(unit_id, db)  # no-op if no enabled rules remain
     return {"status": "ok", "deleted": rule_id}
 