From 3c5e830f9c70cba7d06c1857c8d7b03266debe43 Mon Sep 17 00:00:00 2001 From: serversdown Date: Tue, 16 Jun 2026 02:54:37 +0000 Subject: [PATCH] feat(reports): one safe restart-retry in the cycle before alerting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit If the post-restart DOD check shows the meter isn't measuring, retry once with start_recording (a plain start that does NOT re-index, unlike start_cycle) and re-verify before raising the schedule-failed alert. Retry fires only on a confident not-measuring reading — never on a failed/inconclusive DOD read — so a flaky read can't disrupt an already-running measurement or split the night across two store folders. Turns a transient restart hiccup into a self-heal instead of a meter left stopped overnight. Co-Authored-By: Claude Opus 4.8 (1M context) --- backend/services/scheduler.py | 69 +++++++++++++++++++++++------------ 1 file changed, 45 insertions(+), 24 deletions(-) diff --git a/backend/services/scheduler.py b/backend/services/scheduler.py index fc16ee9..ec03cd1 100644 --- a/backend/services/scheduler.py +++ b/backend/services/scheduler.py @@ -680,31 +680,52 @@ class SchedulerService: logger.info(f"[CYCLE] New measurement started, session {new_session.id}") # Step 6b: Verify the meter actually resumed measuring (fresh DOD). - # Polling is still paused here, so query directly. Advisory: a - # failure alerts loudly but doesn't fail the cycle (DOD reads can - # be transiently flaky); the keepalive poll re-confirms within ~10s. + # Polling is still paused here, so query directly. If it didn't + # resume, retry ONCE with a plain start (start_recording — does NOT + # re-index, unlike start_cycle) before alerting: a meter left + # stopped overnight is the costly failure, and a transient restart + # hiccup is common on the NL-43. We retry only on a *confident* + # not-measuring reading — never on a failed/inconclusive DOD read — + # so a flaky read can't disrupt an already-running measurement. if action.device_type == "slm": - try: - await asyncio.sleep(2) - live = await self.device_controller.get_live_data(unit_id, action.device_type) - state = ((live or {}).get("measurement_state") - or ((live or {}).get("data") or {}).get("measurement_state") or "") - measuring = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running") - result["steps"]["restart_verified"] = measuring - if measuring: - logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).") - else: - logger.error(f"[CYCLE] Restart NOT verified for {unit_id} — state={state!r}") - try: - get_alert_service(db).create_schedule_failed_alert( - schedule_id=action.id, action_type="cycle", unit_id=unit_id, - error_message=f"Meter did not resume measuring after the cycle (state={state!r}).", - project_id=action.project_id, location_id=action.location_id, - ) - except Exception as ae: - logger.warning(f"[CYCLE] restart-verify alert failed: {ae}") - except Exception as e: - logger.warning(f"[CYCLE] Restart verification skipped (DOD read failed): {e}") + async def _check_measuring(): + """Return (measuring, state); measuring is None if the DOD read failed.""" + try: + await asyncio.sleep(2) + live = await self.device_controller.get_live_data(unit_id, action.device_type) + state = ((live or {}).get("measurement_state") + or ((live or {}).get("data") or {}).get("measurement_state") or "") + ok = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running") + return ok, state + except Exception as e: + logger.warning(f"[CYCLE] Restart-verify DOD read failed: {e}") + return None, None + + measuring, state = await _check_measuring() + if measuring is False: + logger.warning(f"[CYCLE] {unit_id} not measuring after restart (state={state!r}) — retrying start once.") + result["steps"]["restart_retry"] = True + try: + await self.device_controller.start_recording(unit_id, action.device_type) + measuring, state = await _check_measuring() + except Exception as e: + logger.error(f"[CYCLE] Restart retry (start_recording) failed for {unit_id}: {e}") + + result["steps"]["restart_verified"] = measuring + if measuring: + logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).") + elif measuring is False: + logger.error(f"[CYCLE] Restart NOT verified for {unit_id} after retry — state={state!r}") + try: + get_alert_service(db).create_schedule_failed_alert( + schedule_id=action.id, action_type="cycle", unit_id=unit_id, + error_message=f"Meter did not resume measuring after the cycle + one retry (state={state!r}).", + project_id=action.project_id, location_id=action.location_id, + ) + except Exception as ae: + logger.warning(f"[CYCLE] restart-verify alert failed: {ae}") + else: + logger.warning(f"[CYCLE] Restart verification inconclusive for {unit_id} (DOD read failed); keepalive poll will re-confirm.") except Exception as e: logger.error(f"[CYCLE] Start failed: {e}")