feat(reports): one safe restart-retry in the cycle before alerting
If the post-restart DOD check shows the meter isn't measuring, retry once with start_recording (a plain start that does NOT re-index, unlike start_cycle) and re-verify before raising the schedule-failed alert. Retry fires only on a confident not-measuring reading — never on a failed/inconclusive DOD read — so a flaky read can't disrupt an already-running measurement or split the night across two store folders. Turns a transient restart hiccup into a self-heal instead of a meter left stopped overnight. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -680,31 +680,52 @@ class SchedulerService:
|
||||
logger.info(f"[CYCLE] New measurement started, session {new_session.id}")
|
||||
|
||||
# Step 6b: Verify the meter actually resumed measuring (fresh DOD).
|
||||
# Polling is still paused here, so query directly. Advisory: a
|
||||
# failure alerts loudly but doesn't fail the cycle (DOD reads can
|
||||
# be transiently flaky); the keepalive poll re-confirms within ~10s.
|
||||
# Polling is still paused here, so query directly. If it didn't
|
||||
# resume, retry ONCE with a plain start (start_recording — does NOT
|
||||
# re-index, unlike start_cycle) before alerting: a meter left
|
||||
# stopped overnight is the costly failure, and a transient restart
|
||||
# hiccup is common on the NL-43. We retry only on a *confident*
|
||||
# not-measuring reading — never on a failed/inconclusive DOD read —
|
||||
# so a flaky read can't disrupt an already-running measurement.
|
||||
if action.device_type == "slm":
|
||||
try:
|
||||
await asyncio.sleep(2)
|
||||
live = await self.device_controller.get_live_data(unit_id, action.device_type)
|
||||
state = ((live or {}).get("measurement_state")
|
||||
or ((live or {}).get("data") or {}).get("measurement_state") or "")
|
||||
measuring = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running")
|
||||
result["steps"]["restart_verified"] = measuring
|
||||
if measuring:
|
||||
logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).")
|
||||
else:
|
||||
logger.error(f"[CYCLE] Restart NOT verified for {unit_id} — state={state!r}")
|
||||
try:
|
||||
get_alert_service(db).create_schedule_failed_alert(
|
||||
schedule_id=action.id, action_type="cycle", unit_id=unit_id,
|
||||
error_message=f"Meter did not resume measuring after the cycle (state={state!r}).",
|
||||
project_id=action.project_id, location_id=action.location_id,
|
||||
)
|
||||
except Exception as ae:
|
||||
logger.warning(f"[CYCLE] restart-verify alert failed: {ae}")
|
||||
except Exception as e:
|
||||
logger.warning(f"[CYCLE] Restart verification skipped (DOD read failed): {e}")
|
||||
async def _check_measuring():
|
||||
"""Return (measuring, state); measuring is None if the DOD read failed."""
|
||||
try:
|
||||
await asyncio.sleep(2)
|
||||
live = await self.device_controller.get_live_data(unit_id, action.device_type)
|
||||
state = ((live or {}).get("measurement_state")
|
||||
or ((live or {}).get("data") or {}).get("measurement_state") or "")
|
||||
ok = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running")
|
||||
return ok, state
|
||||
except Exception as e:
|
||||
logger.warning(f"[CYCLE] Restart-verify DOD read failed: {e}")
|
||||
return None, None
|
||||
|
||||
measuring, state = await _check_measuring()
|
||||
if measuring is False:
|
||||
logger.warning(f"[CYCLE] {unit_id} not measuring after restart (state={state!r}) — retrying start once.")
|
||||
result["steps"]["restart_retry"] = True
|
||||
try:
|
||||
await self.device_controller.start_recording(unit_id, action.device_type)
|
||||
measuring, state = await _check_measuring()
|
||||
except Exception as e:
|
||||
logger.error(f"[CYCLE] Restart retry (start_recording) failed for {unit_id}: {e}")
|
||||
|
||||
result["steps"]["restart_verified"] = measuring
|
||||
if measuring:
|
||||
logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).")
|
||||
elif measuring is False:
|
||||
logger.error(f"[CYCLE] Restart NOT verified for {unit_id} after retry — state={state!r}")
|
||||
try:
|
||||
get_alert_service(db).create_schedule_failed_alert(
|
||||
schedule_id=action.id, action_type="cycle", unit_id=unit_id,
|
||||
error_message=f"Meter did not resume measuring after the cycle + one retry (state={state!r}).",
|
||||
project_id=action.project_id, location_id=action.location_id,
|
||||
)
|
||||
except Exception as ae:
|
||||
logger.warning(f"[CYCLE] restart-verify alert failed: {ae}")
|
||||
else:
|
||||
logger.warning(f"[CYCLE] Restart verification inconclusive for {unit_id} (DOD read failed); keepalive poll will re-confirm.")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"[CYCLE] Start failed: {e}")
|
||||
|
||||
Reference in New Issue
Block a user