feat(reports): one safe restart-retry in the cycle before alerting
If the post-restart DOD check shows the meter isn't measuring, retry once with start_recording (a plain start that does NOT re-index, unlike start_cycle) and re-verify before raising the schedule-failed alert.

The retry fires only on a confident not-measuring reading — never on a failed/inconclusive DOD read — so a flaky read can't disrupt an already-running measurement or split the night across two store folders. This turns a transient restart hiccup into a self-heal instead of leaving the meter stopped overnight.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -680,31 +680,52 @@ class SchedulerService:
|
|||||||
logger.info(f"[CYCLE] New measurement started, session {new_session.id}")
|
logger.info(f"[CYCLE] New measurement started, session {new_session.id}")
|
||||||
|
|
||||||
# Step 6b: Verify the meter actually resumed measuring (fresh DOD).
|
# Step 6b: Verify the meter actually resumed measuring (fresh DOD).
|
||||||
# Polling is still paused here, so query directly. Advisory: a
|
# Polling is still paused here, so query directly. If it didn't
|
||||||
# failure alerts loudly but doesn't fail the cycle (DOD reads can
|
# resume, retry ONCE with a plain start (start_recording — does NOT
|
||||||
# be transiently flaky); the keepalive poll re-confirms within ~10s.
|
# re-index, unlike start_cycle) before alerting: a meter left
|
||||||
|
# stopped overnight is the costly failure, and a transient restart
|
||||||
|
# hiccup is common on the NL-43. We retry only on a *confident*
|
||||||
|
# not-measuring reading — never on a failed/inconclusive DOD read —
|
||||||
|
# so a flaky read can't disrupt an already-running measurement.
|
||||||
if action.device_type == "slm":
|
if action.device_type == "slm":
|
||||||
try:
|
async def _check_measuring():
|
||||||
await asyncio.sleep(2)
|
"""Return (measuring, state); measuring is None if the DOD read failed."""
|
||||||
live = await self.device_controller.get_live_data(unit_id, action.device_type)
|
try:
|
||||||
state = ((live or {}).get("measurement_state")
|
await asyncio.sleep(2)
|
||||||
or ((live or {}).get("data") or {}).get("measurement_state") or "")
|
live = await self.device_controller.get_live_data(unit_id, action.device_type)
|
||||||
measuring = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running")
|
state = ((live or {}).get("measurement_state")
|
||||||
result["steps"]["restart_verified"] = measuring
|
or ((live or {}).get("data") or {}).get("measurement_state") or "")
|
||||||
if measuring:
|
ok = str(state).strip().lower() in ("start", "measure", "measuring", "run", "running")
|
||||||
logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).")
|
return ok, state
|
||||||
else:
|
except Exception as e:
|
||||||
logger.error(f"[CYCLE] Restart NOT verified for {unit_id} — state={state!r}")
|
logger.warning(f"[CYCLE] Restart-verify DOD read failed: {e}")
|
||||||
try:
|
return None, None
|
||||||
get_alert_service(db).create_schedule_failed_alert(
|
|
||||||
schedule_id=action.id, action_type="cycle", unit_id=unit_id,
|
measuring, state = await _check_measuring()
|
||||||
error_message=f"Meter did not resume measuring after the cycle (state={state!r}).",
|
if measuring is False:
|
||||||
project_id=action.project_id, location_id=action.location_id,
|
logger.warning(f"[CYCLE] {unit_id} not measuring after restart (state={state!r}) — retrying start once.")
|
||||||
)
|
result["steps"]["restart_retry"] = True
|
||||||
except Exception as ae:
|
try:
|
||||||
logger.warning(f"[CYCLE] restart-verify alert failed: {ae}")
|
await self.device_controller.start_recording(unit_id, action.device_type)
|
||||||
except Exception as e:
|
measuring, state = await _check_measuring()
|
||||||
logger.warning(f"[CYCLE] Restart verification skipped (DOD read failed): {e}")
|
except Exception as e:
|
||||||
|
logger.error(f"[CYCLE] Restart retry (start_recording) failed for {unit_id}: {e}")
|
||||||
|
|
||||||
|
result["steps"]["restart_verified"] = measuring
|
||||||
|
if measuring:
|
||||||
|
logger.info(f"[CYCLE] Restart verified — {unit_id} is measuring (state={state}).")
|
||||||
|
elif measuring is False:
|
||||||
|
logger.error(f"[CYCLE] Restart NOT verified for {unit_id} after retry — state={state!r}")
|
||||||
|
try:
|
||||||
|
get_alert_service(db).create_schedule_failed_alert(
|
||||||
|
schedule_id=action.id, action_type="cycle", unit_id=unit_id,
|
||||||
|
error_message=f"Meter did not resume measuring after the cycle + one retry (state={state!r}).",
|
||||||
|
project_id=action.project_id, location_id=action.location_id,
|
||||||
|
)
|
||||||
|
except Exception as ae:
|
||||||
|
logger.warning(f"[CYCLE] restart-verify alert failed: {ae}")
|
||||||
|
else:
|
||||||
|
logger.warning(f"[CYCLE] Restart verification inconclusive for {unit_id} (DOD read failed); keepalive poll will re-confirm.")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"[CYCLE] Start failed: {e}")
|
logger.error(f"[CYCLE] Start failed: {e}")
|
||||||
|
|||||||
Reference in New Issue
Block a user