From 9d34779171694270b302df18dd835f0192de2507 Mon Sep 17 00:00:00 2001
From: serversdown <brian@serversdown.net>
Date: Tue, 9 Jun 2026 18:52:13 +0000
Subject: [PATCH] perf: monitor caches run state, ~halving live-feed latency

Each monitor poll was sending DOD? + Measure? (two commands), and the NL43
enforces >=1s between commands, so updates were ~2.5s apart. The run state
changes rarely, so cache it and refresh via Measure? only every
MONITOR_STATE_REFRESH_S (default 30s); most polls now send just DOD? (one
rate-limited command) -> ~1.3s/update. Trade-off: a run-state change on the
device can take up to MONITOR_STATE_REFRESH_S to show up in the live feed.
Also trim the MONITOR_POLL_INTERVAL default from 1.0s to 0.25s, since the
device rate-limit is the real pacer.

request_dod() gains an optional measurement_state arg: when supplied it
reuses that state and skips the Measure? round-trip; None preserves the old
query-every-time behavior.

~1Hz is the device floor for DOD (the >=1s command spacing); DRD's 10Hz
push isn't reachable via polling, but ~1s is a normal cadence for SLM levels.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 app/monitor.py  | 26 ++++++++++++++++++++++----
 app/services.py | 21 +++++++++++++--------
 2 files changed, 35 insertions(+), 12 deletions(-)

diff --git a/app/monitor.py b/app/monitor.py
index 47c21be..5f7c8b3 100644
--- a/app/monitor.py
+++ b/app/monitor.py
@@ -27,9 +27,14 @@ from app.alerts import alert_evaluator
 
 logger = logging.getLogger(__name__)
 
-# Sleep between DOD polls. Note the 1s device rate-limit (and DOD?+Measure? per
-# poll) already paces the effective rate to a few seconds; this is the extra idle.
-MONITOR_POLL_INTERVAL = float(os.getenv("MONITOR_POLL_INTERVAL", "1.0"))
+# Extra idle between DOD polls. The 1s device rate-limit already paces consecutive
+# DOD? commands, so this just needs to be small — the rate-limit is the real floor.
+MONITOR_POLL_INTERVAL = float(os.getenv("MONITOR_POLL_INTERVAL", "0.25"))
+
+# Seconds between run-state refreshes (Measure?). The state changes rarely, so it
+# is cached and the second rate-limited command is skipped on most polls (~2.5s ->
+# ~1.3s per update). Trade-off: a state change may show up to this many seconds late.
+MONITOR_STATE_REFRESH_S = float(os.getenv("MONITOR_STATE_REFRESH_S", "30"))
 
 # If nothing has been broadcast in this many seconds (e.g. device offline and
 # silent), send a keepalive frame so reverse proxies don't drop the idle WS.
@@ -70,6 +75,8 @@ class DeviceMonitor:
         self._last_payload: Optional[dict] = None  # replayed to new subscribers
         self._consec_fail = 0
         self._reachable = True  # last broadcast reachability (for transition frames)
+        self._cached_state: Optional[str] = None  # run state, refreshed periodically
+        self._last_state_refresh = 0.0
 
     @property
     def running(self) -> bool:
@@ -168,7 +175,18 @@ class DeviceMonitor:
                 ftp_username=cfg.ftp_username, ftp_password=cfg.ftp_password,
                 ftp_port=cfg.ftp_port or 21,
             )
-            snap = await client.request_dod()
+            # Refresh the run state only every MONITOR_STATE_REFRESH_S; otherwise reuse
+            # the cached state so most polls send just DOD? (one rate-limited command).
+            # NOTE(review): a failed Measure? falls back to "Measure" and is cached too.
+            now = asyncio.get_running_loop().time()
+            refresh_state = (self._cached_state is None
+                             or now - self._last_state_refresh >= MONITOR_STATE_REFRESH_S)
+            snap = await client.request_dod(
+                measurement_state=None if refresh_state else self._cached_state
+            )
+            if refresh_state:
+                self._cached_state = snap.measurement_state
+                self._last_state_refresh = now
             snap.unit_id = self.unit_id
             persist_snapshot(snap, db)
             db.commit()
diff --git a/app/services.py b/app/services.py
index b5c68ea..c608226 100644
--- a/app/services.py
+++ b/app/services.py
@@ -680,10 +680,12 @@ class NL43Client:
         else:
             raise ValueError(f"Unknown result code: {result_code}")
 
-    async def request_dod(self) -> NL43Snapshot:
+    async def request_dod(self, measurement_state: Optional[str] = None) -> NL43Snapshot:
         """Request DOD (Data Output Display) snapshot from device.
 
-        Returns parsed measurement data from the device display.
+        Returns parsed measurement data from the device display. Pass
+        measurement_state to reuse a cached run state and skip the extra Measure?
+        round-trip (the state changes rarely); leave it None to query it.
         """
         # _send_command now handles result code validation and returns the data line
         resp = await self._send_command("DOD?\r\n")
@@ -706,12 +708,15 @@ class NL43Client:
 
         logger.info(f"Parsed {len(parts)} data points from DOD response")
 
-        # Query actual measurement state (DOD doesn't include this information)
-        try:
-            measurement_state = await self.get_measurement_state()
-        except Exception as e:
-            logger.warning(f"Failed to get measurement state, defaulting to 'Measure': {e}")
-            measurement_state = "Measure"
+        # DOD doesn't include the run state. Query it only when not supplied by the
+        # caller — the monitor passes a cached state most cycles and refreshes it
+        # occasionally, avoiding a second rate-limited command on every poll.
+        if measurement_state is None:
+            try:
+                measurement_state = await self.get_measurement_state()
+            except Exception as e:
+                logger.warning(f"Failed to get measurement state, defaulting to 'Measure': {e}")
+                measurement_state = "Measure"
 
         snap = NL43Snapshot(unit_id="", raw_payload=resp, measurement_state=measurement_state)