Handle cold-boot timeout for TCP connections

- bridges/tcp_serial_bridge.py: increase default boot_delay 2s → 8s to
  cover MiniMate Plus cold-start time (unit wakes from RS-232 line
  assertion but takes 5-10s to be ready for POLL_PROBE).
- sfm/server.py: add _run_with_retry() — on TCP connections only, retries
  once on ProtocolError. Serial timeouts are not retried (usually a real
  fault). Confirmed behaviour: unit wakes purely from RS-232 line voltage,
  no software wake-up frame needed.
This commit is contained in:
Brian Harrison
2026-03-31 12:02:52 -04:00
parent da446cb2e3
commit de02f9cccf
2 changed files with 42 additions and 10 deletions

View File

@@ -55,8 +55,10 @@ DEFAULT_TCP_PORT = 12345
CHUNK = 256 # bytes per read call CHUNK = 256 # bytes per read call
SERIAL_TIMEOUT = 0.02 # serial read timeout (s) — non-blocking in practice SERIAL_TIMEOUT = 0.02 # serial read timeout (s) — non-blocking in practice
TCP_TIMEOUT = 0.02 # socket recv timeout (s) TCP_TIMEOUT = 0.02 # socket recv timeout (s)
BOOT_DELAY = 2.0 # seconds to wait after opening serial port before BOOT_DELAY = 8.0 # seconds to wait after opening serial port before
# forwarding data — mirrors the unit's startup beep # forwarding data — unit cold-boot (beep + OS init)
# takes 5-10s from first RS-232 line assertion.
# Set to 0 if unit was already running before connect.
# ── Bridge session ───────────────────────────────────────────────────────────── # ── Bridge session ─────────────────────────────────────────────────────────────

View File

@@ -170,6 +170,31 @@ def _build_client(
) )
def _is_tcp(host: Optional[str]) -> bool:
return bool(host)
def _run_with_retry(fn, *, is_tcp: bool):
"""
Call fn() and, for TCP connections only, retry once on ProtocolError.
Rationale: when a MiniMate Plus is cold (just had its serial lines asserted
by the modem or a local bridge), it takes 5-10 seconds to boot before it
will respond to POLL_PROBE. The first request may time out during that boot
window; a single automatic retry is enough to recover once the unit is up.
Serial connections are NOT retried — a timeout there usually means a real
problem (wrong port, wrong baud, cable unplugged).
"""
try:
return fn()
except ProtocolError as exc:
if not is_tcp:
raise
log.info("TCP poll timed out (unit may have been cold) — retrying once")
return fn() # let any second failure propagate normally
# ── Endpoints ────────────────────────────────────────────────────────────────── # ── Endpoints ──────────────────────────────────────────────────────────────────
@app.get("/health") @app.get("/health")
@@ -195,8 +220,10 @@ def device_info(
log.info("GET /device/info port=%s host=%s tcp_port=%d", port, host, tcp_port) log.info("GET /device/info port=%s host=%s tcp_port=%d", port, host, tcp_port)
try: try:
with _build_client(port, baud, host, tcp_port) as client: def _do():
info = client.connect() with _build_client(port, baud, host, tcp_port) as client:
return client.connect()
info = _run_with_retry(_do, is_tcp=_is_tcp(host))
except HTTPException: except HTTPException:
raise raise
except ProtocolError as exc: except ProtocolError as exc:
@@ -242,9 +269,10 @@ def device_events(
log.info("GET /device/events port=%s host=%s", port, host) log.info("GET /device/events port=%s host=%s", port, host)
try: try:
with _build_client(port, baud, host, tcp_port) as client: def _do():
info = client.connect() with _build_client(port, baud, host, tcp_port) as client:
events = client.get_events() return client.connect(), client.get_events()
info, events = _run_with_retry(_do, is_tcp=_is_tcp(host))
except HTTPException: except HTTPException:
raise raise
except ProtocolError as exc: except ProtocolError as exc:
@@ -278,9 +306,11 @@ def device_event(
log.info("GET /device/event/%d port=%s host=%s", index, port, host) log.info("GET /device/event/%d port=%s host=%s", index, port, host)
try: try:
with _build_client(port, baud, host, tcp_port) as client: def _do():
client.connect() with _build_client(port, baud, host, tcp_port) as client:
events = client.get_events() client.connect()
return client.get_events()
events = _run_with_retry(_do, is_tcp=_is_tcp(host))
except HTTPException: except HTTPException:
raise raise
except ProtocolError as exc: except ProtocolError as exc: