From a5f8d1b2c77b630336c20a33063c15c6990fcdca Mon Sep 17 00:00:00 2001 From: serversdwn Date: Tue, 17 Feb 2026 02:41:09 +0000 Subject: [PATCH] Persistent polling interval increased. Healthcheck now uses poll instead of separate handshakes. --- app/main.py | 34 +++-- app/routers.py | 84 +++-------- app/services.py | 4 +- templates/roster.html | 329 ++++++++++++++++++++++++++++++++++++++---- 4 files changed, 352 insertions(+), 99 deletions(-) diff --git a/app/main.py b/app/main.py index 176de97..abf0879 100644 --- a/app/main.py +++ b/app/main.py @@ -92,10 +92,14 @@ async def health(): @app.get("/health/devices") async def health_devices(): - """Enhanced health check that tests device connectivity.""" + """Enhanced health check that tests device connectivity. + + Uses the connection pool to avoid unnecessary TCP handshakes — if a + cached connection exists and is alive, the device is reachable. + """ from sqlalchemy.orm import Session from app.database import SessionLocal - from app.services import NL43Client + from app.services import _connection_pool from app.models import NL43Config db: Session = SessionLocal() @@ -105,7 +109,7 @@ async def health_devices(): configs = db.query(NL43Config).filter_by(tcp_enabled=True).all() for cfg in configs: - client = NL43Client(cfg.host, cfg.tcp_port, timeout=2.0, ftp_username=cfg.ftp_username, ftp_password=cfg.ftp_password) + device_key = f"{cfg.host}:{cfg.tcp_port}" status = { "unit_id": cfg.unit_id, "host": cfg.host, @@ -115,14 +119,22 @@ async def health_devices(): } try: - # Try to connect (don't send command to avoid rate limiting issues) - import asyncio - reader, writer = await asyncio.wait_for( - asyncio.open_connection(cfg.host, cfg.tcp_port), timeout=2.0 - ) - writer.close() - await writer.wait_closed() - status["reachable"] = True + # Check if pool already has a live connection (zero-cost check) + pool_stats = _connection_pool.get_stats() + conn_info = pool_stats["connections"].get(device_key) + if conn_info and conn_info["alive"]: + status["reachable"] = True + status["source"] = "pool" + else: + # No cached connection — do a lightweight acquire/release + # This opens a connection if needed but keeps it in the pool + import asyncio + reader, writer, from_cache = await _connection_pool.acquire( + device_key, cfg.host, cfg.tcp_port, timeout=2.0 + ) + await _connection_pool.release(device_key, reader, writer, cfg.host, cfg.tcp_port) + status["reachable"] = True + status["source"] = "cached" if from_cache else "new" except Exception as e: status["error"] = str(type(e).__name__) logger.warning(f"Device {cfg.unit_id} health check failed: {e}") diff --git a/app/routers.py b/app/routers.py index 89d8ce7..a21c928 100644 --- a/app/routers.py +++ b/app/routers.py @@ -1755,74 +1755,38 @@ async def run_diagnostics(unit_id: str, db: Session = Depends(get_db)): "message": "TCP communication enabled" } - # Test 3: Modem/Router reachable (check port 443 HTTPS) + # Test 3: TCP connection reachable (device port) — uses connection pool + # This avoids extra TCP handshakes over cellular. If a cached connection + # exists and is alive, we skip the handshake entirely. + from app.services import _connection_pool + device_key = f"{cfg.host}:{cfg.tcp_port}" try: - reader, writer = await asyncio.wait_for( - asyncio.open_connection(cfg.host, 443), timeout=3.0 - ) - writer.close() - await writer.wait_closed() - diagnostics["tests"]["modem_reachable"] = { - "status": "pass", - "message": f"Modem/router reachable at {cfg.host}" - } - except asyncio.TimeoutError: - diagnostics["tests"]["modem_reachable"] = { - "status": "fail", - "message": f"Modem/router timeout at {cfg.host} (network issue)" - } - diagnostics["overall_status"] = "fail" - return diagnostics - except ConnectionRefusedError: - # Connection refused means host is up but port 443 closed - that's ok - diagnostics["tests"]["modem_reachable"] = { - "status": "pass", - "message": f"Modem/router reachable at {cfg.host} (HTTPS closed)" - } - except Exception as e: - diagnostics["tests"]["modem_reachable"] = { - "status": "fail", - "message": f"Cannot reach modem/router at {cfg.host}: {str(e)}" - } - diagnostics["overall_status"] = "fail" - return diagnostics - - # Test 4: TCP connection reachable (device port) - try: - reader, writer = await asyncio.wait_for( - asyncio.open_connection(cfg.host, cfg.tcp_port), timeout=3.0 - ) - writer.close() - await writer.wait_closed() - diagnostics["tests"]["tcp_connection"] = { - "status": "pass", - "message": f"TCP connection successful to {cfg.host}:{cfg.tcp_port}" - } - except asyncio.TimeoutError: - diagnostics["tests"]["tcp_connection"] = { - "status": "fail", - "message": f"Connection timeout to {cfg.host}:{cfg.tcp_port}" - } - diagnostics["overall_status"] = "fail" - return diagnostics - except ConnectionRefusedError: - diagnostics["tests"]["tcp_connection"] = { - "status": "fail", - "message": f"Connection refused by {cfg.host}:{cfg.tcp_port}" - } - diagnostics["overall_status"] = "fail" - return diagnostics + pool_stats = _connection_pool.get_stats() + conn_info = pool_stats["connections"].get(device_key) + if conn_info and conn_info["alive"]: + # Pool already has a live connection — device is reachable + diagnostics["tests"]["tcp_connection"] = { + "status": "pass", + "message": f"TCP connection alive in pool for {cfg.host}:{cfg.tcp_port}" + } + else: + # Acquire through the pool (opens new if needed, keeps it cached) + reader, writer, from_cache = await _connection_pool.acquire( + device_key, cfg.host, cfg.tcp_port, timeout=3.0 + ) + await _connection_pool.release(device_key, reader, writer, cfg.host, cfg.tcp_port) + diagnostics["tests"]["tcp_connection"] = { + "status": "pass", + "message": f"TCP connection successful to {cfg.host}:{cfg.tcp_port}" + } except Exception as e: diagnostics["tests"]["tcp_connection"] = { "status": "fail", - "message": f"Connection error: {str(e)}" + "message": f"Connection error to {cfg.host}:{cfg.tcp_port}: {str(e)}" } diagnostics["overall_status"] = "fail" return diagnostics - # Wait a bit after connection test to let device settle - await asyncio.sleep(1.5) - # Test 5: Device responds to commands # Use longer timeout to account for rate limiting (device requires ≥1s between commands) client = NL43Client(cfg.host, cfg.tcp_port, timeout=10.0, ftp_username=cfg.ftp_username, ftp_password=cfg.ftp_password) diff --git a/app/services.py b/app/services.py index d85cb4b..860bffa 100644 --- a/app/services.py +++ b/app/services.py @@ -242,8 +242,8 @@ async def _get_device_lock(device_key: str) -> asyncio.Lock: # Configuration via environment variables TCP_PERSISTENT_ENABLED = os.getenv("TCP_PERSISTENT_ENABLED", "true").lower() == "true" -TCP_IDLE_TTL = float(os.getenv("TCP_IDLE_TTL", "120")) # Close idle connections after N seconds -TCP_MAX_AGE = float(os.getenv("TCP_MAX_AGE", "300")) # Force reconnect after N seconds +TCP_IDLE_TTL = float(os.getenv("TCP_IDLE_TTL", "300")) # Close idle connections after N seconds +TCP_MAX_AGE = float(os.getenv("TCP_MAX_AGE", "1800")) # Force reconnect after N seconds TCP_KEEPALIVE_IDLE = int(os.getenv("TCP_KEEPALIVE_IDLE", "15")) # Seconds idle before probes TCP_KEEPALIVE_INTERVAL = int(os.getenv("TCP_KEEPALIVE_INTERVAL", "10")) # Seconds between probes TCP_KEEPALIVE_COUNT = int(os.getenv("TCP_KEEPALIVE_COUNT", "3")) # Failed probes before dead diff --git a/templates/roster.html b/templates/roster.html index 6c8d23d..d22d086 100644 --- a/templates/roster.html +++ b/templates/roster.html @@ -3,7 +3,7 @@ - SLMM Roster - Sound Level Meter Configuration + SLMM - Device Roster & Connections
-

📊 Sound Level Meter Roster

+

SLMM - Roster & Connections

-
- - - - - - - - - - - - - - - - - - - -
Unit IDHost / IPTCP PortFTP PortTCPFTPPollingStatusActions
- Loading... -
+
+ + +
+ + +
+
+ + + + + + + + + + + + + + + + + + + +
Unit IDHost / IPTCP PortFTP PortTCPFTPPollingStatusActions
+ Loading... +
+
+
+ + +
+
+
+ + +
+ +

Pool Configuration

+
+
+
Status
+
--
+
+
+ +

Active Connections

+
+
Loading...
+
+
@@ -619,6 +743,159 @@ closeModal(); } }); + + // ========== Tab Switching ========== + + function switchTab(tabName) { + document.querySelectorAll('.tab-btn').forEach(btn => btn.classList.remove('active')); + document.querySelectorAll('.tab-panel').forEach(panel => panel.classList.remove('active')); + + document.querySelector(`.tab-btn[onclick="switchTab('${tabName}')"]`).classList.add('active'); + document.getElementById(`tab-${tabName}`).classList.add('active'); + + if (tabName === 'connections') { + loadConnections(); + } + } + + // ========== Connection Pool ========== + + let connectionsRefreshTimer = null; + + async function loadConnections() { + try { + const res = await fetch('/api/nl43/_connections/status'); + const data = await res.json(); + + if (!res.ok) { + showToast('Failed to load connection pool status', 'error'); + return; + } + + const pool = data.pool; + renderPoolConfig(pool); + renderConnections(pool.connections); + + // Auto-refresh while tab is active + clearTimeout(connectionsRefreshTimer); + if (document.getElementById('tab-connections').classList.contains('active')) { + connectionsRefreshTimer = setTimeout(loadConnections, 5000); + } + } catch (err) { + showToast('Error loading connections: ' + err.message, 'error'); + console.error('Load connections error:', err); + } + } + + function renderPoolConfig(pool) { + document.getElementById('poolConfig').innerHTML = ` +
+
Persistent
+
${pool.enabled ? 'Enabled' : 'Disabled'}
+
+
+
Active
+
${pool.active_connections}
+
+
+
Idle TTL
+
${pool.idle_ttl}s
+
+
+
Max Age
+
${pool.max_age}s
+
+
+
KA Idle
+
${pool.keepalive_idle}s
+
+
+
KA Interval
+
${pool.keepalive_interval}s
+
+
+
KA Probes
+
${pool.keepalive_count}
+
+ `; + } + + function renderConnections(connections) { + const container = document.getElementById('connectionsList'); + const keys = Object.keys(connections); + + if (keys.length === 0) { + container.innerHTML = ` +
+
~
+
No active connections
+
+ Connections appear here when devices are actively being polled and the connection is cached between commands. +
+
+ `; + return; + } + + container.innerHTML = keys.map(key => { + const conn = connections[key]; + const aliveColor = conn.alive ? '#1a7f37' : '#cf222e'; + const aliveText = conn.alive ? 'Alive' : 'Stale'; + return ` +
+
+ ${escapeHtml(key)} + ${aliveText} +
+
+
+
Host
+
${escapeHtml(conn.host)}
+
+
+
Port
+
${conn.port}
+
+
+
Age
+
${formatSeconds(conn.age_seconds)}
+
+
+
Idle
+
${formatSeconds(conn.idle_seconds)}
+
+
+
+ `; + }).join(''); + } + + function formatSeconds(s) { + if (s < 60) return Math.round(s) + 's'; + if (s < 3600) return Math.floor(s / 60) + 'm ' + Math.round(s % 60) + 's'; + return Math.floor(s / 3600) + 'h ' + Math.floor((s % 3600) / 60) + 'm'; + } + + async function flushConnections() { + if (!confirm('Close all cached TCP connections?\n\nDevices will reconnect on the next poll cycle.')) { + return; + } + + try { + const res = await fetch('/api/nl43/_connections/flush', { method: 'POST' }); + const data = await res.json(); + + if (!res.ok) { + showToast(data.detail || 'Failed to flush connections', 'error'); + return; + } + + showToast('All connections flushed', 'success'); + await loadConnections(); + } catch (err) { + showToast('Error flushing connections: ' + err.message, 'error'); + } + }