Files
terra-view/backend/routers/slm_dashboard.py
T
serversdown c049ac8a41 fix: SLM control no longer shows false "Unknown error" on start
Starting a measurement could pop "Error: Unknown error" in the browser
even though the device started recording fine. Two causes: the proxy's
10s timeout was shorter than a real device start over cellular, and on
an httpx timeout str(e) is empty, so the relayed detail was "" -> the
frontend's `result.detail || 'Unknown error'` rendered "Unknown error".

- Raise the control proxy timeout to 30s so a healthy start isn't cut off.
- Surface SLMM's own error detail on non-200 responses.
- Add an explicit, honest timeout message.
- Never return an empty detail (which rendered as "Unknown error").

Pairs with the SLMM-side fix that makes /start confirm promptly.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-21 20:22:52 +00:00

386 lines
14 KiB
Python

"""
SLM Dashboard Router
Provides API endpoints for the Sound Level Meters dashboard page.
"""
from fastapi import APIRouter, Request, Depends, Query
from fastapi.responses import HTMLResponse
from sqlalchemy.orm import Session
from sqlalchemy import func
from datetime import datetime, timedelta
import asyncio
import httpx
import logging
import os
from backend.database import get_db
from backend.models import RosterUnit
from backend.routers.roster_edit import sync_slm_to_slmm_cache
from backend.templates_config import templates
logger = logging.getLogger(__name__)
router = APIRouter(prefix="/api/slm-dashboard", tags=["slm-dashboard"])
# SLMM backend URL - configurable via environment variable
SLMM_BASE_URL = os.getenv("SLMM_BASE_URL", "http://localhost:8100")
@router.get("/stats", response_class=HTMLResponse)
async def get_slm_stats(request: Request, db: Session = Depends(get_db)):
"""
Get summary statistics for SLM dashboard.
Returns HTML partial with stat cards.
"""
# Query all SLMs
all_slms = db.query(RosterUnit).filter_by(device_type="slm").all()
# Count deployed vs benched
deployed_count = sum(1 for slm in all_slms if slm.deployed and not slm.retired)
benched_count = sum(1 for slm in all_slms if not slm.deployed and not slm.retired)
retired_count = sum(1 for slm in all_slms if slm.retired)
# Count recently active (checked in last hour)
one_hour_ago = datetime.utcnow() - timedelta(hours=1)
active_count = sum(1 for slm in all_slms
if slm.slm_last_check and slm.slm_last_check > one_hour_ago)
return templates.TemplateResponse("partials/slm_stats.html", {
"request": request,
"total_count": len(all_slms),
"deployed_count": deployed_count,
"benched_count": benched_count,
"active_count": active_count,
"retired_count": retired_count
})
@router.get("/units", response_class=HTMLResponse)
async def get_slm_units(
request: Request,
db: Session = Depends(get_db),
search: str = Query(None),
project: str = Query(None),
include_measurement: bool = Query(False),
):
"""
Get list of SLM units for the sidebar.
Returns HTML partial with unit cards.
"""
query = db.query(RosterUnit).filter_by(device_type="slm")
# Filter by project if provided
if project:
query = query.filter(RosterUnit.project_id == project)
# Filter by search term if provided
if search:
search_term = f"%{search}%"
query = query.filter(
(RosterUnit.id.like(search_term)) |
(RosterUnit.slm_model.like(search_term)) |
(RosterUnit.address.like(search_term))
)
units = query.order_by(
RosterUnit.retired.asc(),
RosterUnit.deployed.desc(),
RosterUnit.id.asc()
).all()
one_hour_ago = datetime.utcnow() - timedelta(hours=1)
for unit in units:
# Legacy default from the roster field; refined from SLMM's cached status below.
unit.is_recent = bool(unit.slm_last_check and unit.slm_last_check > one_hour_ago)
unit.measurement_state = None
unit.cache_last_seen = None # SLMM cache last_seen (real monitoring freshness)
if include_measurement:
# SLMM's /roster carries each unit's CACHED status (last_seen,
# measurement_state) from NL43Status — a DB read on SLMM's side, NOT a device
# call. The live monitor refreshes that cache ~every 1.3s, so this reflects
# real monitoring without sending Measure? to the device (which the old
# /measurement-state did) and competing with DOD polling. One call covers all.
slmm_status = {}
try:
async with httpx.AsyncClient(timeout=3.0) as client:
r = await client.get(f"{SLMM_BASE_URL}/api/nl43/roster")
if r.status_code == 200:
for dev in (r.json().get("devices") or []):
slmm_status[dev.get("unit_id")] = dev.get("status") or {}
except Exception:
slmm_status = {}
# "Recent" = the monitor has a fresh successful read. last_seen only advances
# on a successful poll, so staleness == the device isn't being reached.
recent_cutoff = datetime.utcnow() - timedelta(minutes=5)
for unit in units:
st = slmm_status.get(unit.id)
if not st:
continue
unit.measurement_state = st.get("measurement_state")
last_seen = st.get("last_seen")
if last_seen:
try:
ls = datetime.fromisoformat(last_seen.replace("Z", ""))
unit.is_recent = ls > recent_cutoff
unit.cache_last_seen = ls # the real freshness the monitor updates
except Exception:
pass
return templates.TemplateResponse("partials/slm_device_list.html", {
"request": request,
"units": units
})
@router.get("/live-view/{unit_id}", response_class=HTMLResponse)
async def get_live_view(request: Request, unit_id: str, db: Session = Depends(get_db)):
"""
Get live view panel for a specific SLM unit.
Returns HTML partial with live metrics and chart.
"""
# Get unit from database
unit = db.query(RosterUnit).filter_by(id=unit_id, device_type="slm").first()
if not unit:
return templates.TemplateResponse("partials/slm_live_view_error.html", {
"request": request,
"error": f"Unit {unit_id} not found"
})
# Get modem information if assigned
modem = None
modem_ip = None
if unit.deployed_with_modem_id:
modem = db.query(RosterUnit).filter_by(id=unit.deployed_with_modem_id, device_type="modem").first()
if modem:
modem_ip = modem.ip_address
else:
logger.warning(f"SLM {unit_id} is assigned to modem {unit.deployed_with_modem_id} but modem not found")
# Fallback to direct slm_host if no modem assigned (backward compatibility)
if not modem_ip and unit.slm_host:
modem_ip = unit.slm_host
logger.info(f"Using legacy slm_host for {unit_id}: {modem_ip}")
# Try to get current status from SLMM
current_status = None
measurement_state = None
is_measuring = False
try:
# Read SLMM's CACHED status (NL43Status) — no device call. The live monitor
# keeps it fresh (~1.3s) and the live-stream WS provides ongoing updates, so we
# no longer fire Measure? + a fresh DOD read at the device on every command-
# center load (which competed with DOD polling for the single connection).
async with httpx.AsyncClient(timeout=5.0) as client:
r = await client.get(f"{SLMM_BASE_URL}/api/nl43/{unit_id}/status")
if r.status_code == 200:
current_status = r.json().get("data", {})
measurement_state = current_status.get("measurement_state")
is_measuring = measurement_state in ("Start", "Measure")
except Exception as e:
logger.error(f"Failed to get cached status for {unit_id}: {e}")
return templates.TemplateResponse("partials/slm_live_view.html", {
"request": request,
"unit": unit,
"modem": modem,
"modem_ip": modem_ip,
"current_status": current_status,
"measurement_state": measurement_state,
"is_measuring": is_measuring
})
@router.post("/control/{unit_id}/{action}")
async def control_slm(unit_id: str, action: str):
"""
Send control commands to SLM (start, stop, pause, resume, reset).
Proxies to SLMM backend.
"""
valid_actions = ["start", "stop", "pause", "resume", "reset"]
if action not in valid_actions:
return {"status": "error", "detail": f"Invalid action. Must be one of: {valid_actions}"}
try:
# 30s: a real device start confirms over cellular in a few seconds, but
# leave headroom so a healthy start is never cut off mid-flight (which
# surfaced to users as a misleading "Unknown error").
async with httpx.AsyncClient(timeout=30.0) as client:
response = await client.post(
f"{SLMM_BASE_URL}/api/nl43/{unit_id}/{action}"
)
if response.status_code == 200:
return response.json()
# Surface SLMM's own error detail when it provides one.
detail = f"SLMM returned status {response.status_code}"
try:
body = response.json()
if isinstance(body, dict) and body.get("detail"):
detail = body["detail"]
except Exception:
pass
return {"status": "error", "detail": detail}
except httpx.TimeoutException:
logger.error(f"Timeout controlling {unit_id} (action={action}) via SLMM")
return {
"status": "error",
"detail": (
f"Timed out waiting for the device to {action}. "
f"The command may still have been applied — refresh to confirm."
),
}
except Exception as e:
logger.error(f"Failed to control {unit_id}: {e}")
# Never return an empty detail — it renders to users as "Unknown error".
return {"status": "error", "detail": str(e) or f"{type(e).__name__}"}
@router.get("/config/{unit_id}", response_class=HTMLResponse)
async def get_slm_config(request: Request, unit_id: str, db: Session = Depends(get_db)):
"""
Get configuration form for a specific SLM unit.
Returns HTML partial with configuration form.
"""
unit = db.query(RosterUnit).filter_by(id=unit_id, device_type="slm").first()
if not unit:
return HTMLResponse(
content=f'<div class="text-red-500">Unit {unit_id} not found</div>',
status_code=404
)
return templates.TemplateResponse("partials/slm_config_form.html", {
"request": request,
"unit": unit
})
@router.post("/config/{unit_id}")
async def save_slm_config(request: Request, unit_id: str, db: Session = Depends(get_db)):
"""
Save SLM configuration.
Updates unit parameters in the database.
"""
unit = db.query(RosterUnit).filter_by(id=unit_id, device_type="slm").first()
if not unit:
return {"status": "error", "detail": f"Unit {unit_id} not found"}
try:
# Get form data
form_data = await request.form()
# Update SLM-specific fields
unit.slm_model = form_data.get("slm_model") or None
unit.slm_serial_number = form_data.get("slm_serial_number") or None
unit.slm_frequency_weighting = form_data.get("slm_frequency_weighting") or None
unit.slm_time_weighting = form_data.get("slm_time_weighting") or None
unit.slm_measurement_range = form_data.get("slm_measurement_range") or None
# Update network configuration
modem_id = form_data.get("deployed_with_modem_id")
unit.deployed_with_modem_id = modem_id if modem_id else None
# Always update TCP and FTP ports (used regardless of modem assignment)
unit.slm_tcp_port = int(form_data.get("slm_tcp_port")) if form_data.get("slm_tcp_port") else None
unit.slm_ftp_port = int(form_data.get("slm_ftp_port")) if form_data.get("slm_ftp_port") else None
# Only update direct IP if no modem is assigned
if not modem_id:
unit.slm_host = form_data.get("slm_host") or None
else:
# Clear legacy direct IP field when modem is assigned
unit.slm_host = None
db.commit()
logger.info(f"Updated configuration for SLM {unit_id}")
# Sync updated configuration to SLMM cache
logger.info(f"Syncing SLM {unit_id} config changes to SLMM cache...")
result = await sync_slm_to_slmm_cache(
unit_id=unit_id,
host=unit.slm_host, # Use the updated host from Terra-View
tcp_port=unit.slm_tcp_port,
ftp_port=unit.slm_ftp_port,
deployed_with_modem_id=unit.deployed_with_modem_id, # Resolve modem IP if assigned
db=db
)
if not result["success"]:
logger.warning(f"SLMM cache sync warning for {unit_id}: {result['message']}")
# Config still saved in Terra-View (source of truth)
return {"status": "success", "unit_id": unit_id}
except Exception as e:
db.rollback()
logger.error(f"Failed to save config for {unit_id}: {e}")
return {"status": "error", "detail": str(e)}
@router.get("/test-modem/{modem_id}")
async def test_modem_connection(modem_id: str, db: Session = Depends(get_db)):
"""
Test modem connectivity with a simple ping/health check.
Returns response time and connection status.
"""
import subprocess
import time
# Get modem from database
modem = db.query(RosterUnit).filter_by(id=modem_id, device_type="modem").first()
if not modem:
return {"status": "error", "detail": f"Modem {modem_id} not found"}
if not modem.ip_address:
return {"status": "error", "detail": f"Modem {modem_id} has no IP address configured"}
try:
# Ping the modem (1 packet, 2 second timeout)
start_time = time.time()
result = subprocess.run(
["ping", "-c", "1", "-W", "2", modem.ip_address],
capture_output=True,
text=True,
timeout=3
)
response_time = int((time.time() - start_time) * 1000) # Convert to milliseconds
if result.returncode == 0:
return {
"status": "success",
"modem_id": modem_id,
"ip_address": modem.ip_address,
"response_time": response_time,
"message": "Modem is responding to ping"
}
else:
return {
"status": "error",
"modem_id": modem_id,
"ip_address": modem.ip_address,
"detail": "Modem not responding to ping"
}
except subprocess.TimeoutExpired:
return {
"status": "error",
"modem_id": modem_id,
"ip_address": modem.ip_address,
"detail": "Ping timeout (> 2 seconds)"
}
except Exception as e:
logger.error(f"Failed to ping modem {modem_id}: {e}")
return {
"status": "error",
"modem_id": modem_id,
"detail": str(e)
}