- DB cache dump on diagnostics request.
- Individual device logs (DB and files).
- Device logs API endpoints and diagnostics UI.

Fix:
- SLMM standalone now uses the local TZ (was UTC-only before).
- Fixed measurement start time logic.
This commit is contained in:
serversdwn
2026-01-29 18:50:47 +00:00
parent eb39a9d1d0
commit bf5f222511
8 changed files with 877 additions and 1 deletion


@@ -15,7 +15,8 @@ from sqlalchemy.orm import Session
from app.database import SessionLocal
from app.models import NL43Config, NL43Status
from app.services import NL43Client, persist_snapshot
from app.services import NL43Client, persist_snapshot, sync_measurement_start_time_from_ftp
from app.device_logger import log_device_event, cleanup_old_logs
logger = logging.getLogger(__name__)
@@ -36,6 +37,7 @@ class BackgroundPoller:
self._task: Optional[asyncio.Task] = None
self._running = False
self._logger = logger
self._last_cleanup = None # Track last log cleanup time
async def start(self):
"""Start the background polling task."""
@@ -78,6 +80,15 @@ class BackgroundPoller:
except Exception as e:
self._logger.error(f"Error in poll loop: {e}", exc_info=True)
# Run log cleanup once per hour
try:
now = datetime.utcnow()
if self._last_cleanup is None or (now - self._last_cleanup).total_seconds() > 3600:
cleanup_old_logs()
self._last_cleanup = now
except Exception as e:
self._logger.warning(f"Log cleanup failed: {e}")
# Calculate dynamic sleep interval
sleep_time = self._calculate_sleep_interval()
self._logger.debug(f"Sleeping for {sleep_time} seconds until next poll cycle")
@@ -205,6 +216,71 @@ class BackgroundPoller:
db.commit()
self._logger.info(f"✓ Successfully polled {unit_id}")
# Log to device log
log_device_event(
unit_id, "INFO", "POLL",
f"Poll success: state={snap.measurement_state}, Leq={snap.leq}, Lp={snap.lp}",
db
)
# Check if device is measuring but has no start time recorded
# This happens if measurement was started before SLMM began polling
# or after a service restart
status = db.query(NL43Status).filter_by(unit_id=unit_id).first()
# Reset the sync flag when measurement stops (so next measurement can sync)
if status and status.measurement_state != "Start":
if status.start_time_sync_attempted:
status.start_time_sync_attempted = False
db.commit()
self._logger.debug(f"Reset FTP sync flag for {unit_id} (measurement stopped)")
log_device_event(unit_id, "DEBUG", "STATE", "Measurement stopped, reset FTP sync flag", db)
# Attempt FTP sync if:
# - Device is measuring
# - No start time recorded
# - FTP sync not already attempted for this measurement
# - FTP is configured
if (status and
status.measurement_state == "Start" and
status.measurement_start_time is None and
not status.start_time_sync_attempted and
cfg.ftp_enabled and
cfg.ftp_username and
cfg.ftp_password):
self._logger.info(
f"Device {unit_id} is measuring but has no start time - "
f"attempting FTP sync"
)
log_device_event(unit_id, "INFO", "SYNC", "Attempting FTP sync for measurement start time", db)
# Mark that we attempted sync (prevents repeated attempts on failure)
status.start_time_sync_attempted = True
db.commit()
try:
synced = await sync_measurement_start_time_from_ftp(
unit_id=unit_id,
host=cfg.host,
tcp_port=cfg.tcp_port,
ftp_port=cfg.ftp_port or 21,
ftp_username=cfg.ftp_username,
ftp_password=cfg.ftp_password,
db=db
)
if synced:
self._logger.info(f"✓ FTP sync succeeded for {unit_id}")
log_device_event(unit_id, "INFO", "SYNC", "FTP sync succeeded - measurement start time updated", db)
else:
self._logger.warning(f"FTP sync returned False for {unit_id}")
log_device_event(unit_id, "WARNING", "SYNC", "FTP sync returned False", db)
except Exception as sync_err:
self._logger.warning(
f"FTP sync failed for {unit_id}: {sync_err}"
)
log_device_event(unit_id, "ERROR", "SYNC", f"FTP sync failed: {sync_err}", db)
except Exception as e:
# Failure - increment counter and potentially mark offline
status.consecutive_failures += 1
@@ -217,11 +293,13 @@ class BackgroundPoller:
self._logger.warning(
f"Device {unit_id} marked unreachable after {status.consecutive_failures} failures: {error_msg}"
)
log_device_event(unit_id, "ERROR", "POLL", f"Device marked UNREACHABLE after {status.consecutive_failures} failures: {error_msg}", db)
status.is_reachable = False
else:
self._logger.warning(
f"Poll failed for {unit_id} (attempt {status.consecutive_failures}/3): {error_msg}"
)
log_device_event(unit_id, "WARNING", "POLL", f"Poll failed (attempt {status.consecutive_failures}/3): {error_msg}", db)
db.commit()

app/device_logger.py Normal file

@@ -0,0 +1,277 @@
"""
Per-device logging system.
Provides dual output: database entries for structured queries and file logs for backup.
Each device gets its own log file in data/logs/{unit_id}.log with rotation.
"""
import logging
import os
from datetime import datetime, timedelta
from logging.handlers import RotatingFileHandler
from pathlib import Path
from typing import Optional
from sqlalchemy.orm import Session
from app.database import SessionLocal
from app.models import DeviceLog
# Configure base logger
logger = logging.getLogger(__name__)
# Log directory (persisted in Docker volume)
LOG_DIR = Path(os.path.dirname(os.path.dirname(__file__))) / "data" / "logs"
LOG_DIR.mkdir(parents=True, exist_ok=True)
# Per-device file loggers (cached)
_device_file_loggers: dict = {}
# Log retention (days)
LOG_RETENTION_DAYS = int(os.getenv("LOG_RETENTION_DAYS", "7"))
def _get_file_logger(unit_id: str) -> logging.Logger:
"""Get or create a file logger for a specific device."""
if unit_id in _device_file_loggers:
return _device_file_loggers[unit_id]
# Create device-specific logger
device_logger = logging.getLogger(f"device.{unit_id}")
device_logger.setLevel(logging.DEBUG)
# Avoid duplicate handlers
if not device_logger.handlers:
# Create rotating file handler (5 MB max, keep 3 backups)
log_file = LOG_DIR / f"{unit_id}.log"
handler = RotatingFileHandler(
log_file,
maxBytes=5 * 1024 * 1024, # 5 MB
backupCount=3,
encoding="utf-8"
)
handler.setLevel(logging.DEBUG)
# Format: timestamp [LEVEL] [CATEGORY] message
formatter = logging.Formatter(
"%(asctime)s [%(levelname)s] [%(category)s] %(message)s",
datefmt="%Y-%m-%d %H:%M:%S"
)
handler.setFormatter(formatter)
device_logger.addHandler(handler)
# Don't propagate to root logger
device_logger.propagate = False
_device_file_loggers[unit_id] = device_logger
return device_logger
def log_device_event(
unit_id: str,
level: str,
category: str,
message: str,
db: Optional[Session] = None
):
"""
Log an event for a specific device.
Writes to both:
1. Database (DeviceLog table) for structured queries
2. File (data/logs/{unit_id}.log) for backup/debugging
Args:
unit_id: Device identifier
level: Log level (DEBUG, INFO, WARNING, ERROR)
category: Event category (TCP, FTP, POLL, COMMAND, STATE, SYNC)
message: Log message
db: Optional database session (creates one if not provided)
"""
timestamp = datetime.utcnow()
# Write to file log
try:
file_logger = _get_file_logger(unit_id)
log_func = getattr(file_logger, level.lower(), file_logger.info)
# Pass category as extra for formatter
log_func(message, extra={"category": category})
except Exception as e:
logger.warning(f"Failed to write file log for {unit_id}: {e}")
# Write to database
close_db = False
try:
if db is None:
db = SessionLocal()
close_db = True
log_entry = DeviceLog(
unit_id=unit_id,
timestamp=timestamp,
level=level.upper(),
category=category.upper(),
message=message
)
db.add(log_entry)
db.commit()
except Exception as e:
logger.warning(f"Failed to write DB log for {unit_id}: {e}")
if db:
db.rollback()
finally:
if close_db and db:
db.close()
def cleanup_old_logs(retention_days: Optional[int] = None, db: Optional[Session] = None):
"""
Delete log entries older than retention period.
Args:
retention_days: Days to retain (default: LOG_RETENTION_DAYS env var or 7)
db: Optional database session
"""
if retention_days is None:
retention_days = LOG_RETENTION_DAYS
cutoff = datetime.utcnow() - timedelta(days=retention_days)
close_db = False
try:
if db is None:
db = SessionLocal()
close_db = True
deleted = db.query(DeviceLog).filter(DeviceLog.timestamp < cutoff).delete()
db.commit()
if deleted > 0:
logger.info(f"Cleaned up {deleted} log entries older than {retention_days} days")
except Exception as e:
logger.error(f"Failed to cleanup old logs: {e}")
if db:
db.rollback()
finally:
if close_db and db:
db.close()
def get_device_logs(
unit_id: str,
limit: int = 100,
offset: int = 0,
level: Optional[str] = None,
category: Optional[str] = None,
since: Optional[datetime] = None,
db: Optional[Session] = None
) -> list:
"""
Query log entries for a specific device.
Args:
unit_id: Device identifier
limit: Max entries to return (default: 100)
offset: Number of entries to skip (default: 0)
level: Filter by level (DEBUG, INFO, WARNING, ERROR)
category: Filter by category (TCP, FTP, POLL, COMMAND, STATE, SYNC)
since: Filter entries after this timestamp
db: Optional database session
Returns:
List of log entries as dicts
"""
close_db = False
try:
if db is None:
db = SessionLocal()
close_db = True
query = db.query(DeviceLog).filter(DeviceLog.unit_id == unit_id)
if level:
query = query.filter(DeviceLog.level == level.upper())
if category:
query = query.filter(DeviceLog.category == category.upper())
if since:
query = query.filter(DeviceLog.timestamp >= since)
# Order by newest first
query = query.order_by(DeviceLog.timestamp.desc())
# Apply pagination
entries = query.offset(offset).limit(limit).all()
return [
{
"id": e.id,
"timestamp": e.timestamp.isoformat() + "Z",
"level": e.level,
"category": e.category,
"message": e.message
}
for e in entries
]
finally:
if close_db and db:
db.close()
def get_log_stats(unit_id: str, db: Optional[Session] = None) -> dict:
"""
Get log statistics for a device.
Returns:
Dict with counts by level and category
"""
close_db = False
try:
if db is None:
db = SessionLocal()
close_db = True
total = db.query(DeviceLog).filter(DeviceLog.unit_id == unit_id).count()
# Count by level
level_counts = {}
for level in ["DEBUG", "INFO", "WARNING", "ERROR"]:
count = db.query(DeviceLog).filter(
DeviceLog.unit_id == unit_id,
DeviceLog.level == level
).count()
if count > 0:
level_counts[level] = count
# Count by category
category_counts = {}
for category in ["TCP", "FTP", "POLL", "COMMAND", "STATE", "SYNC", "GENERAL"]:
count = db.query(DeviceLog).filter(
DeviceLog.unit_id == unit_id,
DeviceLog.category == category
).count()
if count > 0:
category_counts[category] = count
# Get oldest and newest
oldest = db.query(DeviceLog).filter(
DeviceLog.unit_id == unit_id
).order_by(DeviceLog.timestamp.asc()).first()
newest = db.query(DeviceLog).filter(
DeviceLog.unit_id == unit_id
).order_by(DeviceLog.timestamp.desc()).first()
return {
"total": total,
"by_level": level_counts,
"by_category": category_counts,
"oldest": oldest.timestamp.isoformat() + "Z" if oldest else None,
"newest": newest.timestamp.isoformat() + "Z" if newest else None
}
finally:
if close_db and db:
db.close()
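
For orientation, a minimal usage sketch of the helpers above (the unit id and message are invented; when no session is passed, each call opens and closes its own):

from app.device_logger import log_device_event, get_device_logs, get_log_stats

# Record an event for a device (hypothetical unit id and message)
log_device_event("NL43-0001", "INFO", "COMMAND", "Manual start requested via UI")

# Fetch the 20 most recent ERROR entries plus the per-level/per-category counts
recent_errors = get_device_logs("NL43-0001", limit=20, level="ERROR")
stats = get_log_stats("NL43-0001")
print(stats["total"], stats.get("by_level"))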


@@ -53,3 +53,22 @@ class NL43Status(Base):
last_poll_attempt = Column(DateTime, nullable=True) # Last time background poller attempted to poll
last_success = Column(DateTime, nullable=True) # Last successful poll timestamp
last_error = Column(Text, nullable=True) # Last error message (truncated to 500 chars)
# FTP start time sync tracking
start_time_sync_attempted = Column(Boolean, default=False) # True if FTP sync was attempted for current measurement
class DeviceLog(Base):
"""
Per-device log entries for debugging and audit trail.
Stores events like commands, state changes, errors, and FTP operations.
"""
__tablename__ = "device_logs"
id = Column(Integer, primary_key=True, autoincrement=True)
unit_id = Column(String, index=True, nullable=False)
timestamp = Column(DateTime, default=func.now(), index=True)
level = Column(String, default="INFO") # DEBUG, INFO, WARNING, ERROR
category = Column(String, default="GENERAL") # TCP, FTP, POLL, COMMAND, STATE, SYNC
message = Column(Text, nullable=False)
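
The diff does not include a migration for the new device_logs table, so presumably it is created at application startup. A minimal sketch of a one-off creation, assuming app.database exposes the engine and that Base is the declarative base DeviceLog inherits from (both import paths are assumptions):

from app.database import engine            # assumed location of the engine
from app.models import Base, DeviceLog     # Base import path is an assumption

# Create only the new table; existing tables are left untouched
Base.metadata.create_all(bind=engine, tables=[DeviceLog.__table__])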


@@ -3,6 +3,7 @@ from fastapi.responses import FileResponse
from sqlalchemy.orm import Session
from datetime import datetime
from pydantic import BaseModel, field_validator, Field
from typing import Optional
import logging
import ipaddress
import json
@@ -1842,9 +1843,134 @@ async def run_diagnostics(unit_id: str, db: Session = Depends(get_db)):
# All tests passed
diagnostics["overall_status"] = "pass"
# Add database dump: config and status cache
diagnostics["database_dump"] = {
"config": {
"unit_id": cfg.unit_id,
"host": cfg.host,
"tcp_port": cfg.tcp_port,
"tcp_enabled": cfg.tcp_enabled,
"ftp_enabled": cfg.ftp_enabled,
"ftp_port": cfg.ftp_port,
"ftp_username": cfg.ftp_username,
"ftp_password": "***" if cfg.ftp_password else None, # Mask password
"web_enabled": cfg.web_enabled,
"poll_interval_seconds": cfg.poll_interval_seconds,
"poll_enabled": cfg.poll_enabled
},
"status_cache": None
}
# Get cached status if available
status = db.query(NL43Status).filter_by(unit_id=unit_id).first()
if status:
# Helper to format datetime as ISO with Z suffix to indicate UTC
def to_utc_iso(dt):
return dt.isoformat() + 'Z' if dt else None
diagnostics["database_dump"]["status_cache"] = {
"unit_id": status.unit_id,
"last_seen": to_utc_iso(status.last_seen),
"measurement_state": status.measurement_state,
"measurement_start_time": to_utc_iso(status.measurement_start_time),
"counter": status.counter,
"lp": status.lp,
"leq": status.leq,
"lmax": status.lmax,
"lmin": status.lmin,
"lpeak": status.lpeak,
"battery_level": status.battery_level,
"power_source": status.power_source,
"sd_remaining_mb": status.sd_remaining_mb,
"sd_free_ratio": status.sd_free_ratio,
"is_reachable": status.is_reachable,
"consecutive_failures": status.consecutive_failures,
"last_poll_attempt": to_utc_iso(status.last_poll_attempt),
"last_success": to_utc_iso(status.last_success),
"last_error": status.last_error,
"raw_payload": status.raw_payload
}
return diagnostics
# ============================================================================
# DEVICE LOGS ENDPOINTS
# ============================================================================
@router.get("/{unit_id}/logs")
def get_device_logs(
unit_id: str,
limit: int = 100,
offset: int = 0,
level: Optional[str] = None,
category: Optional[str] = None,
db: Session = Depends(get_db)
):
"""
Get log entries for a specific device.
Query parameters:
- limit: Max entries to return (default: 100, max: 1000)
- offset: Number of entries to skip (for pagination)
- level: Filter by level (DEBUG, INFO, WARNING, ERROR)
- category: Filter by category (TCP, FTP, POLL, COMMAND, STATE, SYNC)
Returns newest entries first.
"""
from app.device_logger import get_device_logs as fetch_logs, get_log_stats
# Validate limit
limit = min(limit, 1000)
logs = fetch_logs(
unit_id=unit_id,
limit=limit,
offset=offset,
level=level,
category=category,
db=db
)
stats = get_log_stats(unit_id, db)
return {
"status": "ok",
"unit_id": unit_id,
"logs": logs,
"count": len(logs),
"stats": stats,
"filters": {
"level": level,
"category": category
},
"pagination": {
"limit": limit,
"offset": offset
}
}
@router.delete("/{unit_id}/logs")
def clear_device_logs(unit_id: str, db: Session = Depends(get_db)):
"""
Clear all log entries for a specific device.
"""
from app.models import DeviceLog
deleted = db.query(DeviceLog).filter(DeviceLog.unit_id == unit_id).delete()
db.commit()
logger.info(f"Cleared {deleted} log entries for device {unit_id}")
return {
"status": "ok",
"message": f"Cleared {deleted} log entries for {unit_id}",
"deleted_count": deleted
}
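
A rough client-side sketch of the two endpoints above; only the "/{unit_id}/logs" suffix is visible in this diff, so the base URL and router prefix are assumptions:

import httpx

BASE = "http://localhost:8000/api/nl43"    # assumed mount point for the router
unit = "NL43-0001"                         # hypothetical unit id

# Newest-first ERROR entries, plus the aggregated stats returned alongside them
resp = httpx.get(f"{BASE}/{unit}/logs", params={"limit": 50, "level": "ERROR"})
resp.raise_for_status()
payload = resp.json()
print(payload["count"], payload["stats"].get("by_level"))

# Clear the device's log history
resp = httpx.delete(f"{BASE}/{unit}/logs")
resp.raise_for_status()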
# ============================================================================
# BACKGROUND POLLING CONFIGURATION ENDPOINTS
# ============================================================================


@@ -76,10 +76,22 @@ def persist_snapshot(s: NL43Snapshot, db: Session):
# Measurement just started - record the start time
row.measurement_start_time = datetime.utcnow()
logger.info(f"✓ Measurement started on {s.unit_id} at {row.measurement_start_time}")
# Log state change (lazy import to avoid circular dependency)
try:
from app.device_logger import log_device_event
log_device_event(s.unit_id, "INFO", "STATE", f"Measurement STARTED at {row.measurement_start_time}", db)
except Exception:
pass
elif was_measuring and not is_measuring:
# Measurement stopped - clear the start time
row.measurement_start_time = None
logger.info(f"✓ Measurement stopped on {s.unit_id}")
# Log state change
try:
from app.device_logger import log_device_event
log_device_event(s.unit_id, "INFO", "STATE", "Measurement STOPPED", db)
except Exception:
pass
row.measurement_state = new_state
row.counter = s.counter
@@ -101,6 +113,109 @@ def persist_snapshot(s: NL43Snapshot, db: Session):
raise
async def sync_measurement_start_time_from_ftp(
unit_id: str,
host: str,
tcp_port: int,
ftp_port: int,
ftp_username: str,
ftp_password: str,
db: Session
) -> bool:
"""
Sync measurement start time from the FTP folder timestamp.
This is called when SLMM detects a device is already measuring but doesn't
have a recorded start time (e.g., after service restart or if measurement
was started before SLMM began polling).
The workflow:
1. Disable FTP (reset)
2. Enable FTP
3. List NL-43 folder to get measurement folder timestamps
4. Use the most recent folder's timestamp as the start time
5. Update the database
Args:
unit_id: Device identifier
host: Device IP/hostname
tcp_port: TCP control port
ftp_port: FTP port (usually 21)
ftp_username: FTP username (usually "USER")
ftp_password: FTP password (usually "0000")
db: Database session
Returns:
True if sync succeeded, False otherwise
"""
logger.info(f"[FTP-SYNC] Attempting to sync measurement start time for {unit_id} via FTP")
client = NL43Client(
host, tcp_port,
ftp_username=ftp_username,
ftp_password=ftp_password,
ftp_port=ftp_port
)
try:
# Step 1: Disable FTP to reset it
logger.info(f"[FTP-SYNC] Step 1: Disabling FTP on {unit_id}")
await client.disable_ftp()
await asyncio.sleep(1.5) # Wait for device to process
# Step 2: Enable FTP
logger.info(f"[FTP-SYNC] Step 2: Enabling FTP on {unit_id}")
await client.enable_ftp()
await asyncio.sleep(2.0) # Wait for FTP server to start
# Step 3: List NL-43 folder
logger.info(f"[FTP-SYNC] Step 3: Listing /NL-43 folder on {unit_id}")
files = await client.list_ftp_files("/NL-43")
# Filter for directories only (measurement folders)
folders = [f for f in files if f.get('is_dir', False)]
if not folders:
logger.warning(f"[FTP-SYNC] No measurement folders found on {unit_id}")
return False
# Sort by modified timestamp (newest first)
folders.sort(key=lambda f: f.get('modified_timestamp', ''), reverse=True)
latest_folder = folders[0]
folder_name = latest_folder['name']
logger.info(f"[FTP-SYNC] Found latest measurement folder: {folder_name}")
# Step 4: Parse timestamp
if 'modified_timestamp' in latest_folder and latest_folder['modified_timestamp']:
timestamp_str = latest_folder['modified_timestamp']
# Parse ISO format timestamp (already in UTC from SLMM FTP listing)
start_time = datetime.fromisoformat(timestamp_str.replace('Z', ''))
# Step 5: Update database
status = db.query(NL43Status).filter_by(unit_id=unit_id).first()
if status:
old_time = status.measurement_start_time
status.measurement_start_time = start_time
db.commit()
logger.info(f"[FTP-SYNC] ✓ Successfully synced start time for {unit_id}")
logger.info(f"[FTP-SYNC] Folder: {folder_name}")
logger.info(f"[FTP-SYNC] Old start time: {old_time}")
logger.info(f"[FTP-SYNC] New start time: {start_time}")
return True
else:
logger.warning(f"[FTP-SYNC] Status record not found for {unit_id}")
return False
else:
logger.warning(f"[FTP-SYNC] Could not parse timestamp from folder {folder_name}")
return False
except Exception as e:
logger.error(f"[FTP-SYNC] Failed to sync start time for {unit_id}: {e}")
return False
# Rate limiting: NL43 requires ≥1 second between commands
_last_command_time = {}
_rate_limit_lock = asyncio.Lock()
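
The enforcement code sits outside this hunk; a hedged sketch of how these two module-level objects could implement the ≥1 second per-device gap (the helper name and exact policy are assumptions):

import time  # asyncio is already imported at the top of this module

async def _respect_rate_limit(unit_id: str, min_gap: float = 1.0) -> None:
    """Sleep just long enough to keep >=1 s between commands to the same unit."""
    async with _rate_limit_lock:
        last = _last_command_time.get(unit_id)
        now = time.monotonic()
        if last is not None and now - last < min_gap:
            await asyncio.sleep(min_gap - (now - last))
        _last_command_time[unit_id] = time.monotonic()

Holding the lock while sleeping serializes commands across all devices; releasing it before sleeping would trade strict ordering for more concurrency.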