feat(reports): FTP night-report pipeline foundation

Terra-View side of the daily night-vs-baseline sound report for the John Myler
24/7 job. Engine is built and verified end-to-end against real meter data;
SMTP send + scheduler/capture wiring still pending.

- ingest: refactor upload_nrl_data into a callable ingest_nrl_zip(location_id,
  zip_bytes, db) sharing one core with the HTTP endpoint. Capture the .rnh
  percentile map + weightings into session metadata; dedup on store-name +
  start time. Ingest stays metric-agnostic (every Leq column preserved).
- report_pipeline.py: metric registry, Evening/Nighttime windows, correct
  aggregation (Lmax=max, Ln=arithmetic, Leq=logarithmic), baseline = typical
  night, per-location + per-project builders.
- report_renderers.py: HTML email-body renderer (Last/Base/delta layout).
- report_email.py: config-driven SMTP via stdlib (env vars) with a dry-run
  fallback so the pipeline runs without credentials.
- report_orchestrator.py: compute -> render -> always write report.html +
  report.json to disk -> best-effort email.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-10 20:41:05 +00:00
parent 38f2c751b8
commit ed195ed96b
6 changed files with 1142 additions and 144 deletions
+289 -144
View File
@@ -1712,6 +1712,19 @@ def _parse_rnh(content: bytes) -> dict:
result["stop_time_str"] = value
elif key == "Total Measurement Time":
result["total_time_str"] = value
elif key == "Frequency Weighting (Main)":
result["frequency_weighting"] = value
elif key == "Time Weighting (Main)":
result["time_weighting"] = value
elif key == "Leq Calculation Interval":
result["leq_interval"] = value
elif key.startswith("Percentile "):
# e.g. "Percentile 4,90.0" → percentiles["4"] = "90.0".
# Lets the report label the LN slots (here LN4 = L90) from the
# device's own config instead of hardcoding which slot is which —
# the percentile assignment is reconfigurable per job.
slot = key[len("Percentile "):].strip()
result.setdefault("percentiles", {})[slot] = value
except Exception:
pass
return result
@@ -1740,6 +1753,270 @@ def _classify_file(filename: str) -> str:
return "data"
def _is_wanted_nrl_file(fname: str) -> bool:
"""Keep only the files an NRL ingest cares about: .rnh metadata + the
averaged Leq .rnd. Drops the 1-second _Lp_ files and everything else.
- NL-43 writes two .rnd types: _Leq_ (15-min averages, wanted) and
_Lp_ (1-second granular, skipped).
- AU2 (NL-23/older Rion) writes a single Au2_####.rnd — always keep.
Note this is purely about which *files* to store, not which *metrics* to
report: the kept Leq file carries every column (Leq, Lmax, L1/L10/L50/
L90/L95, Lpeak, …), so the report layer can select any metric later.
"""
n = fname.lower()
if n.endswith(".rnh"):
return True
if n.endswith(".rnd"):
if "_leq_" in n: # NL-43 Leq file
return True
if n.startswith("au2_"): # AU2 format (NL-23) — Leq equivalent
return True
if "_lp" not in n and "_leq_" not in n:
# Unknown .rnd format — include it so we don't silently drop data
return True
return False
class IngestError(Exception):
"""Raised when an NRL upload/ZIP has no usable data or an invalid target.
Kept HTTP-agnostic so the ingest core can be driven programmatically (the
scheduled FTP pull) as well as from the HTTP upload endpoint. Callers
translate it: the endpoint → HTTP 400, the scheduler → logged failure.
"""
pass
def _find_existing_session(
db: Session,
location_id: str,
store_name: str,
started_at,
start_time_str: str,
):
"""Return an already-ingested session for this location that represents the
same measurement, or None.
Used to make FTP re-pulls idempotent: a daily cycle closes one Auto_####
folder per day, so a session is uniquely identified within a location by
(store_name + measurement start time). Store names recycle across jobs, so
we always match on start time too.
"""
if not store_name and not started_at:
return None
candidates = db.query(MonitoringSession).filter(
MonitoringSession.location_id == location_id,
MonitoringSession.session_type == "sound",
).all()
for s in candidates:
try:
meta = json.loads(s.session_metadata or "{}")
except (json.JSONDecodeError, TypeError):
meta = {}
if store_name and meta.get("store_name") != store_name:
continue
# Same store_name — confirm it's the same measurement by start time.
if start_time_str and meta.get("start_time_str") == start_time_str:
return s
if not meta.get("start_time_str") and started_at and s.started_at == started_at:
return s
return None
def _ingest_file_entries(
location: MonitoringLocation,
file_entries: list[tuple[str, bytes]],
db: Session,
*,
source: str = "manual_upload",
dedupe: bool = False,
) -> dict:
"""Core NRL ingest, shared by the HTTP upload and the programmatic FTP pull.
Takes already-normalized (filename, bytes) entries, keeps the wanted files,
parses the .rnh, and creates a MonitoringSession + DataFile rows under the
location's project. Metric-agnostic: the full Leq file is written to disk
and every column preserved; metric selection happens in the report layer.
Raises IngestError if no usable files are present.
"""
# --- Filter to the files we keep (.rnh + Leq .rnd) ---
file_entries = [(f, b) for f, b in file_entries if _is_wanted_nrl_file(f)]
if not file_entries:
raise IngestError(
"No usable .rnd or .rnh files found. Expected NL-43 _Leq_ files or AU2 format .rnd files."
)
# --- Parse .rnh metadata (first one wins) ---
rnh_meta = {}
for fname, fbytes in file_entries:
if fname.lower().endswith(".rnh"):
rnh_meta = _parse_rnh(fbytes)
break
# RNH stores local time (no UTC offset). Use local for period/label, then
# convert to UTC for storage so the local_datetime filter displays correctly.
started_at_local = _parse_rnh_datetime(rnh_meta.get("start_time_str")) or datetime.utcnow()
stopped_at_local = _parse_rnh_datetime(rnh_meta.get("stop_time_str"))
started_at = local_to_utc(started_at_local)
stopped_at = local_to_utc(stopped_at_local) if stopped_at_local else None
duration_seconds = (
int((stopped_at - started_at).total_seconds())
if (started_at and stopped_at) else None
)
store_name = rnh_meta.get("store_name", "")
serial_number = rnh_meta.get("serial_number", "")
index_number = rnh_meta.get("index_number", "")
start_time_str = rnh_meta.get("start_time_str", "")
# --- Dedupe: skip if this exact measurement is already ingested ---
if dedupe:
existing = _find_existing_session(db, location.id, store_name, started_at, start_time_str)
if existing:
return {
"success": True,
"deduped": True,
"session_id": existing.id,
"files_imported": 0,
"leq_files": 0,
"lp_files": 0,
"metadata_files": 0,
"store_name": store_name,
"started_at": started_at.isoformat() if started_at else None,
"stopped_at": stopped_at.isoformat() if stopped_at else None,
}
# --- Create MonitoringSession (local times drive period/label) ---
period_type = _derive_period_type(started_at_local) if started_at_local else None
session_label = (
_build_session_label(started_at_local, location.name, period_type)
if started_at_local else None
)
session_id = str(uuid.uuid4())
monitoring_session = MonitoringSession(
id=session_id,
project_id=location.project_id,
location_id=location.id,
unit_id=None,
session_type="sound",
started_at=started_at,
stopped_at=stopped_at,
duration_seconds=duration_seconds,
status="completed",
session_label=session_label,
period_type=period_type,
session_metadata=json.dumps({
"source": source,
"store_name": store_name,
"serial_number": serial_number,
"index_number": index_number,
"start_time_str": start_time_str,
# Captured from the .rnh so the report can label metrics from the
# device's own config (which LN slot is L90, the weightings, etc.).
"percentiles": rnh_meta.get("percentiles", {}),
"frequency_weighting": rnh_meta.get("frequency_weighting", ""),
"time_weighting": rnh_meta.get("time_weighting", ""),
"leq_interval": rnh_meta.get("leq_interval", ""),
}),
)
db.add(monitoring_session)
db.commit()
db.refresh(monitoring_session)
# --- Write files to disk + create DataFile records ---
output_dir = Path("data/Projects") / location.project_id / session_id
output_dir.mkdir(parents=True, exist_ok=True)
leq_count = lp_count = metadata_count = files_imported = 0
for fname, fbytes in file_entries:
fname_lower = fname.lower()
if fname_lower.endswith(".rnd"):
if "_leq_" in fname_lower:
leq_count += 1
elif "_lp" in fname_lower:
lp_count += 1
elif fname_lower.endswith(".rnh"):
metadata_count += 1
dest = output_dir / fname
dest.write_bytes(fbytes)
checksum = hashlib.sha256(fbytes).hexdigest()
rel_path = str(dest.relative_to("data"))
db.add(DataFile(
id=str(uuid.uuid4()),
session_id=session_id,
file_path=rel_path,
file_type=_classify_file(fname),
file_size_bytes=len(fbytes),
downloaded_at=datetime.utcnow(),
checksum=checksum,
file_metadata=json.dumps({
"source": source,
"original_filename": fname,
"store_name": store_name,
}),
))
files_imported += 1
db.commit()
return {
"success": True,
"deduped": False,
"session_id": session_id,
"files_imported": files_imported,
"leq_files": leq_count,
"lp_files": lp_count,
"metadata_files": metadata_count,
"store_name": store_name,
"started_at": started_at.isoformat() if started_at else None,
"stopped_at": stopped_at.isoformat() if stopped_at else None,
}
def ingest_nrl_zip(
location_id: str,
zip_bytes: bytes,
db: Session,
*,
source: str = "ftp_pull",
dedupe: bool = True,
) -> dict:
"""Programmatically ingest an Auto_#### ZIP (e.g. a scheduled FTP pull).
Extracts the ZIP (flattening any nested Auto_Leq/Auto_Lp_ folders), keeps
the .rnh + Leq .rnd, parses the header, and creates a MonitoringSession +
DataFile rows for `location_id`. Defaults to dedupe=True so repeated daily
pulls of the same closed folder don't create duplicate sessions.
Returns the same dict shape as the HTTP upload, plus a `deduped` flag.
Raises IngestError on a bad ZIP, no usable files, or unknown location.
"""
location = db.query(MonitoringLocation).filter_by(id=location_id).first()
if not location:
raise IngestError(f"Location {location_id} not found")
try:
with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
file_entries: list[tuple[str, bytes]] = []
for info in zf.infolist():
if info.is_dir():
continue
name = Path(info.filename).name # strip nested folder paths
if not name:
continue
file_entries.append((name, zf.read(info)))
except zipfile.BadZipFile:
raise IngestError("Downloaded data is not a valid ZIP archive.")
return _ingest_file_entries(location, file_entries, db, source=source, dedupe=dedupe)
@router.post("/nrl/{location_id}/upload-data")
async def upload_nrl_data(
project_id: str,
@@ -1754,11 +2031,13 @@ async def upload_nrl_data(
- A single .zip file (the Auto_#### folder zipped) — auto-extracted
- Multiple .rnd / .rnh files selected directly from the SD card folder
Creates a MonitoringSession from .rnh metadata and DataFile records
for each measurement file. No unit assignment required.
Normalizes the upload to (filename, bytes) entries, then hands off to the
shared ingest core (`_ingest_file_entries`) — the same path the scheduled
FTP pull uses via `ingest_nrl_zip`. Creates a MonitoringSession from the
.rnh metadata and DataFile records for each measurement file. No unit
assignment required. dedupe=False here preserves the prior manual-upload
behaviour (re-uploading creates a fresh session).
"""
from datetime import datetime
# Verify project and location exist
project = db.query(Project).filter_by(id=project_id).first()
_require_module(project, "sound_monitoring", db)
@@ -1769,7 +2048,7 @@ async def upload_nrl_data(
if not location:
raise HTTPException(status_code=404, detail="Location not found")
# --- Step 1: Normalize to (filename, bytes) list ---
# --- Normalize upload to (filename, bytes) entries ---
file_entries: list[tuple[str, bytes]] = []
if len(files) == 1 and files[0].filename.lower().endswith(".zip"):
@@ -1793,145 +2072,11 @@ async def upload_nrl_data(
if not file_entries:
raise HTTPException(status_code=400, detail="No usable files found in upload.")
# --- Step 1b: Filter to only relevant files ---
# Keep: .rnh (metadata) and measurement .rnd files
# NL-43 generates two .rnd types: _Leq_ (15-min averages, wanted) and _Lp_ (1-sec granular, skip)
# AU2 (NL-23/older Rion) generates a single Au2_####.rnd per session — always keep those
# Drop: _Lp_ .rnd, .xlsx, .mp3, and anything else
def _is_wanted(fname: str) -> bool:
n = fname.lower()
if n.endswith(".rnh"):
return True
if n.endswith(".rnd"):
if "_leq_" in n: # NL-43 Leq file
return True
if n.startswith("au2_"): # AU2 format (NL-23) — always Leq equivalent
return True
if "_lp" not in n and "_leq_" not in n:
# Unknown .rnd format — include it so we don't silently drop data
return True
return False
file_entries = [(fname, fbytes) for fname, fbytes in file_entries if _is_wanted(fname)]
if not file_entries:
raise HTTPException(status_code=400, detail="No usable .rnd or .rnh files found. Expected NL-43 _Leq_ files or AU2 format .rnd files.")
# --- Step 2: Find and parse .rnh metadata ---
rnh_meta = {}
for fname, fbytes in file_entries:
if fname.lower().endswith(".rnh"):
rnh_meta = _parse_rnh(fbytes)
break
# RNH files store local time (no UTC offset). Use local values for period
# classification / label generation, then convert to UTC for DB storage so
# the local_datetime Jinja filter displays the correct time.
started_at_local = _parse_rnh_datetime(rnh_meta.get("start_time_str")) or datetime.utcnow()
stopped_at_local = _parse_rnh_datetime(rnh_meta.get("stop_time_str"))
started_at = local_to_utc(started_at_local)
stopped_at = local_to_utc(stopped_at_local) if stopped_at_local else None
duration_seconds = None
if started_at and stopped_at:
duration_seconds = int((stopped_at - started_at).total_seconds())
store_name = rnh_meta.get("store_name", "")
serial_number = rnh_meta.get("serial_number", "")
index_number = rnh_meta.get("index_number", "")
# --- Step 3: Create MonitoringSession ---
# Use local times for period/label so classification reflects the clock at the site.
period_type = _derive_period_type(started_at_local) if started_at_local else None
session_label = _build_session_label(started_at_local, location.name, period_type) if started_at_local else None
session_id = str(uuid.uuid4())
monitoring_session = MonitoringSession(
id=session_id,
project_id=project_id,
location_id=location_id,
unit_id=None,
session_type="sound",
started_at=started_at,
stopped_at=stopped_at,
duration_seconds=duration_seconds,
status="completed",
session_label=session_label,
period_type=period_type,
session_metadata=json.dumps({
"source": "manual_upload",
"store_name": store_name,
"serial_number": serial_number,
"index_number": index_number,
}),
)
db.add(monitoring_session)
db.commit()
db.refresh(monitoring_session)
# --- Step 4: Write files to disk and create DataFile records ---
output_dir = Path("data/Projects") / project_id / session_id
output_dir.mkdir(parents=True, exist_ok=True)
leq_count = 0
lp_count = 0
metadata_count = 0
files_imported = 0
for fname, fbytes in file_entries:
file_type = _classify_file(fname)
fname_lower = fname.lower()
# Track counts for summary
if fname_lower.endswith(".rnd"):
if "_leq_" in fname_lower:
leq_count += 1
elif "_lp" in fname_lower:
lp_count += 1
elif fname_lower.endswith(".rnh"):
metadata_count += 1
# Write to disk
dest = output_dir / fname
dest.write_bytes(fbytes)
# Compute checksum
checksum = hashlib.sha256(fbytes).hexdigest()
# Store relative path from data/ dir
rel_path = str(dest.relative_to("data"))
data_file = DataFile(
id=str(uuid.uuid4()),
session_id=session_id,
file_path=rel_path,
file_type=file_type,
file_size_bytes=len(fbytes),
downloaded_at=datetime.utcnow(),
checksum=checksum,
file_metadata=json.dumps({
"source": "manual_upload",
"original_filename": fname,
"store_name": store_name,
}),
)
db.add(data_file)
files_imported += 1
db.commit()
return {
"success": True,
"session_id": session_id,
"files_imported": files_imported,
"leq_files": leq_count,
"lp_files": lp_count,
"metadata_files": metadata_count,
"store_name": store_name,
"started_at": started_at.isoformat() if started_at else None,
"stopped_at": stopped_at.isoformat() if stopped_at else None,
}
# --- Hand off to the shared ingest core ---
try:
return _ingest_file_entries(location, file_entries, db, source="manual_upload", dedupe=False)
except IngestError as e:
raise HTTPException(status_code=400, detail=str(e))
# ============================================================================