feat(reports): FTP night-report pipeline foundation

Terra-View side of the daily night-vs-baseline sound report for the John Myler 24/7 job. The engine is built and verified end-to-end against real meter data; the SMTP send and the scheduler/capture wiring are still pending.

- ingest: refactor upload_nrl_data into a callable ingest_nrl_zip(location_id, zip_bytes, db) that shares one core with the HTTP endpoint. Capture the .rnh percentile map + weightings into session metadata; dedupe on store name + start time. Ingest stays metric-agnostic (every Leq column preserved).
- report_pipeline.py: metric registry, Evening/Nighttime windows, correct aggregation (Lmax=max, Ln=arithmetic, Leq=logarithmic), baseline = typical night, per-location + per-project builders.
- report_renderers.py: HTML email-body renderer (Last/Base/delta layout).
- report_email.py: config-driven SMTP via stdlib (env vars) with a dry-run fallback so the pipeline runs without credentials.
- report_orchestrator.py: compute -> render -> always write report.html + report.json to disk -> best-effort email.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-10 20:41:05 +00:00
parent 38f2c751b8
commit ed195ed96b
6 changed files with 1142 additions and 144 deletions
@@ -1712,6 +1712,19 @@ def _parse_rnh(content: bytes) -> dict:
                    result["stop_time_str"] = value
                elif key == "Total Measurement Time":
                    result["total_time_str"] = value
+                elif key == "Frequency Weighting (Main)":
+                    result["frequency_weighting"] = value
+                elif key == "Time Weighting (Main)":
+                    result["time_weighting"] = value
+                elif key == "Leq Calculation Interval":
+                    result["leq_interval"] = value
+                elif key.startswith("Percentile "):
+                    # e.g. "Percentile 4,90.0" → percentiles["4"] = "90.0".
+                    # Lets the report label the LN slots (here LN4 = L90) from the
+                    # device's own config instead of hardcoding which slot is which —
+                    # the percentile assignment is reconfigurable per job.
+                    slot = key[len("Percentile "):].strip()
+                    result.setdefault("percentiles", {})[slot] = value
    except Exception:
        pass
    return result
@@ -1740,6 +1753,270 @@ def _classify_file(filename: str) -> str:
    return "data"


+def _is_wanted_nrl_file(fname: str) -> bool:
+    """Decide whether an uploaded/extracted file belongs in an NRL ingest.
+
+    Matching is case-insensitive and works purely on the file name:
+
+      - ``.rnh`` header files are always kept.
+      - ``.rnd`` files are kept when they are NL-43 ``_Leq_`` files, when the
+        name starts with ``au2_`` (AU2 format from NL-23/older Rion meters),
+        or when the name carries no ``_lp`` marker at all (unknown format —
+        kept so data is never dropped silently).
+      - ``.rnd`` files with an ``_lp`` marker (the 1-second granular files)
+        and every other extension are rejected.
+
+    This only selects which *files* are stored; it says nothing about which
+    *metrics* get reported — the kept Leq file still carries every column,
+    so the report layer is free to pick any of them later.
+    """
+    lowered = fname.lower()
+
+    # Header metadata is always part of the ingest.
+    if lowered.endswith(".rnh"):
+        return True
+
+    # From here on only measurement files can qualify.
+    if not lowered.endswith(".rnd"):
+        return False
+
+    # Known-good measurement files: NL-43 Leq averages and AU2 exports.
+    if "_leq_" in lowered or lowered.startswith("au2_"):
+        return True
+
+    # Remaining .rnd files: reject the granular _Lp_ ones, keep anything
+    # unrecognised rather than silently losing data.
+    return "_lp" not in lowered
+
+
+class IngestError(Exception):
+    """Signals that an NRL upload/ZIP cannot be ingested.
+
+    Raised when the payload has no usable data or names an invalid target.
+    Deliberately free of any HTTP notion so the ingest core can be called
+    from code (the scheduled FTP pull) as well as from the upload endpoint;
+    each caller maps it to its own failure mode — the endpoint to an
+    HTTP 400, the scheduler to a logged failure.
+    """
+
+
+def _find_existing_session(
+    db: Session,
+    location_id: str,
+    store_name: str,
+    started_at,
+    start_time_str: str,
+):
+    """Look up a previously ingested sound session for the same measurement.
+
+    Makes repeated FTP pulls idempotent.  Within one location a measurement
+    is identified by its store name together with its start time; the start
+    time is always part of the match because store names recycle across
+    jobs.
+
+    Args:
+        db: Database session used for the lookup.
+        location_id: Location whose sound sessions are searched.
+        store_name: Store name from the .rnh header; when empty, the store
+            name is not used to narrow the candidates.
+        started_at: Measurement start as stored on the session row; used as
+            a fallback when a stored session has no ``start_time_str`` in
+            its metadata.
+        start_time_str: Raw start-time string from the .rnh header.
+
+    Returns:
+        The first matching MonitoringSession, or None when nothing matches
+        (or when neither a store name nor a start time was supplied).
+    """
+    # Nothing to match on — never treat that as a duplicate.
+    if not (store_name or started_at):
+        return None
+
+    sound_sessions = (
+        db.query(MonitoringSession)
+        .filter(
+            MonitoringSession.location_id == location_id,
+            MonitoringSession.session_type == "sound",
+        )
+        .all()
+    )
+
+    for session in sound_sessions:
+        # Missing or malformed metadata is treated as empty metadata.
+        try:
+            meta = json.loads(session.session_metadata or "{}")
+        except (json.JSONDecodeError, TypeError):
+            meta = {}
+
+        if store_name and meta.get("store_name") != store_name:
+            continue
+
+        # Store name agrees (or was not given) — now confirm the start time.
+        stored_start = meta.get("start_time_str")
+        if start_time_str and stored_start == start_time_str:
+            return session
+        # Sessions without a stored start string fall back to the timestamp
+        # column.
+        if not stored_start and started_at and session.started_at == started_at:
+            return session
+
+    return None
+
+
+def _ingest_file_entries(
+    location: MonitoringLocation,
+    file_entries: list[tuple[str, bytes]],
+    db: Session,
+    *,
+    source: str = "manual_upload",
+    dedupe: bool = False,
+) -> dict:
+    """Core NRL ingest, shared by the HTTP upload and the programmatic FTP pull.
+
+    Takes already-normalized (filename, bytes) entries, keeps the wanted files,
+    parses the .rnh, and creates a MonitoringSession + DataFile rows under the
+    location's project.  Metric-agnostic: the full Leq file is written to disk
+    and every column preserved; metric selection happens in the report layer.
+
+    The session row, its DataFile rows and the files on disk are created as a
+    unit: everything is committed once at the end, and on any failure the
+    transaction is rolled back and the partially written session directory is
+    removed.  Otherwise a failed write would leave an empty "completed"
+    session behind that every later deduped re-pull would match, so the data
+    would never be ingested.
+
+    Args:
+        location: Target location; its ``id``, ``project_id`` and ``name`` are
+            used for the session row, the output path and the session label.
+        file_entries: ``(filename, content)`` pairs to ingest.
+        db: Database session the rows are written through.
+        source: Recorded in the session and file metadata.
+        dedupe: When true, return the already-ingested session for the same
+            measurement (see ``_find_existing_session``) instead of creating
+            a new one.
+
+    Returns:
+        Summary dict with ``success``, ``deduped``, ``session_id``, the file
+        counters, ``store_name`` and ISO ``started_at`` / ``stopped_at``.
+
+    Raises:
+        IngestError: If no usable files are present.
+    """
+    import shutil  # local import: only used for cleanup on failure
+
+    # --- Filter to the files we keep (.rnh + Leq .rnd) ---
+    file_entries = [(f, b) for f, b in file_entries if _is_wanted_nrl_file(f)]
+    if not file_entries:
+        raise IngestError(
+            "No usable .rnd or .rnh files found. Expected NL-43 _Leq_ files or AU2 format .rnd files."
+        )
+
+    # --- Parse .rnh metadata (first one wins) ---
+    rnh_meta = {}
+    for fname, fbytes in file_entries:
+        if fname.lower().endswith(".rnh"):
+            rnh_meta = _parse_rnh(fbytes)
+            break
+
+    # RNH stores local time (no UTC offset).  Use local for period/label, then
+    # convert to UTC for storage so the local_datetime filter displays correctly.
+    # NOTE(review): when the header has no start time the fallback is a UTC
+    # "now" that is then passed through local_to_utc as if it were local —
+    # confirm that is intended.
+    started_at_local = _parse_rnh_datetime(rnh_meta.get("start_time_str")) or datetime.utcnow()
+    stopped_at_local = _parse_rnh_datetime(rnh_meta.get("stop_time_str"))
+    started_at = local_to_utc(started_at_local)
+    stopped_at = local_to_utc(stopped_at_local) if stopped_at_local else None
+    duration_seconds = (
+        int((stopped_at - started_at).total_seconds())
+        if (started_at and stopped_at) else None
+    )
+
+    store_name = rnh_meta.get("store_name", "")
+    serial_number = rnh_meta.get("serial_number", "")
+    index_number = rnh_meta.get("index_number", "")
+    start_time_str = rnh_meta.get("start_time_str", "")
+
+    # --- Dedupe: skip if this exact measurement is already ingested ---
+    if dedupe:
+        existing = _find_existing_session(db, location.id, store_name, started_at, start_time_str)
+        if existing:
+            return {
+                "success": True,
+                "deduped": True,
+                "session_id": existing.id,
+                "files_imported": 0,
+                "leq_files": 0,
+                "lp_files": 0,
+                "metadata_files": 0,
+                "store_name": store_name,
+                "started_at": started_at.isoformat() if started_at else None,
+                "stopped_at": stopped_at.isoformat() if stopped_at else None,
+            }
+
+    # --- Build MonitoringSession (local times drive period/label) ---
+    period_type = _derive_period_type(started_at_local) if started_at_local else None
+    session_label = (
+        _build_session_label(started_at_local, location.name, period_type)
+        if started_at_local else None
+    )
+
+    session_id = str(uuid.uuid4())
+    monitoring_session = MonitoringSession(
+        id=session_id,
+        project_id=location.project_id,
+        location_id=location.id,
+        unit_id=None,
+        session_type="sound",
+        started_at=started_at,
+        stopped_at=stopped_at,
+        duration_seconds=duration_seconds,
+        status="completed",
+        session_label=session_label,
+        period_type=period_type,
+        session_metadata=json.dumps({
+            "source": source,
+            "store_name": store_name,
+            "serial_number": serial_number,
+            "index_number": index_number,
+            "start_time_str": start_time_str,
+            # Captured from the .rnh so the report can label metrics from the
+            # device's own config (which LN slot is L90, the weightings, etc.).
+            "percentiles": rnh_meta.get("percentiles", {}),
+            "frequency_weighting": rnh_meta.get("frequency_weighting", ""),
+            "time_weighting": rnh_meta.get("time_weighting", ""),
+            "leq_interval": rnh_meta.get("leq_interval", ""),
+        }),
+    )
+
+    output_dir = Path("data/Projects") / location.project_id / session_id
+    leq_count = lp_count = metadata_count = files_imported = 0
+
+    # --- Persist session + files as one unit ---
+    try:
+        db.add(monitoring_session)
+        # Emit the session INSERT now (without committing) so the DataFile
+        # rows added below can reference it inside the same transaction.
+        db.flush()
+
+        output_dir.mkdir(parents=True, exist_ok=True)
+
+        for fname, fbytes in file_entries:
+            fname_lower = fname.lower()
+            if fname_lower.endswith(".rnd"):
+                if "_leq_" in fname_lower:
+                    leq_count += 1
+                elif "_lp" in fname_lower:
+                    lp_count += 1
+            elif fname_lower.endswith(".rnh"):
+                metadata_count += 1
+
+            # The name may come straight from an upload; keep only its final
+            # component so it can never resolve outside the session directory.
+            dest = output_dir / Path(fname).name
+            dest.write_bytes(fbytes)
+            checksum = hashlib.sha256(fbytes).hexdigest()
+            rel_path = str(dest.relative_to("data"))
+
+            db.add(DataFile(
+                id=str(uuid.uuid4()),
+                session_id=session_id,
+                file_path=rel_path,
+                file_type=_classify_file(fname),
+                file_size_bytes=len(fbytes),
+                downloaded_at=datetime.utcnow(),
+                checksum=checksum,
+                file_metadata=json.dumps({
+                    "source": source,
+                    "original_filename": fname,
+                    "store_name": store_name,
+                }),
+            ))
+            files_imported += 1
+
+        db.commit()
+    except Exception:
+        # Undo the half-finished ingest so a retry starts clean and a later
+        # deduped pull does not match an empty session, then let the original
+        # error propagate.
+        db.rollback()
+        shutil.rmtree(output_dir, ignore_errors=True)
+        raise
+
+    return {
+        "success": True,
+        "deduped": False,
+        "session_id": session_id,
+        "files_imported": files_imported,
+        "leq_files": leq_count,
+        "lp_files": lp_count,
+        "metadata_files": metadata_count,
+        "store_name": store_name,
+        "started_at": started_at.isoformat() if started_at else None,
+        "stopped_at": stopped_at.isoformat() if stopped_at else None,
+    }
+
+
+def ingest_nrl_zip(
+    location_id: str,
+    zip_bytes: bytes,
+    db: Session,
+    *,
+    source: str = "ftp_pull",
+    dedupe: bool = True,
+) -> dict:
+    """Programmatically ingest an Auto_#### ZIP (e.g. a scheduled FTP pull).
+
+    Extracts the ZIP (flattening any nested Auto_Leq/Auto_Lp_ folders), keeps
+    the .rnh + Leq .rnd, parses the header, and creates a MonitoringSession +
+    DataFile rows for `location_id`.  Defaults to dedupe=True so repeated daily
+    pulls of the same closed folder don't create duplicate sessions.
+
+    Only members that pass `_is_wanted_nrl_file` are decompressed; the
+    1-second _Lp_ files and any other extras are skipped without being read
+    into memory.
+
+    Args:
+        location_id: ID of the MonitoringLocation to ingest into.
+        zip_bytes: Raw bytes of the ZIP archive.
+        db: Database session used for the lookup and the ingest.
+        source: Recorded in the session and file metadata.
+        dedupe: Passed through to the ingest core.
+
+    Returns:
+        The same dict shape as the HTTP upload, plus a `deduped` flag.
+
+    Raises:
+        IngestError: On a bad ZIP, no usable files, or unknown location.
+    """
+    location = db.query(MonitoringLocation).filter_by(id=location_id).first()
+    if not location:
+        raise IngestError(f"Location {location_id} not found")
+
+    try:
+        with zipfile.ZipFile(io.BytesIO(zip_bytes)) as zf:
+            file_entries: list[tuple[str, bytes]] = []
+            for info in zf.infolist():
+                if info.is_dir():
+                    continue
+                name = Path(info.filename).name  # strip nested folder paths
+                # Filter before reading: the ingest core would drop unwanted
+                # files anyway, so there is no point decompressing them.
+                if not name or not _is_wanted_nrl_file(name):
+                    continue
+                file_entries.append((name, zf.read(info)))
+    except zipfile.BadZipFile as exc:
+        # Chain the cause so logs keep the underlying archive error.
+        raise IngestError("Downloaded data is not a valid ZIP archive.") from exc
+
+    return _ingest_file_entries(location, file_entries, db, source=source, dedupe=dedupe)
+
+
@router.post("/nrl/{location_id}/upload-data")
 async def upload_nrl_data(
    project_id: str,
@@ -1754,11 +2031,13 @@ async def upload_nrl_data(
    - A single .zip file (the Auto_#### folder zipped) — auto-extracted
    - Multiple .rnd / .rnh files selected directly from the SD card folder

-    Creates a MonitoringSession from .rnh metadata and DataFile records
-    for each measurement file. No unit assignment required.
+    Normalizes the upload to (filename, bytes) entries, then hands off to the
+    shared ingest core (`_ingest_file_entries`) — the same path the scheduled
+    FTP pull uses via `ingest_nrl_zip`.  Creates a MonitoringSession from the
+    .rnh metadata and DataFile records for each measurement file.  No unit
+    assignment required.  dedupe=False here preserves the prior manual-upload
+    behaviour (re-uploading creates a fresh session).
    """
-    from datetime import datetime
-
    # Verify project and location exist
    project = db.query(Project).filter_by(id=project_id).first()
    _require_module(project, "sound_monitoring", db)
@@ -1769,7 +2048,7 @@ async def upload_nrl_data(
    if not location:
        raise HTTPException(status_code=404, detail="Location not found")

-    # --- Step 1: Normalize to (filename, bytes) list ---
+    # --- Normalize upload to (filename, bytes) entries ---
    file_entries: list[tuple[str, bytes]] = []

    if len(files) == 1 and files[0].filename.lower().endswith(".zip"):
@@ -1793,145 +2072,11 @@ async def upload_nrl_data(
    if not file_entries:
        raise HTTPException(status_code=400, detail="No usable files found in upload.")

-    # --- Step 1b: Filter to only relevant files ---
-    # Keep: .rnh (metadata) and measurement .rnd files
-    #   NL-43 generates two .rnd types: _Leq_ (15-min averages, wanted) and _Lp_ (1-sec granular, skip)
-    #   AU2 (NL-23/older Rion) generates a single Au2_####.rnd per session — always keep those
-    # Drop: _Lp_ .rnd, .xlsx, .mp3, and anything else
-    def _is_wanted(fname: str) -> bool:
-        n = fname.lower()
-        if n.endswith(".rnh"):
-            return True
-        if n.endswith(".rnd"):
-            if "_leq_" in n:        # NL-43 Leq file
-                return True
-            if n.startswith("au2_"): # AU2 format (NL-23) — always Leq equivalent
-                return True
-            if "_lp" not in n and "_leq_" not in n:
-                # Unknown .rnd format — include it so we don't silently drop data
-                return True
-        return False
-
-    file_entries = [(fname, fbytes) for fname, fbytes in file_entries if _is_wanted(fname)]
-
-    if not file_entries:
-        raise HTTPException(status_code=400, detail="No usable .rnd or .rnh files found. Expected NL-43 _Leq_ files or AU2 format .rnd files.")
-
-    # --- Step 2: Find and parse .rnh metadata ---
-    rnh_meta = {}
-    for fname, fbytes in file_entries:
-        if fname.lower().endswith(".rnh"):
-            rnh_meta = _parse_rnh(fbytes)
-            break
-
-    # RNH files store local time (no UTC offset).  Use local values for period
-    # classification / label generation, then convert to UTC for DB storage so
-    # the local_datetime Jinja filter displays the correct time.
-    started_at_local = _parse_rnh_datetime(rnh_meta.get("start_time_str")) or datetime.utcnow()
-    stopped_at_local = _parse_rnh_datetime(rnh_meta.get("stop_time_str"))
-
-    started_at = local_to_utc(started_at_local)
-    stopped_at = local_to_utc(stopped_at_local) if stopped_at_local else None
-
-    duration_seconds = None
-    if started_at and stopped_at:
-        duration_seconds = int((stopped_at - started_at).total_seconds())
-
-    store_name = rnh_meta.get("store_name", "")
-    serial_number = rnh_meta.get("serial_number", "")
-    index_number = rnh_meta.get("index_number", "")
-
-    # --- Step 3: Create MonitoringSession ---
-    # Use local times for period/label so classification reflects the clock at the site.
-    period_type = _derive_period_type(started_at_local) if started_at_local else None
-    session_label = _build_session_label(started_at_local, location.name, period_type) if started_at_local else None
-
-    session_id = str(uuid.uuid4())
-    monitoring_session = MonitoringSession(
-        id=session_id,
-        project_id=project_id,
-        location_id=location_id,
-        unit_id=None,
-        session_type="sound",
-        started_at=started_at,
-        stopped_at=stopped_at,
-        duration_seconds=duration_seconds,
-        status="completed",
-        session_label=session_label,
-        period_type=period_type,
-        session_metadata=json.dumps({
-            "source": "manual_upload",
-            "store_name": store_name,
-            "serial_number": serial_number,
-            "index_number": index_number,
-        }),
-    )
-    db.add(monitoring_session)
-    db.commit()
-    db.refresh(monitoring_session)
-
-    # --- Step 4: Write files to disk and create DataFile records ---
-    output_dir = Path("data/Projects") / project_id / session_id
-    output_dir.mkdir(parents=True, exist_ok=True)
-
-    leq_count = 0
-    lp_count = 0
-    metadata_count = 0
-    files_imported = 0
-
-    for fname, fbytes in file_entries:
-        file_type = _classify_file(fname)
-        fname_lower = fname.lower()
-
-        # Track counts for summary
-        if fname_lower.endswith(".rnd"):
-            if "_leq_" in fname_lower:
-                leq_count += 1
-            elif "_lp" in fname_lower:
-                lp_count += 1
-        elif fname_lower.endswith(".rnh"):
-            metadata_count += 1
-
-        # Write to disk
-        dest = output_dir / fname
-        dest.write_bytes(fbytes)
-
-        # Compute checksum
-        checksum = hashlib.sha256(fbytes).hexdigest()
-
-        # Store relative path from data/ dir
-        rel_path = str(dest.relative_to("data"))
-
-        data_file = DataFile(
-            id=str(uuid.uuid4()),
-            session_id=session_id,
-            file_path=rel_path,
-            file_type=file_type,
-            file_size_bytes=len(fbytes),
-            downloaded_at=datetime.utcnow(),
-            checksum=checksum,
-            file_metadata=json.dumps({
-                "source": "manual_upload",
-                "original_filename": fname,
-                "store_name": store_name,
-            }),
-        )
-        db.add(data_file)
-        files_imported += 1
-
-    db.commit()
-
-    return {
-        "success": True,
-        "session_id": session_id,
-        "files_imported": files_imported,
-        "leq_files": leq_count,
-        "lp_files": lp_count,
-        "metadata_files": metadata_count,
-        "store_name": store_name,
-        "started_at": started_at.isoformat() if started_at else None,
-        "stopped_at": stopped_at.isoformat() if stopped_at else None,
-    }
+    # --- Hand off to the shared ingest core ---
+    try:
+        return _ingest_file_entries(location, file_entries, db, source="manual_upload", dedupe=False)
+    except IngestError as e:
+        raise HTTPException(status_code=400, detail=str(e))


 # ============================================================================