""" Metadata-backfill admin router. Endpoints under /api/admin/metadata_backfill: GET /scan — run the scan; return clusters + suggestions (JSON). Cached 5 minutes so the wizard doesn't re-scan on every page render. POST /apply — apply a list of cluster_ids; body specifies which to accept and optional per-cluster overrides. POST /skip — mark cluster_ids as skipped (won't reappear). """ from __future__ import annotations import os import time from typing import Optional from fastapi import APIRouter, Depends, HTTPException, Request from fastapi.responses import JSONResponse from sqlalchemy.orm import Session from backend.database import get_db from backend.services import metadata_backfill as svc router = APIRouter(prefix="/api/admin/metadata_backfill", tags=["metadata-backfill"]) SFM_BASE_URL = os.getenv("SFM_BASE_URL", "http://localhost:8200") # In-process scan cache. Trades memory for not re-hammering SFM on every # wizard render. TTL: 5 minutes. Singleton per-process; fine for a # single-worker uvicorn dev setup. For prod multi-worker we'd want to put # this in the DB or Redis; deferred. _SCAN_CACHE: dict = {"at": 0.0, "result": None} _SCAN_CACHE_TTL_SECONDS = 300.0 def _serialise_suggestion(s: svc.Suggestion) -> dict: c = s.cluster return { "cluster_id": c.cluster_id, "serial": c.serial, "first_event_ts": c.first_event_ts.isoformat(), "last_event_ts": c.last_event_ts.isoformat(), "event_count": c.event_count, "sample_event_id": c.sample_event_id, "project_raw": c.project_raw, "location_raw": c.location_raw, "client_raw": c.client_raw, "operator_raw": c.operator_raw, "is_blank_meta": c.is_blank_meta, "metadata_consistency": c.metadata_consistency, "project_match": s.project_match, "project_existing_id": s.project_existing_id, "project_existing_name": s.project_existing_name, "project_match_score": s.project_match_score, "project_suggested_name": s.project_suggested_name, "location_match": s.location_match, "location_existing_id": s.location_existing_id, "location_existing_name": s.location_existing_name, "location_match_score": s.location_match_score, "location_suggested_name": s.location_suggested_name, "proposed_assigned_at": s.proposed_assigned_at.isoformat(), "proposed_assigned_until": s.proposed_assigned_until.isoformat() if s.proposed_assigned_until else None, "confidence": s.confidence, "blocking_conflict": s.blocking_conflict, "conflicts": [ { "existing_assignment_id": cf.existing_assignment_id, "other_location_id": cf.other_location_id, "other_location_name": cf.other_location_name, "other_project_id": cf.other_project_id, "other_project_name": cf.other_project_name, } for cf in s.conflicts ], } @router.get("/scan") async def scan( force: bool = False, db: Session = Depends(get_db), ): """Run a scan and return clusters + suggestions. Set force=true to bypass the 5-minute cache. """ now = time.time() if not force and _SCAN_CACHE["result"] is not None \ and (now - _SCAN_CACHE["at"]) < _SCAN_CACHE_TTL_SECONDS: return _SCAN_CACHE["result"] result = await svc.scan_clusters_and_build_suggestions(db, SFM_BASE_URL) # Group suggestions for the wizard UI. by_confidence = {"high": [], "medium": [], "low": []} blocking_conflict_count = 0 for s in result.suggestions: by_confidence[s.confidence].append(_serialise_suggestion(s)) if s.blocking_conflict: blocking_conflict_count += 1 payload = { "scanned_event_count": result.scanned_event_count, "cluster_count": result.cluster_count, "already_attributed": result.already_attributed, "skipped_orphans": result.skipped_orphans, "pending_count": len(result.suggestions), "blocking_conflict_count": blocking_conflict_count, "by_confidence": { "high": by_confidence["high"], "medium": by_confidence["medium"], "low": by_confidence["low"], }, "scanned_at": now, } _SCAN_CACHE["result"] = payload _SCAN_CACHE["at"] = now return payload @router.post("/apply") async def apply( request: Request, db: Session = Depends(get_db), ): """Apply a list of clusters. Body: { "cluster_ids": ["abc...", "def..."], "overrides": { "abc...": { "project_name": "...", "location_name": "..." } } } To accept ALL non-conflict suggestions in one shot, the UI sends every pending cluster_id with no overrides. """ try: body = await request.json() except Exception: raise HTTPException(status_code=400, detail="Invalid JSON body") cluster_ids = body.get("cluster_ids") or [] overrides = body.get("overrides") or {} if not isinstance(cluster_ids, list) or not cluster_ids: raise HTTPException(status_code=400, detail="cluster_ids must be a non-empty list") # Re-scan to get current suggestions. We don't trust the cached scan # blindly — the operator might have manually created projects in # between scan and apply. scan_result = await svc.scan_clusters_and_build_suggestions(db, SFM_BASE_URL) suggestions_by_id = {s.cluster.cluster_id: s for s in scan_result.suggestions} selected: list[svc.Suggestion] = [] not_found: list[str] = [] for cid in cluster_ids: s = suggestions_by_id.get(cid) if s is None: not_found.append(cid) continue # Apply overrides. ov = overrides.get(cid) or {} if "project_name" in ov: s.project_suggested_name = (ov["project_name"] or "").strip() or s.project_suggested_name # Override implies operator wants to create new (or rename). # If they wanted an exact match, they'd not have overridden. if s.project_match in ("create_new",): pass # keep create_new else: # Operator typed a custom name — force create-new behaviour # so we don't accidentally attach to a different existing # project by exact-match. s.project_existing_id = None s.project_match = "create_new" if "location_name" in ov: s.location_suggested_name = (ov["location_name"] or "").strip() or s.location_suggested_name if s.location_match in ("create_new",): pass else: s.location_existing_id = None s.location_match = "create_new" selected.append(s) apply_result = svc.apply_suggestions(db, selected, decided_by="operator") # Invalidate the scan cache so the next /scan picks up the new state. _SCAN_CACHE["at"] = 0.0 _SCAN_CACHE["result"] = None return { "applied": apply_result.applied, "failed": [{"cluster_id": cid, "reason": r} for cid, r in apply_result.failed], "not_found": not_found, "project_ids_created": apply_result.project_ids_created, "location_ids_created": apply_result.location_ids_created, "assignment_ids_created": apply_result.assignment_ids_created, } @router.post("/skip") async def skip( request: Request, db: Session = Depends(get_db), ): """Mark cluster_ids as skipped — they won't reappear in future scans.""" try: body = await request.json() except Exception: raise HTTPException(status_code=400, detail="Invalid JSON body") cluster_ids = body.get("cluster_ids") or [] if not isinstance(cluster_ids, list): raise HTTPException(status_code=400, detail="cluster_ids must be a list") n = svc.skip_clusters(db, cluster_ids, decided_by="operator") _SCAN_CACHE["at"] = 0.0 _SCAN_CACHE["result"] = None return {"skipped": n}