diff --git a/backend/main.py b/backend/main.py
index 7397b8e..6c99b7d 100644
--- a/backend/main.py
+++ b/backend/main.py
@@ -251,6 +251,13 @@ async def metadata_backfill_wizard_page(request: Request):
     return templates.TemplateResponse("admin/metadata_backfill.html", {"request": request})
 
 
+@app.get("/settings/developer/project-tidy", response_class=HTMLResponse)
+async def project_tidy_page(request: Request):
+    """Tidy duplicate-looking projects: detect by fuzzy name match, merge
+    by clicking through pairs (Phase 5b)."""
+    return templates.TemplateResponse("admin/project_tidy.html", {"request": request})
+
+
 @app.get("/modems", response_class=HTMLResponse)
 async def modems_page(request: Request):
     """Field modems management dashboard"""
diff --git a/backend/routers/projects.py b/backend/routers/projects.py
index 13cb23a..3c61236 100644
--- a/backend/routers/projects.py
+++ b/backend/routers/projects.py
@@ -729,6 +729,58 @@ async def project_merge_preview(
     }
 
 
+@router.get("/admin/duplicate_pairs")
+async def get_duplicate_pairs(
+    threshold: float = 0.85,
+    max_pairs: int = 200,
+    db: Session = Depends(get_db),
+):
+    """Return all active-project pairs whose names fuzzy-match above the
+    threshold. Used by the Tidy page to surface duplicates that would
+    otherwise have to be hunted down one at a time.
+
+    Each pair carries a suggested merge-target with the reasoning so the
+    operator can decide direction with one click.
+
+    `threshold` is clamped to [0, 1] and a negative `max_pairs` to 0; the
+    response echoes the threshold actually used.
+    """
+    from backend.services import project_tidy as pt
+
+    # Query params are untrusted: similarity scores live in [0, 1], and a
+    # negative max_pairs would slice pairs off the END of the result list.
+    threshold = min(max(threshold, 0.0), 1.0)
+    max_pairs = max(max_pairs, 0)
+    pairs = pt.find_duplicate_pairs(db, threshold=threshold, max_pairs=max_pairs)
+
+    # JSON-safe view of one side of a pair (a ProjectSummary).
+    def _ps(p):
+        return {
+            "id": p.id,
+            "name": p.name,
+            "project_number": p.project_number,
+            "client_name": p.client_name,
+            "source": p.source,
+            "status": p.status,
+            "location_count": p.location_count,
+            "assignment_count": p.assignment_count,
+        }
+
+    return {
+        "pairs": [
+            {
+                "a": _ps(pair.a),
+                "b": _ps(pair.b),
+                "score": round(pair.score, 3),
+                "suggested_target_id": pair.suggested_target_id,
+                "reason": pair.reason,
+            }
+            for pair in pairs
+        ],
+        "threshold": threshold,
+    }
+
+
 @router.post("/{source_id}/merge_into")
 async def project_merge_execute(
     source_id: str,
diff --git a/backend/services/project_tidy.py b/backend/services/project_tidy.py
new file mode 100644
index 0000000..482b517
--- /dev/null
+++ b/backend/services/project_tidy.py
@@ -0,0 +1,235 @@
+"""
+project_tidy.py — find duplicate-looking projects + offer bulk merge.
+
+The metadata-backfill parser is good at clustering events into candidate
+projects but doesn't compare its proposed project names against EACH OTHER
+(it only checks against existing terra-view projects). After a bulk
+apply, you can end up with many near-duplicate projects — typo variants,
+abbreviation differences, etc. This module surfaces them as pairs the
+operator can merge.
+
+Pairs vs clusters: a fully-connected group like (A, B, C) where each pair
+scores >= threshold becomes 3 pairs. The operator has to do 2 merges to
+fully consolidate. We don't try to be smarter about transitive grouping —
+in practice operators want to review the highest-similarity pair first
+anyway, and the list re-computes after each merge.
+
+Public API:
+    find_duplicate_pairs(db, *, threshold=0.85, max_pairs=200) → list[DuplicatePair]
+"""
+
+from __future__ import annotations
+
+import logging
+from dataclasses import dataclass
+from itertools import combinations
+from typing import Optional
+
+import rapidfuzz
+from sqlalchemy import func
+from sqlalchemy.orm import Session
+
+from backend.models import (
+    Project,
+    MonitoringLocation,
+    UnitAssignment,
+)
+from backend.services.metadata_backfill import _normalise as _meta_normalise
+
+log = logging.getLogger("backend.services.project_tidy")
+
+
+DEFAULT_THRESHOLD = 0.85        # WRatio similarity above which we surface a pair
+DEFAULT_MAX_PAIRS = 200         # Cap the result list to keep response small
+MIN_NORMALISED_LENGTH = 4       # Skip projects whose normalised name is too short
+                                # to fuzzy-match safely (avoids "1" / "1" pairs).
+
+
+@dataclass
+class ProjectSummary:
+    id: str
+    name: str
+    project_number: Optional[str]
+    client_name: Optional[str]
+    source: str               # dominant assignment source: 'manual' | 'metadata_backfill' | ...
+    status: str
+    location_count: int
+    assignment_count: int
+    event_count_total: int    # placeholder — not computed yet, always 0
+
+
+@dataclass
+class DuplicatePair:
+    a: ProjectSummary
+    b: ProjectSummary
+    score: float              # similarity scaled to 0..1
+    suggested_target_id: str  # the recommended "keep" side
+    reason: str               # why we picked that target
+
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+
+
+def _normalise_project_name(name: Optional[str]) -> str:
+    """Project-name normalisation for tidy comparison.
+
+    Reuses the metadata_backfill normaliser (lowercase, punctuation→space,
+    collapse whitespace). Returns "" for None or all-punctuation names.
+    """
+    return _meta_normalise(name)
+
+
+def _summarise_projects(db: Session) -> list[ProjectSummary]:
+    """Return one ProjectSummary per non-deleted project, with bulk counts.
+
+    `source` is the dominant assignment source (most assignments; a tie
+    prefers 'manual'); NULL sources and assignment-less projects are 'manual'.
+    """
+    projects = db.query(Project).filter(Project.status != "deleted").all()
+    if not projects:
+        # Nothing to summarise, and no point issuing empty IN () queries.
+        return []
+    project_ids = [p.id for p in projects]
+
+    # Bulk lookup: location counts + assignment counts per project.
+    loc_counts: dict[str, int] = dict(
+        db.query(MonitoringLocation.project_id, func.count(MonitoringLocation.id))
+        .filter(MonitoringLocation.project_id.in_(project_ids))
+        .group_by(MonitoringLocation.project_id)
+        .all()
+    )
+    asgn_counts: dict[str, int] = dict(
+        db.query(UnitAssignment.project_id, func.count(UnitAssignment.id))
+        .filter(UnitAssignment.project_id.in_(project_ids))
+        .group_by(UnitAssignment.project_id)
+        .all()
+    )
+
+    # Assignment counts per (project, source). Used to break ties when
+    # picking the "keep" target — prefer manual over parser-created.
+    rows = (
+        db.query(UnitAssignment.project_id, UnitAssignment.source, func.count(UnitAssignment.id))
+        .filter(UnitAssignment.project_id.in_(project_ids))
+        .group_by(UnitAssignment.project_id, UnitAssignment.source)
+        .all()
+    )
+    by_proj_src: dict[str, dict[str, int]] = {}
+    for proj_id, src, cnt in rows:
+        # NULL and 'manual' share a bucket — accumulate, don't overwrite.
+        bucket = by_proj_src.setdefault(proj_id, {})
+        bucket[src or "manual"] = bucket.get(src or "manual", 0) + cnt
+
+    def _dominant_source(project_id: str) -> str:
+        src_map = by_proj_src.get(project_id)
+        if not src_map:
+            return "manual"
+        # Most assignments wins; a tie prefers 'manual'.
+        return max(src_map, key=lambda s: (src_map[s], s == "manual"))
+
+    return [
+        ProjectSummary(
+            id=p.id,
+            name=p.name,
+            project_number=p.project_number,
+            client_name=p.client_name,
+            source=_dominant_source(p.id),
+            status=p.status or "active",
+            location_count=loc_counts.get(p.id, 0),
+            assignment_count=asgn_counts.get(p.id, 0),
+            event_count_total=0,  # not cheap to compute here; left 0
+        )
+        for p in projects
+    ]
+
+
+def _pick_target(a: ProjectSummary, b: ProjectSummary) -> tuple[str, str]:
+    """Decide which project should be the merge target (the one we keep).
+
+    Priorities (in order):
+      1. The one with `source='manual'` over `source='metadata_backfill'`
+         — operator-curated projects beat parser-created ones.
+      2. The one with a populated `project_number`.
+      3. The one with more locations (more curation history).
+      4. The one with more assignments.
+      5. The one with the shorter, cleaner name (tiebreaker).
+
+    Returns (target_id, reason_string).
+    """
+    # 1. Source provenance.
+    a_manual = a.source == "manual"
+    b_manual = b.source == "manual"
+    if a_manual and not b_manual:
+        return a.id, "A is manually-created; B is parser-created"
+    if b_manual and not a_manual:
+        return b.id, "B is manually-created; A is parser-created"
+
+    # 2. project_number populated.
+    if a.project_number and not b.project_number:
+        return a.id, "A has a project_number; B doesn't"
+    if b.project_number and not a.project_number:
+        return b.id, "B has a project_number; A doesn't"
+
+    # 3. More locations.
+    if a.location_count > b.location_count:
+        return a.id, f"A has more locations ({a.location_count} vs {b.location_count})"
+    if b.location_count > a.location_count:
+        return b.id, f"B has more locations ({b.location_count} vs {a.location_count})"
+
+    # 4. More assignments.
+    if a.assignment_count > b.assignment_count:
+        return a.id, f"A has more assignments ({a.assignment_count} vs {b.assignment_count})"
+    if b.assignment_count > a.assignment_count:
+        return b.id, f"B has more assignments ({b.assignment_count} vs {a.assignment_count})"
+
+    # 5. Shorter name (less likely to have baked-in junk).
+    if len(a.name) <= len(b.name):
+        return a.id, "A has the shorter / cleaner name"
+    return b.id, "B has the shorter / cleaner name"
+
+
+# ── Public ───────────────────────────────────────────────────────────────────
+
+
+def find_duplicate_pairs(
+    db: Session,
+    *,
+    threshold: float = DEFAULT_THRESHOLD,
+    max_pairs: int = DEFAULT_MAX_PAIRS,
+) -> list[DuplicatePair]:
+    """Return project pairs whose normalised names score >= `threshold`.
+
+    Scores are rapidfuzz WRatio scaled to 0..1. Results are sorted best-first
+    and capped at `max_pairs` (a negative cap is treated as 0).
+
+    O(N^2) over the project count — fine up to ~500 projects; beyond that
+    we'd want a blocked / token-indexed approach. In practice
+    `metadata_backfill` projects tend to share tokens, so a simple
+    pre-filter (skip pairs that share NO tokens) would cheaply cut the
+    inner loop. Deferred until profiling motivates it.
+    """
+    # Pair each summary with its normalised name; skip too-short ones.
+    candidates: list[tuple[ProjectSummary, str]] = []
+    for s in _summarise_projects(db):
+        norm = _normalise_project_name(s.name)
+        if len(norm) >= MIN_NORMALISED_LENGTH:
+            candidates.append((s, norm))
+
+    pairs: list[DuplicatePair] = []
+    for (a, a_norm), (b, b_norm) in combinations(candidates, 2):
+        score = rapidfuzz.fuzz.WRatio(a_norm, b_norm) / 100.0
+        if score < threshold:
+            continue
+        target_id, reason = _pick_target(a, b)
+        pairs.append(DuplicatePair(
+            a=a,
+            b=b,
+            score=score,
+            suggested_target_id=target_id,
+            reason=reason,
+        ))
+
+    # Sort by score desc, then by total content (more data → review first).
+    pairs.sort(key=lambda p: (-p.score, -(p.a.assignment_count + p.b.assignment_count)))
+
+    # Clamp: a negative cap would otherwise slice from the end (pairs[:-k]).
+    return pairs[:max(max_pairs, 0)]
diff --git a/templates/admin/metadata_backfill.html b/templates/admin/metadata_backfill.html
index 9b66412..84459cd 100644
--- a/templates/admin/metadata_backfill.html
+++ b/templates/admin/metadata_backfill.html
@@ -595,9 +595,24 @@ async function _apply(clusterIds) {
         });
         if (!r.ok) throw new Error('HTTP ' + r.status);
         const d = await r.json();
-        const sub = `${d.applied} applied · ${d.project_ids_created.length} new project(s) · ${d.location_ids_created.length} new location(s)` + (d.failed.length ? ` · ${d.failed.length} failed` : '');
-        _showToast(`${d.applied} cluster${d.applied === 1 ? '' : 's'} applied`, sub, d.failed.length ?
'error' : 'success'); - _hideToast(4000); + const failedCount = (d.failed || []).length; + + // Three states: + // total success — applied N, no failures → green toast, 4s + // partial — applied N, M failures → red toast + modal listing reasons + // total failure — applied 0, failures → red toast + modal + if (failedCount === 0) { + const sub = `${d.applied} applied · ${d.project_ids_created.length} new project(s) · ${d.location_ids_created.length} new location(s)`; + _showToast(`${d.applied} cluster${d.applied === 1 ? '' : 's'} applied`, sub, 'success'); + _hideToast(4000); + } else { + const title = d.applied > 0 + ? `${d.applied} applied, ${failedCount} failed` + : `Apply failed — ${failedCount} cluster${failedCount === 1 ? '' : 's'} could not be applied`; + _showToast(title, 'See the details panel.', 'error'); + _hideToast(6000); + _showFailureDetails(d.failed); + } await runScan(true); // refresh } catch (e) { _showToast('Apply failed', e.message, 'error'); @@ -605,6 +620,46 @@ async function _apply(clusterIds) { } } +// Modal-ish panel that lists each failed cluster with its server-side +// reason. Common failure modes seen in dev: missing DB tables after a +// stale schema, blocking conflicts that slipped past the front-end guard, +// rapidfuzz/SQLAlchemy edge cases. Operator can dismiss and either +// retry the cluster, skip it, or fix the underlying issue. +function _showFailureDetails(failed) { + let panel = document.getElementById('apply-failure-panel'); + if (!panel) { + panel = document.createElement('div'); + panel.id = 'apply-failure-panel'; + panel.className = 'fixed bottom-6 left-6 right-6 sm:right-auto sm:max-w-xl bg-white dark:bg-slate-800 rounded-xl shadow-2xl border border-red-200 dark:border-red-800 p-4 z-40'; + document.body.appendChild(panel); + } + const rows = failed.map(f => ` +
  • + ${(f.cluster_id || '').slice(0, 8)}… + ${_esc(f.reason || '(no reason)')} +
  • + `).join(''); + panel.innerHTML = ` +
    +

    + + ${failed.length} cluster${failed.length === 1 ? '' : 's'} failed to apply +

    + +
    + +

    + Common causes: missing DB schema (run the migration sweep), blocking conflict + with an existing UnitAssignment, or a UNIQUE constraint collision on the + project name. Re-scan and the failed clusters reappear as pending — fix the + underlying issue and retry. +

    + `; +} + async function applyOne(clusterId) { return _apply([clusterId]); } async function applyBulkHighConfidence() { diff --git a/templates/admin/project_tidy.html b/templates/admin/project_tidy.html new file mode 100644 index 0000000..e314b17 --- /dev/null +++ b/templates/admin/project_tidy.html @@ -0,0 +1,267 @@ +{% extends "base.html" %} + +{% block title %}Project Tidy - Seismo Fleet Manager{% endblock %} + +{% block content %} + +
    + +
    + + +
    +

    Project Tidy

    +

    + Find duplicate-looking projects via fuzzy name matching, then merge them with one click. + Useful after the metadata-backfill parser creates near-duplicates from operator name variations. +

    +
    + + +
    +
    +
    + + +
    + +
    +
    + + +
    +
    + Click "Scan for duplicates" to find pairs. +
    +
    + + + + + +{% endblock %} diff --git a/templates/settings.html b/templates/settings.html index b3c2cb3..caf7e51 100644 --- a/templates/settings.html +++ b/templates/settings.html @@ -574,6 +574,20 @@ Open + + +
    +
    +
    Project Tidy
    +
    + Find duplicate-looking projects via fuzzy name match (typos, abbreviations, spacing variations) and bulk-merge them. +
    +
    + + Open + +