""" project_tidy.py — find duplicate-looking projects + offer bulk merge. The metadata-backfill parser is good at clustering events into candidate projects but doesn't compare its proposed project names against EACH OTHER (it only checks against existing terra-view projects). After a bulk apply, you can end up with many near-duplicate projects — typo variants, abbreviation differences, etc. This module surfaces them as pairs the operator can merge. Pairs vs clusters: a fully-connected group like (A, B, C) where each pair scores >= threshold becomes 3 pairs. The operator has to do 2 merges to fully consolidate. We don't try to be smarter about transitive grouping — in practice operators want to review the highest-similarity pair first anyway, and the list re-computes after each merge. Public API: find_duplicate_pairs(db, *, threshold=0.85, max_pairs=200) → list[DuplicatePair] """ from __future__ import annotations import logging from dataclasses import dataclass from typing import Optional import rapidfuzz from sqlalchemy import func from sqlalchemy.orm import Session from backend.models import ( Project, MonitoringLocation, UnitAssignment, ) from backend.services.metadata_backfill import _normalise as _meta_normalise log = logging.getLogger("backend.services.project_tidy") DEFAULT_THRESHOLD = 0.85 # WRatio similarity above which we surface a pair DEFAULT_MAX_PAIRS = 200 # Cap the result list to keep response small MIN_NORMALISED_LENGTH = 4 # Skip projects whose normalised name is too short # to fuzzy-match safely (avoids "1" / "1" pairs). @dataclass class ProjectSummary: id: str name: str project_number: Optional[str] client_name: Optional[str] source: str # 'manual' | 'metadata_backfill' | ... status: str location_count: int assignment_count: int event_count_total: int # approx — sum across assignments @dataclass class DuplicatePair: a: ProjectSummary b: ProjectSummary score: float suggested_target_id: str # the recommended "keep" side reason: str # why we picked that target # ── Helpers ────────────────────────────────────────────────────────────────── def _normalise_project_name(name: str) -> str: """Project-name normalisation for tidy comparison. Reuses the metadata_backfill normaliser (lowercase, punctuation→space, collapse whitespace). Returns "" for None or all-punctuation names. """ return _meta_normalise(name) def _summarise_projects(db: Session) -> list[ProjectSummary]: """One row per active project with cached counts. Excludes deleted.""" projects = ( db.query(Project) .filter(Project.status != "deleted") .all() ) # Bulk lookup: assignment counts + location counts per project. loc_counts: dict[str, int] = dict( db.query(MonitoringLocation.project_id, func.count(MonitoringLocation.id)) .filter(MonitoringLocation.project_id.in_([p.id for p in projects]) if projects else False) .group_by(MonitoringLocation.project_id) .all() ) asgn_counts: dict[str, int] = dict( db.query(UnitAssignment.project_id, func.count(UnitAssignment.id)) .filter(UnitAssignment.project_id.in_([p.id for p in projects]) if projects else False) .group_by(UnitAssignment.project_id) .all() ) summaries: list[ProjectSummary] = [] for p in projects: summaries.append(ProjectSummary( id = p.id, name = p.name, project_number = p.project_number, client_name = p.client_name, source = None, # filled below per assignment status = p.status or "active", location_count = loc_counts.get(p.id, 0), assignment_count = asgn_counts.get(p.id, 0), event_count_total = 0, # not cheap to compute here; left 0 )) # Determine each project's dominant assignment source. Used to break ties # when picking the "keep" target — prefer manual over parser-created. rows = ( db.query(UnitAssignment.project_id, UnitAssignment.source, func.count(UnitAssignment.id)) .group_by(UnitAssignment.project_id, UnitAssignment.source) .all() ) by_proj_src: dict[str, dict[str, int]] = {} for proj_id, src, cnt in rows: by_proj_src.setdefault(proj_id, {})[src or "manual"] = cnt for s in summaries: src_map = by_proj_src.get(s.id, {}) if not src_map: s.source = "manual" else: # Dominant source (most assignments). s.source = max(src_map.items(), key=lambda kv: kv[1])[0] return summaries def _pick_target(a: ProjectSummary, b: ProjectSummary) -> tuple[str, str]: """Decide which project should be the merge target (the one we keep). Priorities (in order): 1. The one with `source='manual'` over `source='metadata_backfill'` — operator-curated projects beat parser-created ones. 2. The one with a populated `project_number`. 3. The one with more locations (more curation history). 4. The one with more assignments. 5. The one with the shorter, cleaner name (tiebreaker). Returns (target_id, reason_string). """ # 1. Source provenance. a_manual = a.source == "manual" b_manual = b.source == "manual" if a_manual and not b_manual: return a.id, "A is manually-created; B is parser-created" if b_manual and not a_manual: return b.id, "B is manually-created; A is parser-created" # 2. project_number populated. if a.project_number and not b.project_number: return a.id, "A has a project_number; B doesn't" if b.project_number and not a.project_number: return b.id, "B has a project_number; A doesn't" # 3. More locations. if a.location_count > b.location_count: return a.id, f"A has more locations ({a.location_count} vs {b.location_count})" if b.location_count > a.location_count: return b.id, f"B has more locations ({b.location_count} vs {a.location_count})" # 4. More assignments. if a.assignment_count > b.assignment_count: return a.id, f"A has more assignments ({a.assignment_count} vs {b.assignment_count})" if b.assignment_count > a.assignment_count: return b.id, f"B has more assignments ({b.assignment_count} vs {a.assignment_count})" # 5. Shorter name (less likely to have baked-in junk). if len(a.name) <= len(b.name): return a.id, "A has the shorter / cleaner name" return b.id, "B has the shorter / cleaner name" # ── Public ─────────────────────────────────────────────────────────────────── def find_duplicate_pairs( db: Session, *, threshold: float = DEFAULT_THRESHOLD, max_pairs: int = DEFAULT_MAX_PAIRS, ) -> list[DuplicatePair]: """Compute all project-pair similarities above `threshold`. O(N^2) over the project count — fine up to ~500 projects; beyond that we'd want a blocked / token-indexed approach. In practice `metadata_backfill` projects tend to share tokens, so a simple pre-filter (skip pairs that share NO tokens) would cheaply cut the inner loop. Deferred until profiling motivates it. """ summaries = _summarise_projects(db) # Pre-compute normalised names; skip too-short ones. norm_by_id: dict[str, str] = {} candidates: list[ProjectSummary] = [] for s in summaries: n = _normalise_project_name(s.name) if len(n) < MIN_NORMALISED_LENGTH: continue norm_by_id[s.id] = n candidates.append(s) pairs: list[DuplicatePair] = [] n = len(candidates) for i in range(n): a = candidates[i] a_norm = norm_by_id[a.id] for j in range(i + 1, n): b = candidates[j] b_norm = norm_by_id[b.id] score = rapidfuzz.fuzz.WRatio(a_norm, b_norm) / 100.0 if score < threshold: continue target_id, reason = _pick_target(a, b) pairs.append(DuplicatePair( a = a, b = b, score = score, suggested_target_id = target_id, reason = reason, )) # Sort by score desc, then by total content (more data → review first). pairs.sort(key=lambda p: (-p.score, -(p.a.assignment_count + p.b.assignment_count))) return pairs[:max_pairs]