terra-view/backend/services/project_tidy.py

"""
project_tidy.py — find duplicate-looking projects + offer bulk merge.

The metadata-backfill parser is good at clustering events into candidate
projects but doesn't compare its proposed project names against EACH OTHER
(it only checks against existing terra-view projects).  After a bulk
apply, you can end up with many near-duplicate projects — typo variants,
abbreviation differences, etc.  This module surfaces them as pairs the
operator can merge.

Pairs vs clusters: a fully-connected group like (A, B, C) where each pair
scores >= threshold becomes 3 pairs.  The operator has to do 2 merges to
fully consolidate.  We don't try to be smarter about transitive grouping —
in practice operators want to review the highest-similarity pair first
anyway, and the list re-computes after each merge.

Public API:
  find_duplicate_pairs(db, *, threshold=0.85, max_pairs=200) → list[DuplicatePair]
"""

from __future__ import annotations

import logging
from dataclasses import dataclass
from typing import Optional

import rapidfuzz
from sqlalchemy import func
from sqlalchemy.orm import Session

from backend.models import (
    Project,
    MonitoringLocation,
    UnitAssignment,
)
from backend.services.metadata_backfill import _normalise as _meta_normalise

log = logging.getLogger("backend.services.project_tidy")


DEFAULT_THRESHOLD       = 0.85   # WRatio similarity above which we surface a pair
DEFAULT_MAX_PAIRS       = 200    # Cap the result list to keep response small
MIN_NORMALISED_LENGTH   = 4      # Skip projects whose normalised name is too short
                                 # to fuzzy-match safely (avoids "1" / "1" pairs).


@dataclass
class ProjectSummary:
    id:                str
    name:              str
    project_number:    Optional[str]
    client_name:       Optional[str]
    source:            str            # 'manual' | 'metadata_backfill' | ...
    status:            str
    location_count:    int
    assignment_count:  int
    event_count_total: int            # approx — sum across assignments


@dataclass
class DuplicatePair:
    a:           ProjectSummary
    b:           ProjectSummary
    score:       float
    suggested_target_id: str          # the recommended "keep" side
    reason:      str                  # why we picked that target


# ── Helpers ──────────────────────────────────────────────────────────────────


def _normalise_project_name(name: str) -> str:
    """Project-name normalisation for tidy comparison.

    Reuses the metadata_backfill normaliser (lowercase, punctuation→space,
    collapse whitespace).  Returns "" for None or all-punctuation names.
    """
    return _meta_normalise(name)


def _summarise_projects(db: Session) -> list[ProjectSummary]:
    """One row per active project with cached counts.  Excludes deleted."""
    projects = (
        db.query(Project)
        .filter(Project.status != "deleted")
        .all()
    )

    # Bulk lookup: assignment counts + location counts per project.
    loc_counts: dict[str, int] = dict(
        db.query(MonitoringLocation.project_id, func.count(MonitoringLocation.id))
        .filter(MonitoringLocation.project_id.in_([p.id for p in projects]) if projects else False)
        .group_by(MonitoringLocation.project_id)
        .all()
    )
    asgn_counts: dict[str, int] = dict(
        db.query(UnitAssignment.project_id, func.count(UnitAssignment.id))
        .filter(UnitAssignment.project_id.in_([p.id for p in projects]) if projects else False)
        .group_by(UnitAssignment.project_id)
        .all()
    )

    summaries: list[ProjectSummary] = []
    for p in projects:
        summaries.append(ProjectSummary(
            id                = p.id,
            name              = p.name,
            project_number    = p.project_number,
            client_name       = p.client_name,
            source            = None,                       # filled below per assignment
            status            = p.status or "active",
            location_count    = loc_counts.get(p.id, 0),
            assignment_count  = asgn_counts.get(p.id, 0),
            event_count_total = 0,                          # not cheap to compute here; left 0
        ))

    # Determine each project's dominant assignment source.  Used to break ties
    # when picking the "keep" target — prefer manual over parser-created.
    rows = (
        db.query(UnitAssignment.project_id, UnitAssignment.source, func.count(UnitAssignment.id))
        .group_by(UnitAssignment.project_id, UnitAssignment.source)
        .all()
    )
    by_proj_src: dict[str, dict[str, int]] = {}
    for proj_id, src, cnt in rows:
        by_proj_src.setdefault(proj_id, {})[src or "manual"] = cnt
    for s in summaries:
        src_map = by_proj_src.get(s.id, {})
        if not src_map:
            s.source = "manual"
        else:
            # Dominant source (most assignments).
            s.source = max(src_map.items(), key=lambda kv: kv[1])[0]

    return summaries


def _pick_target(a: ProjectSummary, b: ProjectSummary) -> tuple[str, str]:
    """Decide which project should be the merge target (the one we keep).

    Priorities (in order):
      1. The one with `source='manual'` over `source='metadata_backfill'`
         — operator-curated projects beat parser-created ones.
      2. The one with a populated `project_number`.
      3. The one with more locations (more curation history).
      4. The one with more assignments.
      5. The one with the shorter, cleaner name (tiebreaker).

    Returns (target_id, reason_string).
    """
    # 1. Source provenance.
    a_manual = a.source == "manual"
    b_manual = b.source == "manual"
    if a_manual and not b_manual:
        return a.id, "A is manually-created; B is parser-created"
    if b_manual and not a_manual:
        return b.id, "B is manually-created; A is parser-created"

    # 2. project_number populated.
    if a.project_number and not b.project_number:
        return a.id, "A has a project_number; B doesn't"
    if b.project_number and not a.project_number:
        return b.id, "B has a project_number; A doesn't"

    # 3. More locations.
    if a.location_count > b.location_count:
        return a.id, f"A has more locations ({a.location_count} vs {b.location_count})"
    if b.location_count > a.location_count:
        return b.id, f"B has more locations ({b.location_count} vs {a.location_count})"

    # 4. More assignments.
    if a.assignment_count > b.assignment_count:
        return a.id, f"A has more assignments ({a.assignment_count} vs {b.assignment_count})"
    if b.assignment_count > a.assignment_count:
        return b.id, f"B has more assignments ({b.assignment_count} vs {a.assignment_count})"

    # 5. Shorter name (less likely to have baked-in junk).
    if len(a.name) <= len(b.name):
        return a.id, "A has the shorter / cleaner name"
    return b.id, "B has the shorter / cleaner name"


# ── Public ───────────────────────────────────────────────────────────────────


def find_duplicate_pairs(
    db: Session,
    *,
    threshold: float = DEFAULT_THRESHOLD,
    max_pairs: int = DEFAULT_MAX_PAIRS,
) -> list[DuplicatePair]:
    """Compute all project-pair similarities above `threshold`.

    O(N^2) over the project count — fine up to ~500 projects; beyond that
    we'd want a blocked / token-indexed approach.  In practice
    `metadata_backfill` projects tend to share tokens, so a simple
    pre-filter (skip pairs that share NO tokens) would cheaply cut the
    inner loop.  Deferred until profiling motivates it.
    """
    summaries = _summarise_projects(db)

    # Pre-compute normalised names; skip too-short ones.
    norm_by_id: dict[str, str] = {}
    candidates: list[ProjectSummary] = []
    for s in summaries:
        n = _normalise_project_name(s.name)
        if len(n) < MIN_NORMALISED_LENGTH:
            continue
        norm_by_id[s.id] = n
        candidates.append(s)

    pairs: list[DuplicatePair] = []
    n = len(candidates)
    for i in range(n):
        a = candidates[i]
        a_norm = norm_by_id[a.id]
        for j in range(i + 1, n):
            b = candidates[j]
            b_norm = norm_by_id[b.id]
            score = rapidfuzz.fuzz.WRatio(a_norm, b_norm) / 100.0
            if score < threshold:
                continue
            target_id, reason = _pick_target(a, b)
            pairs.append(DuplicatePair(
                a                   = a,
                b                   = b,
                score               = score,
                suggested_target_id = target_id,
                reason              = reason,
            ))

    # Sort by score desc, then by total content (more data → review first).
    pairs.sort(key=lambda p: (-p.score, -(p.a.assignment_count + p.b.assignment_count)))

    return pairs[:max_pairs]