diff --git a/backend/routers/metadata_backfill.py b/backend/routers/metadata_backfill.py index f086020..34e0d2d 100644 --- a/backend/routers/metadata_backfill.py +++ b/backend/routers/metadata_backfill.py @@ -376,7 +376,11 @@ def locations_search( if q_norm in l_norm: scored.append((l, 1.0)) continue - score = svc.similarity(q_norm, l_norm) + # Use the location-specific scorer (token_set_ratio + multi-digit + # penalty) instead of WRatio — same reason as the cluster-match + # path: location names share too much boilerplate vocabulary for + # WRatio to discriminate reliably. + score = svc.location_similarity(q_norm, l_norm) if score >= 0.50: scored.append((l, score)) diff --git a/backend/services/metadata_backfill.py b/backend/services/metadata_backfill.py index 303328a..30e234c 100644 --- a/backend/services/metadata_backfill.py +++ b/backend/services/metadata_backfill.py @@ -162,6 +162,11 @@ def similarity(a: str, b: str) -> float: too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the strings don't exact-match. This guardrails the 'one common word inside a longer phrase' false positive. + + USE FOR: project names (where typos like '1-80' vs 'I-80' should + still match). For location names use `location_similarity()` — + WRatio is too lenient on the shared boilerplate vocabulary in + location strings ('Area', 'Loc', 'Bridge', 'Dam', etc.). """ if not a or not b: return 0.0 @@ -172,6 +177,50 @@ def similarity(a: str, b: str) -> float: return rapidfuzz.fuzz.WRatio(a, b) / 100.0 +# Multi-digit penalty applied when two location names have completely +# disjoint multi-digit numeric tokens (e.g. "87 Jenks" vs "68 Jenks"). +# Single-digit numbers ("Loc 1", "Area 2") are often shared coincidentally, +# but address-style multi-digit numbers are strong identifiers — if they +# differ, the locations are usually different physical places. 
_LOCATION_DIGIT_MISMATCH_PENALTY = 0.30


def location_similarity(a: str, b: str) -> float:
    """Score how alike two location names are, on a 0.0-1.0 scale.

    Location names repeat so much stock vocabulary ('Area', 'Loc',
    'Bridge', 'Dam') that rapidfuzz.WRatio rates clear mismatches highly:
    'Area 2 - Brookville Dam - Loc 2 East' vs 'Area 1 - Loc 1 - 87 Jenks'
    comes out at 85.5 even though the locations are unrelated. This
    scorer is deliberately stricter.

    Rules, applied in order:

    * either string is empty -> 0.0
    * the strings are identical -> 1.0
    * the shorter string has fewer than `_MIN_FUZZY_LEN` characters -> 0.0
    * otherwise `rapidfuzz.fuzz.token_set_ratio(a, b) / 100`, lowered by
      `_LOCATION_DIGIT_MISMATCH_PENALTY` (but never below 0.0) when both
      strings contain runs of two or more digits and share none of them.

    The digit rule targets the "same wording, different address-style
    identifier" case ('87 Jenks' vs '68 Jenks'), which token-set scoring
    alone can still rate very highly. Single digits ('Loc 1', 'Area 2')
    are ignored on purpose: they tend to be shared boilerplate rather
    than discriminating identifiers.
    """
    # Cheap guards first; an exact match wins even for very short names.
    if not (a and b):
        return 0.0
    if a == b:
        return 1.0
    if len(a) < _MIN_FUZZY_LEN or len(b) < _MIN_FUZZY_LEN:
        return 0.0

    score = rapidfuzz.fuzz.token_set_ratio(a, b) / 100.0

    # Runs of 2+ digits, compared as literal strings.
    numbers_a = set(re.findall(r"\d{2,}", a))
    numbers_b = set(re.findall(r"\d{2,}", b))

    # The penalty needs a multi-digit number on BOTH sides...
    if not numbers_a or not numbers_b:
        return score
    # ...and no number in common between them.
    if not numbers_a.isdisjoint(numbers_b):
        return score

    return max(0.0, score - _LOCATION_DIGIT_MISMATCH_PENALTY)
Project matching uses + rapidfuzz.WRatio (lenient — catches typos like '1-80' vs 'I-80'). + Location matching uses `location_similarity` (stricter — catches + boilerplate-shared-but-actually-different strings like 'Loc 2 - 68 + Jenks' vs 'Loc 1 - 87 Jenks'). """ if not candidate_norm or not candidates: return None, None, "no_match" - scored = [(cid, similarity(candidate_norm, cnorm)) for cid, cnorm in candidates] + scorer = location_similarity if kind == "location" else similarity + scored = [(cid, scorer(candidate_norm, cnorm)) for cid, cnorm in candidates] scored.sort(key=lambda x: x[1], reverse=True) best_id, best_score = scored[0] @@ -725,7 +783,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion: ) location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs] if cluster.location_norm: - loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates) + loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates, kind="location") else: loc_id, loc_score, loc_match = None, None, "create_new" else: