fix(backfill): location matching over-confident on boilerplate-shared names
rapidfuzz.fuzz.WRatio inflates scores when two strings share substring
tokens, even when the shared tokens are common boilerplate. For
project names this is desirable (catches typos like '1-80' vs 'I-80')
but for location names it produces obvious false positives:
'Area 2 - Brookville Dam - Loc 2 East'
vs
'Area 1 - Loc 1 - 87 Jenks' → WRatio 85.5 (0.855 once normalised, above the 0.80 fuzzy threshold)
These share only 'area' + 'loc' (each followed by a differing single digit) but score 85%+ because WRatio
weights partial-substring overlap heavily. Operator reported the
backfill tool suggesting completely unrelated locations as 86% matches.
Fix: introduce `location_similarity()` — token_set_ratio + multi-digit
mismatch penalty. Used for location matching everywhere; WRatio stays
as the scorer for project names where its leniency is correct.
The multi-digit penalty (-0.30) triggers when both strings contain numbers
of two or more digits and share none of them. Catches the harder "same
project, different address identifier" case:
'Area 1 - Loc 2 - 68 Jenks' vs 'Area 1 - Loc 1 - 87 Jenks'
token_set_ratio = 0.91 (would still match without penalty)
multi-digit tokens {68} and {87} disjoint → -0.30 → 0.61 (rejected)
Single-digit tokens ('Loc 1', 'Area 2') are excluded from the penalty
because they're often coincidentally shared.
Updated:
- backend/services/metadata_backfill.py: new location_similarity()
function; _find_best_match() gains a `kind` parameter that selects
scorer; cluster-match call site passes kind='location'
- backend/routers/metadata_backfill.py: locations_search endpoint
(the typeahead dropdown's data source) uses location_similarity
instead of similarity for the same reason
Verified that the test cases listed below land correctly:
- user-reported false positive: 0.85 → 0.59 (rejected)
- '87 Jenks' vs '68 Jenks': 0.90 → 0.61 (rejected)
- NRL-01 vs NRL-02: 0.83 → 0.53 (rejected)
- 'Loc 2 - 735 Bunola' vs 'Loc 2 735 Bunola Rd': 1.00 (still matches)
- punctuation-only difference: 1.00 (still matches)
Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
@@ -376,7 +376,11 @@ def locations_search(
|
|||||||
if q_norm in l_norm:
|
if q_norm in l_norm:
|
||||||
scored.append((l, 1.0))
|
scored.append((l, 1.0))
|
||||||
continue
|
continue
|
||||||
score = svc.similarity(q_norm, l_norm)
|
# Use the location-specific scorer (token_set_ratio + multi-digit
|
||||||
|
# penalty) instead of WRatio — same reason as the cluster-match
|
||||||
|
# path: location names share too much boilerplate vocabulary for
|
||||||
|
# WRatio to discriminate reliably.
|
||||||
|
score = svc.location_similarity(q_norm, l_norm)
|
||||||
if score >= 0.50:
|
if score >= 0.50:
|
||||||
scored.append((l, score))
|
scored.append((l, score))
|
||||||
|
|
||||||
|
|||||||
@@ -162,6 +162,11 @@ def similarity(a: str, b: str) -> float:
|
|||||||
too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the
|
too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the
|
||||||
strings don't exact-match. This guardrails the 'one common word
|
strings don't exact-match. This guardrails the 'one common word
|
||||||
inside a longer phrase' false positive.
|
inside a longer phrase' false positive.
|
||||||
|
|
||||||
|
USE FOR: project names (where typos like '1-80' vs 'I-80' should
|
||||||
|
still match). For location names use `location_similarity()` —
|
||||||
|
WRatio is too lenient on the shared boilerplate vocabulary in
|
||||||
|
location strings ('Area', 'Loc', 'Bridge', 'Dam', etc.).
|
||||||
"""
|
"""
|
||||||
if not a or not b:
|
if not a or not b:
|
||||||
return 0.0
|
return 0.0
|
||||||
@@ -172,6 +177,50 @@ def similarity(a: str, b: str) -> float:
|
|||||||
return rapidfuzz.fuzz.WRatio(a, b) / 100.0
|
return rapidfuzz.fuzz.WRatio(a, b) / 100.0
|
||||||
|
|
||||||
|
|
||||||
|
# Multi-digit penalty applied when two location names have completely
|
||||||
|
# disjoint multi-digit numeric tokens (e.g. "87 Jenks" vs "68 Jenks").
|
||||||
|
# Single-digit numbers ("Loc 1", "Area 2") are often shared coincidentally,
|
||||||
|
# but address-style multi-digit numbers are strong identifiers — if they
|
||||||
|
# differ, the locations are usually different physical places.
|
||||||
|
_LOCATION_DIGIT_MISMATCH_PENALTY = 0.30
|
||||||
|
|
||||||
|
|
||||||
|
def location_similarity(a: str, b: str) -> float:
|
||||||
|
"""Stricter similarity score for location-name matching.
|
||||||
|
|
||||||
|
Location names share so much boilerplate vocabulary ('Area', 'Loc',
|
||||||
|
'Bridge', 'Dam') that rapidfuzz.WRatio inflates obvious mismatches.
|
||||||
|
Example: 'Area 2 - Brookville Dam - Loc 2 East' vs 'Area 1 - Loc 1 -
|
||||||
|
87 Jenks' scores 85.5 via WRatio despite being unrelated locations.
|
||||||
|
|
||||||
|
This scorer uses `token_set_ratio` as the base (sensitive to actual
|
||||||
|
word overlap, not just substring containment). It then applies a
|
||||||
|
multi-digit penalty: if both strings contain 2+-digit numbers and
|
||||||
|
none overlap, subtract 0.30. Catches the "same project, different
|
||||||
|
address-style identifier" case ('87 Jenks' vs '68 Jenks') that pure
|
||||||
|
token-set scoring still rates above 0.90.
|
||||||
|
|
||||||
|
Single-digit numbers ('Loc 1', 'Area 2') are excluded from the
|
||||||
|
penalty because they're often shared boilerplate ("Loc 1" in every
|
||||||
|
project) rather than discriminating identifiers.
|
||||||
|
"""
|
||||||
|
if not a or not b:
|
||||||
|
return 0.0
|
||||||
|
if a == b:
|
||||||
|
return 1.0
|
||||||
|
if min(len(a), len(b)) < _MIN_FUZZY_LEN:
|
||||||
|
return 0.0
|
||||||
|
|
||||||
|
base = rapidfuzz.fuzz.token_set_ratio(a, b) / 100.0
|
||||||
|
|
||||||
|
multidigits_a = set(re.findall(r"\d{2,}", a))
|
||||||
|
multidigits_b = set(re.findall(r"\d{2,}", b))
|
||||||
|
if multidigits_a and multidigits_b and not (multidigits_a & multidigits_b):
|
||||||
|
base = max(0.0, base - _LOCATION_DIGIT_MISMATCH_PENALTY)
|
||||||
|
|
||||||
|
return base
|
||||||
|
|
||||||
|
|
||||||
# ── Cluster + Suggestion dataclasses ───────────────────────────────────────────
|
# ── Cluster + Suggestion dataclasses ───────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
@@ -572,15 +621,24 @@ async def _scan_clusters(
|
|||||||
def _find_best_match(
|
def _find_best_match(
|
||||||
candidate_norm: str,
|
candidate_norm: str,
|
||||||
candidates: list[tuple[str, str]], # (id, normalised_name)
|
candidates: list[tuple[str, str]], # (id, normalised_name)
|
||||||
|
*,
|
||||||
|
kind: str = "project", # "project" | "location"
|
||||||
) -> tuple[Optional[str], Optional[float], str]:
|
) -> tuple[Optional[str], Optional[float], str]:
|
||||||
"""Return (best_id, best_score, classification).
|
"""Return (best_id, best_score, classification).
|
||||||
|
|
||||||
classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"}
|
classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"}
|
||||||
|
|
||||||
|
The `kind` parameter selects the scorer. Project matching uses
|
||||||
|
rapidfuzz.WRatio (lenient — catches typos like '1-80' vs 'I-80').
|
||||||
|
Location matching uses `location_similarity` (stricter — catches
|
||||||
|
boilerplate-shared-but-actually-different strings like 'Loc 2 - 68
|
||||||
|
Jenks' vs 'Loc 1 - 87 Jenks').
|
||||||
"""
|
"""
|
||||||
if not candidate_norm or not candidates:
|
if not candidate_norm or not candidates:
|
||||||
return None, None, "no_match"
|
return None, None, "no_match"
|
||||||
|
|
||||||
scored = [(cid, similarity(candidate_norm, cnorm)) for cid, cnorm in candidates]
|
scorer = location_similarity if kind == "location" else similarity
|
||||||
|
scored = [(cid, scorer(candidate_norm, cnorm)) for cid, cnorm in candidates]
|
||||||
scored.sort(key=lambda x: x[1], reverse=True)
|
scored.sort(key=lambda x: x[1], reverse=True)
|
||||||
best_id, best_score = scored[0]
|
best_id, best_score = scored[0]
|
||||||
|
|
||||||
@@ -725,7 +783,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion:
|
|||||||
)
|
)
|
||||||
location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs]
|
location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs]
|
||||||
if cluster.location_norm:
|
if cluster.location_norm:
|
||||||
loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates)
|
loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates, kind="location")
|
||||||
else:
|
else:
|
||||||
loc_id, loc_score, loc_match = None, None, "create_new"
|
loc_id, loc_score, loc_match = None, None, "create_new"
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user