v0.11.0 #50

Merged
serversdown merged 13 commits from release/0.11.0 into main 2026-05-15 19:16:43 -04:00
2 changed files with 65 additions and 3 deletions
Showing only changes of commit ad55d4ca09 - Show all commits
+5 -1
View File
@@ -376,7 +376,11 @@ def locations_search(
if q_norm in l_norm: if q_norm in l_norm:
scored.append((l, 1.0)) scored.append((l, 1.0))
continue continue
score = svc.similarity(q_norm, l_norm) # Use the location-specific scorer (token_set_ratio + multi-digit
# penalty) instead of WRatio — same reason as the cluster-match
# path: location names share too much boilerplate vocabulary for
# WRatio to discriminate reliably.
score = svc.location_similarity(q_norm, l_norm)
if score >= 0.50: if score >= 0.50:
scored.append((l, score)) scored.append((l, score))
+60 -2
View File
@@ -162,6 +162,11 @@ def similarity(a: str, b: str) -> float:
too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the
strings don't exact-match. This guardrails the 'one common word strings don't exact-match. This guardrails the 'one common word
inside a longer phrase' false positive. inside a longer phrase' false positive.
USE FOR: project names (where typos like '1-80' vs 'I-80' should
still match). For location names use `location_similarity()` —
WRatio is too lenient on the shared boilerplate vocabulary in
location strings ('Area', 'Loc', 'Bridge', 'Dam', etc.).
""" """
if not a or not b: if not a or not b:
return 0.0 return 0.0
@@ -172,6 +177,50 @@ def similarity(a: str, b: str) -> float:
return rapidfuzz.fuzz.WRatio(a, b) / 100.0 return rapidfuzz.fuzz.WRatio(a, b) / 100.0
# Amount subtracted from a location similarity score (0.0-1.0 scale) by
# `location_similarity()` when both names contain multi-digit numeric
# tokens and the two sets of tokens are completely disjoint
# (e.g. "87 Jenks" vs "68 Jenks").
#
# Why only multi-digit numbers: single-digit numbers ("Loc 1", "Area 2")
# are often shared coincidentally, but address-style multi-digit numbers
# are strong identifiers — if they differ, the locations are usually
# different physical places.
_LOCATION_DIGIT_MISMATCH_PENALTY = 0.30
def location_similarity(a: str, b: str) -> float:
    """Return a strict similarity score for two location names.

    Location names are dominated by shared boilerplate words ('Area',
    'Loc', 'Bridge', 'Dam'), which lets rapidfuzz.WRatio rate plainly
    unrelated names as close matches. The motivating example:
    'Area 2 - Brookville Dam - Loc 2 East' vs 'Area 1 - Loc 1 - 87 Jenks'
    scores 85.5 via WRatio even though the locations are unrelated.

    Scoring happens in two steps:

    1. Base score: rapidfuzz `token_set_ratio`, rescaled from 0-100 to
       0.0-1.0. It reacts to real word overlap rather than substring
       containment.
    2. Multi-digit penalty: when *both* names contain at least one number
       of two or more digits and the two sets of numbers have nothing in
       common, `_LOCATION_DIGIT_MISMATCH_PENALTY` is subtracted (the
       result never drops below 0.0). This separates names that differ
       only in an address-style identifier ('87 Jenks' vs '68 Jenks'),
       which token-set scoring alone still rates above 0.90.

    Single-digit numbers ('Loc 1', 'Area 2') never trigger the penalty:
    they tend to be boilerplate repeated across projects ("Loc 1" in
    every project) rather than discriminating identifiers.

    Args:
        a: First location name.
        b: Second location name.

    Returns:
        0.0 if either name is empty; 1.0 if the names are identical;
        0.0 if the shorter name has fewer than `_MIN_FUZZY_LEN`
        characters (and the names are not identical); otherwise the
        token-set score with the multi-digit penalty applied where it
        qualifies.
    """
    # Guard clauses: empty input, exact match, too short to fuzzy-match.
    if not (a and b):
        return 0.0
    if a == b:
        return 1.0
    if len(a) < _MIN_FUZZY_LEN or len(b) < _MIN_FUZZY_LEN:
        return 0.0

    score = rapidfuzz.fuzz.token_set_ratio(a, b) / 100.0

    # Runs of two or more digits are treated as address-style identifiers.
    numbers_a = set(re.findall(r"\d{2,}", a))
    numbers_b = set(re.findall(r"\d{2,}", b))

    # The penalty needs such numbers on BOTH sides; a number that appears
    # in only one name says nothing about a mismatch.
    if not numbers_a or not numbers_b:
        return score
    if numbers_a.isdisjoint(numbers_b):
        return max(0.0, score - _LOCATION_DIGIT_MISMATCH_PENALTY)
    return score
# ── Cluster + Suggestion dataclasses ─────────────────────────────────────────── # ── Cluster + Suggestion dataclasses ───────────────────────────────────────────
@@ -572,15 +621,24 @@ async def _scan_clusters(
def _find_best_match( def _find_best_match(
candidate_norm: str, candidate_norm: str,
candidates: list[tuple[str, str]], # (id, normalised_name) candidates: list[tuple[str, str]], # (id, normalised_name)
*,
kind: str = "project", # "project" | "location"
) -> tuple[Optional[str], Optional[float], str]: ) -> tuple[Optional[str], Optional[float], str]:
"""Return (best_id, best_score, classification). """Return (best_id, best_score, classification).
classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"} classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"}
The `kind` parameter selects the scorer. Project matching uses
rapidfuzz.WRatio (lenient — catches typos like '1-80' vs 'I-80').
Location matching uses `location_similarity` (stricter — catches
boilerplate-shared-but-actually-different strings like 'Loc 2 - 68
Jenks' vs 'Loc 1 - 87 Jenks').
""" """
if not candidate_norm or not candidates: if not candidate_norm or not candidates:
return None, None, "no_match" return None, None, "no_match"
scored = [(cid, similarity(candidate_norm, cnorm)) for cid, cnorm in candidates] scorer = location_similarity if kind == "location" else similarity
scored = [(cid, scorer(candidate_norm, cnorm)) for cid, cnorm in candidates]
scored.sort(key=lambda x: x[1], reverse=True) scored.sort(key=lambda x: x[1], reverse=True)
best_id, best_score = scored[0] best_id, best_score = scored[0]
@@ -725,7 +783,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion:
) )
location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs] location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs]
if cluster.location_norm: if cluster.location_norm:
loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates) loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates, kind="location")
else: else:
loc_id, loc_score, loc_match = None, None, "create_new" loc_id, loc_score, loc_match = None, None, "create_new"
else: else: