Compare commits
2 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
| | 295f9637b3 | | |
| | ad55d4ca09 | | |
@@ -376,7 +376,11 @@ def locations_search(
|
||||
if q_norm in l_norm:
|
||||
scored.append((l, 1.0))
|
||||
continue
|
||||
score = svc.similarity(q_norm, l_norm)
|
||||
# Use the location-specific scorer (token_set_ratio + multi-digit
|
||||
# penalty) instead of WRatio — same reason as the cluster-match
|
||||
# path: location names share too much boilerplate vocabulary for
|
||||
# WRatio to discriminate reliably.
|
||||
score = svc.location_similarity(q_norm, l_norm)
|
||||
if score >= 0.50:
|
||||
scored.append((l, score))
|
||||
|
||||
|
||||
@@ -162,6 +162,11 @@ def similarity(a: str, b: str) -> float:
|
||||
too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the
|
||||
strings don't exact-match. This guardrails the 'one common word
|
||||
inside a longer phrase' false positive.
|
||||
|
||||
USE FOR: project names (where typos like '1-80' vs 'I-80' should
|
||||
still match). For location names use `location_similarity()` —
|
||||
WRatio is too lenient on the shared boilerplate vocabulary in
|
||||
location strings ('Area', 'Loc', 'Bridge', 'Dam', etc.).
|
||||
"""
|
||||
if not a or not b:
|
||||
return 0.0
|
||||
@@ -172,6 +177,50 @@ def similarity(a: str, b: str) -> float:
|
||||
return rapidfuzz.fuzz.WRatio(a, b) / 100.0
|
||||
|
||||
|
||||
# Multi-digit penalty applied when two location names have completely
|
||||
# disjoint multi-digit numeric tokens (e.g. "87 Jenks" vs "68 Jenks").
|
||||
# Single-digit numbers ("Loc 1", "Area 2") are often shared coincidentally,
|
||||
# but address-style multi-digit numbers are strong identifiers — if they
|
||||
# differ, the locations are usually different physical places.
|
||||
_LOCATION_DIGIT_MISMATCH_PENALTY = 0.30
|
||||
|
||||
|
||||
def location_similarity(a: str, b: str) -> float:
    """Return a stricter similarity score for location-name matching.

    Location names share so much boilerplate vocabulary ('Area', 'Loc',
    'Bridge', 'Dam') that rapidfuzz.WRatio inflates obvious mismatches.
    Example: 'Area 2 - Brookville Dam - Loc 2 East' vs 'Area 1 - Loc 1 -
    87 Jenks' scores 85.5 via WRatio despite being unrelated locations.

    The base score is `rapidfuzz.fuzz.token_set_ratio` scaled to 0.0-1.0
    (sensitive to actual word overlap, not just substring containment).
    A multi-digit penalty is then applied: if both strings contain
    2+-digit numbers and none of them match,
    `_LOCATION_DIGIT_MISMATCH_PENALTY` is subtracted (the result is
    clamped at 0.0). This catches the "same project, different
    address-style identifier" case ('87 Jenks' vs '68 Jenks') that pure
    token-set scoring still rates above 0.90.

    Numbers are compared with leading zeros stripped, so the same
    identifier written with different padding ('087 Jenks' vs
    '87 Jenks') is NOT treated as a mismatch.

    Single-digit numbers ('Loc 1', 'Area 2') are excluded from the
    penalty because they're often shared boilerplate ("Loc 1" in every
    project) rather than discriminating identifiers.

    Args:
        a: First location name. Callers in this module pass normalised
            names.
        b: Second location name, same form as `a`.

    Returns:
        0.0 when either string is empty or the shorter one is below
        `_MIN_FUZZY_LEN` (and the strings are not identical); 1.0 when
        the strings are identical; otherwise the token-set score with
        the digit-mismatch penalty applied where it qualifies.
    """
    if not a or not b:
        return 0.0
    if a == b:
        return 1.0
    # Strings this short are too risky to fuzzy-match; only the exact
    # match above may succeed for them.
    if min(len(a), len(b)) < _MIN_FUZZY_LEN:
        return 0.0

    base = rapidfuzz.fuzz.token_set_ratio(a, b) / 100.0

    numbers_a = _multi_digit_numbers(a)
    numbers_b = _multi_digit_numbers(b)
    # Penalise only when BOTH names carry multi-digit identifiers and
    # not a single one is shared between them.
    if numbers_a and numbers_b and numbers_a.isdisjoint(numbers_b):
        base = max(0.0, base - _LOCATION_DIGIT_MISMATCH_PENALTY)

    return base


def _multi_digit_numbers(text: str) -> set[str]:
    """Return the 2+-digit runs in *text* with leading zeros stripped."""
    # `or "0"` keeps an all-zero run ("00") as a comparable token instead
    # of collapsing it to the empty string.
    return {run.lstrip("0") or "0" for run in re.findall(r"\d{2,}", text)}
|
||||
|
||||
|
||||
# ── Cluster + Suggestion dataclasses ───────────────────────────────────────────
|
||||
|
||||
|
||||
@@ -572,15 +621,24 @@ async def _scan_clusters(
|
||||
def _find_best_match(
|
||||
candidate_norm: str,
|
||||
candidates: list[tuple[str, str]], # (id, normalised_name)
|
||||
*,
|
||||
kind: str = "project", # "project" | "location"
|
||||
) -> tuple[Optional[str], Optional[float], str]:
|
||||
"""Return (best_id, best_score, classification).
|
||||
|
||||
classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"}
|
||||
|
||||
The `kind` parameter selects the scorer. Project matching uses
|
||||
rapidfuzz.WRatio (lenient — catches typos like '1-80' vs 'I-80').
|
||||
Location matching uses `location_similarity` (stricter — catches
|
||||
boilerplate-shared-but-actually-different strings like 'Loc 2 - 68
|
||||
Jenks' vs 'Loc 1 - 87 Jenks').
|
||||
"""
|
||||
if not candidate_norm or not candidates:
|
||||
return None, None, "no_match"
|
||||
|
||||
scored = [(cid, similarity(candidate_norm, cnorm)) for cid, cnorm in candidates]
|
||||
scorer = location_similarity if kind == "location" else similarity
|
||||
scored = [(cid, scorer(candidate_norm, cnorm)) for cid, cnorm in candidates]
|
||||
scored.sort(key=lambda x: x[1], reverse=True)
|
||||
best_id, best_score = scored[0]
|
||||
|
||||
@@ -725,7 +783,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion:
|
||||
)
|
||||
location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs]
|
||||
if cluster.location_norm:
|
||||
loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates)
|
||||
loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates, kind="location")
|
||||
else:
|
||||
loc_id, loc_score, loc_match = None, None, "create_new"
|
||||
else:
|
||||
|
||||
@@ -87,9 +87,14 @@
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Merge Modal -->
|
||||
<!-- Merge Modal —
|
||||
min-h on the body ensures the typeahead dropdown has room to render
|
||||
below the input without forcing the operator to scroll inside the
|
||||
modal. overflow-visible on the body lets the dropdown extend
|
||||
beyond the body's natural height when needed. -->
|
||||
<div id="merge-modal" class="hidden fixed inset-0 z-50 flex items-center justify-center bg-black/60 backdrop-blur-sm">
|
||||
<div class="bg-white dark:bg-slate-800 rounded-xl shadow-2xl w-full max-w-2xl mx-4 max-h-[90vh] flex flex-col">
|
||||
<div class="bg-white dark:bg-slate-800 rounded-xl shadow-2xl w-full max-w-2xl mx-4 max-h-[90vh] flex flex-col"
|
||||
style="min-height: 480px;">
|
||||
<!-- Header -->
|
||||
<div class="px-6 py-4 border-b border-gray-200 dark:border-gray-700 flex items-center justify-between">
|
||||
<div>
|
||||
@@ -104,7 +109,7 @@
|
||||
</div>
|
||||
|
||||
<!-- Body -->
|
||||
<div class="px-6 py-4 overflow-y-auto flex-1">
|
||||
<div class="px-6 py-4 overflow-y-auto flex-1 min-h-[320px]">
|
||||
<label class="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-2">
|
||||
Target project
|
||||
</label>
|
||||
@@ -202,6 +207,10 @@ async function _mergeFetchTargets() {
|
||||
return;
|
||||
}
|
||||
|
||||
// Stash target id + name in data-* attributes (NOT inline JS args)
|
||||
// to avoid the quote collision that breaks click binding: the double
|
||||
// quotes JSON.stringify emits around the name end the onclick="..."
|
||||
// attribute early. Same pattern as the backfill typeahead dropdown.
|
||||
dropdown.innerHTML = candidates.map(m => {
|
||||
const scoreBadge = m.score >= 0.99
|
||||
? '<span class="text-xs text-green-600 dark:text-green-400 ml-2">exact</span>'
|
||||
@@ -212,8 +221,10 @@ async function _mergeFetchTargets() {
|
||||
if (m.location_count > 0) meta.push(`${m.location_count} location${m.location_count === 1 ? '' : 's'}`);
|
||||
const metaLine = meta.length ? `<div class="text-xs text-gray-500 dark:text-gray-400">${meta.join(' · ')}</div>` : '';
|
||||
return `<button type="button"
|
||||
data-target-id="${_mergeEsc(m.id)}"
|
||||
data-target-name="${_mergeEsc(m.name)}"
|
||||
onmousedown="event.preventDefault()"
|
||||
onclick="onMergePickTarget('${_mergeEsc(m.id)}', ${JSON.stringify(m.name)})"
|
||||
onclick="_mergePickFromButton(this)"
|
||||
class="w-full text-left px-3 py-2 hover:bg-gray-50 dark:hover:bg-slate-700 border-b border-gray-100 dark:border-gray-700 last:border-b-0">
|
||||
<div class="text-sm font-medium text-gray-900 dark:text-white">${_mergeEsc(m.name)}${scoreBadge}</div>
|
||||
${metaLine}
|
||||
@@ -222,6 +233,13 @@ async function _mergeFetchTargets() {
|
||||
dropdown.classList.remove('hidden');
|
||||
}
|
||||
|
||||
// Click trampoline for the merge-target dropdown buttons. Pulls the
// target id and name out of the clicked button's data-target-id /
// data-target-name attributes and hands them to onMergePickTarget, so
// the inline onclick needs no string interpolation that could break
// HTML quoting (see notes on the same pattern in metadata_backfill.html).
function _mergePickFromButton(btn) {
  const { targetId, targetName } = btn.dataset;
  onMergePickTarget(targetId, targetName);
}
|
||||
|
||||
async function onMergePickTarget(targetId, targetName) {
|
||||
document.getElementById('merge-target-input').value = targetName;
|
||||
document.getElementById('merge-target-id').value = targetId;
|
||||
|
||||
Reference in New Issue
Block a user