fix(merge-project): dropdown unclickable + modal too short to show it

Two bugs in the project-merge modal: 1. Dropdown options had the same JSON.stringify quote-collision in their inline onclick that broke the location Remove button and the metadata-backfill typeahead earlier this week: onclick="onMergePickTarget('${id}', ${JSON.stringify(m.name)})" For 'I-80 Area 1' that renders as onclick="...(\"I-80 Area 1\")" — the inner double quotes terminate the onclick attribute early, and the browser never binds the click handler. Operator clicked items in the dropdown and nothing happened. Fixed via data-target-id / data-target-name attributes and a _mergePickFromButton(btn) trampoline. 2. Modal body had `flex-1 overflow-y-auto` with no min-height, so the container shrank tight around the input. When the typeahead dropdown appeared below the input it got clipped by the body's overflow and the operator had to scroll inside the modal to see the options. Fixed by adding min-height: 480px to the modal container + min-h-[320px] on the body so there's always room for the dropdown + the preview pane that appears below after a target is picked. Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
fix(backfill): location matching over-confident on boilerplate-shared names
2026-05-15 04:54:33 +00:00 · 2026-05-15 04:10:48 +00:00
3 changed files with 87 additions and 7 deletions
@@ -376,7 +376,11 @@ def locations_search(
        if q_norm in l_norm:
            scored.append((l, 1.0))
            continue
-        score = svc.similarity(q_norm, l_norm)
+        # Use the location-specific scorer (token_set_ratio + multi-digit
+        # penalty) instead of WRatio — same reason as the cluster-match
+        # path: location names share too much boilerplate vocabulary for
+        # WRatio to discriminate reliably.
+        score = svc.location_similarity(q_norm, l_norm)
        if score >= 0.50:
            scored.append((l, score))

@@ -162,6 +162,11 @@ def similarity(a: str, b: str) -> float:
    too short to fuzzy-match safely (see _MIN_FUZZY_LEN comment) AND the
    strings don't exact-match.  This guardrails the 'one common word
    inside a longer phrase' false positive.
+
+    USE FOR: project names (where typos like '1-80' vs 'I-80' should
+    still match).  For location names use `location_similarity()` —
+    WRatio is too lenient on the shared boilerplate vocabulary in
+    location strings ('Area', 'Loc', 'Bridge', 'Dam', etc.).
    """
    if not a or not b:
        return 0.0
@@ -172,6 +177,50 @@ def similarity(a: str, b: str) -> float:
    return rapidfuzz.fuzz.WRatio(a, b) / 100.0


+# Multi-digit penalty applied when two location names have completely
+# disjoint multi-digit numeric tokens (e.g. "87 Jenks" vs "68 Jenks").
+# Single-digit numbers ("Loc 1", "Area 2") are often shared coincidentally,
+# but address-style multi-digit numbers are strong identifiers — if they
+# differ, the locations are usually different physical places.
+_LOCATION_DIGIT_MISMATCH_PENALTY = 0.30
+
+
+def location_similarity(a: str, b: str) -> float:
+    """Stricter similarity score for location-name matching.
+
+    Location names share so much boilerplate vocabulary ('Area', 'Loc',
+    'Bridge', 'Dam') that rapidfuzz.WRatio inflates obvious mismatches.
+    Example: 'Area 2 - Brookville Dam - Loc 2 East' vs 'Area 1 - Loc 1 -
+    87 Jenks' scores 85.5 via WRatio despite being unrelated locations.
+
+    This scorer uses `token_set_ratio` as the base (sensitive to actual
+    word overlap, not just substring containment).  It then applies a
+    multi-digit penalty: if both strings contain 2+-digit numbers and
+    none overlap, subtract 0.30.  Catches the "same project, different
+    address-style identifier" case ('87 Jenks' vs '68 Jenks') that pure
+    token-set scoring still rates above 0.90.
+
+    Single-digit numbers ('Loc 1', 'Area 2') are excluded from the
+    penalty because they're often shared boilerplate ("Loc 1" in every
+    project) rather than discriminating identifiers.
+    """
+    if not a or not b:
+        return 0.0
+    if a == b:
+        return 1.0
+    if min(len(a), len(b)) < _MIN_FUZZY_LEN:
+        return 0.0
+
+    base = rapidfuzz.fuzz.token_set_ratio(a, b) / 100.0
+
+    multidigits_a = set(re.findall(r"\d{2,}", a))
+    multidigits_b = set(re.findall(r"\d{2,}", b))
+    if multidigits_a and multidigits_b and not (multidigits_a & multidigits_b):
+        base = max(0.0, base - _LOCATION_DIGIT_MISMATCH_PENALTY)
+
+    return base
+
+
 # ── Cluster + Suggestion dataclasses ───────────────────────────────────────────


@@ -572,15 +621,24 @@ async def _scan_clusters(
 def _find_best_match(
    candidate_norm: str,
    candidates: list[tuple[str, str]],   # (id, normalised_name)
+    *,
+    kind: str = "project",               # "project" | "location"
 ) -> tuple[Optional[str], Optional[float], str]:
    """Return (best_id, best_score, classification).

    classification ∈ {"exact", "fuzzy", "ambiguous", "no_match"}
+
+    The `kind` parameter selects the scorer.  Project matching uses
+    rapidfuzz.WRatio (lenient — catches typos like '1-80' vs 'I-80').
+    Location matching uses `location_similarity` (stricter — catches
+    boilerplate-shared-but-actually-different strings like 'Loc 2 - 68
+    Jenks' vs 'Loc 1 - 87 Jenks').
    """
    if not candidate_norm or not candidates:
        return None, None, "no_match"

-    scored = [(cid, similarity(candidate_norm, cnorm)) for cid, cnorm in candidates]
+    scorer = location_similarity if kind == "location" else similarity
+    scored = [(cid, scorer(candidate_norm, cnorm)) for cid, cnorm in candidates]
    scored.sort(key=lambda x: x[1], reverse=True)
    best_id, best_score = scored[0]

@@ -725,7 +783,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion:
        )
        location_candidates = [(l.id, _normalise(l.name)) for l in location_candidates_objs]
        if cluster.location_norm:
-            loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates)
+            loc_id, loc_score, loc_match = _find_best_match(cluster.location_norm, location_candidates, kind="location")
        else:
            loc_id, loc_score, loc_match = None, None, "create_new"
    else:
@@ -87,9 +87,14 @@
    </div>
 </div>

-<!-- Merge Modal -->
+<!-- Merge Modal —
+     min-height on the container and min-h on the body ensure the
+     typeahead dropdown has room to render below the input without
+     forcing the operator to scroll inside the modal.  The body keeps
+     overflow-y-auto, so content taller than the body still scrolls. -->
 <div id="merge-modal" class="hidden fixed inset-0 z-50 flex items-center justify-center bg-black/60 backdrop-blur-sm">
-    <div class="bg-white dark:bg-slate-800 rounded-xl shadow-2xl w-full max-w-2xl mx-4 max-h-[90vh] flex flex-col">
+    <div class="bg-white dark:bg-slate-800 rounded-xl shadow-2xl w-full max-w-2xl mx-4 max-h-[90vh] flex flex-col"
+         style="min-height: 480px;">
        <!-- Header -->
        <div class="px-6 py-4 border-b border-gray-200 dark:border-gray-700 flex items-center justify-between">
            <div>
@@ -104,7 +109,7 @@
        </div>

        <!-- Body -->
-        <div class="px-6 py-4 overflow-y-auto flex-1">
+        <div class="px-6 py-4 overflow-y-auto flex-1 min-h-[320px]">
            <label class="block text-sm font-medium text-gray-700 dark:text-gray-300 mb-2">
                Target project
            </label>
@@ -202,6 +207,10 @@ async function _mergeFetchTargets() {
        return;
    }

+    // Stash target id + name in data-* attributes (NOT inline JS args)
+    // to avoid the quote-collision that breaks click binding: the double
+    // quotes JSON.stringify emits terminate the onclick="…" attribute
+    // early.  Same pattern as the backfill typeahead dropdown.
    dropdown.innerHTML = candidates.map(m => {
        const scoreBadge = m.score >= 0.99
            ? '<span class="text-xs text-green-600 dark:text-green-400 ml-2">exact</span>'
@@ -212,8 +221,10 @@ async function _mergeFetchTargets() {
        if (m.location_count > 0) meta.push(`${m.location_count} location${m.location_count === 1 ? '' : 's'}`);
        const metaLine = meta.length ? `<div class="text-xs text-gray-500 dark:text-gray-400">${meta.join(' · ')}</div>` : '';
        return `<button type="button"
+            data-target-id="${_mergeEsc(m.id)}"
+            data-target-name="${_mergeEsc(m.name)}"
            onmousedown="event.preventDefault()"
-            onclick="onMergePickTarget('${_mergeEsc(m.id)}', ${JSON.stringify(m.name)})"
+            onclick="_mergePickFromButton(this)"
            class="w-full text-left px-3 py-2 hover:bg-gray-50 dark:hover:bg-slate-700 border-b border-gray-100 dark:border-gray-700 last:border-b-0">
            <div class="text-sm font-medium text-gray-900 dark:text-white">${_mergeEsc(m.name)}${scoreBadge}</div>
            ${metaLine}
@@ -222,6 +233,13 @@ async function _mergeFetchTargets() {
    dropdown.classList.remove('hidden');
 }

+// Trampoline — reads the button's data attributes and forwards.  Keeps
+// the inline onclick free of any string interpolation that could break
+// HTML quoting (see notes on the same pattern in metadata_backfill.html).
+function _mergePickFromButton(btn) {
+    onMergePickTarget(btn.dataset.targetId, btn.dataset.targetName);
+}
+
 async function onMergePickTarget(targetId, targetName) {
    document.getElementById('merge-target-input').value = targetName;
    document.getElementById('merge-target-id').value = targetId;