From d46f9fccf8e15ca15098de2a44abdc125f2c139c Mon Sep 17 00:00:00 2001 From: serversdown Date: Tue, 12 May 2026 19:19:46 +0000 Subject: [PATCH] fix(sfm): broaden Loc-N suffix regex to catch '.Loc' and 'Loc No.' variants MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Operators use more separator variations than the original regex caught: - "Trumbull-Brayman-JV- Mont.Dam.Loc 2-R-25" — period as separator - "CMU - RKM Hall - Loc No. 3 - 4615 Forbes" — "No." between Loc and digit Added period to the separator character class and optional "No." token before the digit. Catches both above patterns plus near-variants without false-positives on normal project strings. Real-data impact: 5 more clusters now auto-strip cleanly, including the 1,903-event Trumbull-Brayman-JV- Mont.Dam cluster. Confidence distribution: 43 → 44 high. --- backend/services/metadata_backfill.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/backend/services/metadata_backfill.py b/backend/services/metadata_backfill.py index 8117493..303328a 100644 --- a/backend/services/metadata_backfill.py +++ b/backend/services/metadata_backfill.py @@ -103,16 +103,20 @@ def _normalise(s: Optional[str]) -> str: # their full project_raw and the operator can edit them in the wizard. _PROJECT_LOC_SUFFIX = re.compile( r""" - \s* # any leading whitespace - [-–—] # hyphen or em-dash (separator before the Loc marker) - \s* # optional spaces - (?:loc|location) # 'Loc' or 'Location' - \.? # optional period - \s* # optional space - \#? # optional '#' - \s* # optional space - \d+ # required digit - \b # word boundary + \s* # any leading whitespace + [-–—.] # separator: hyphen, en/em dash, or period + # (operators use any of these — see + # "Mont.Dam.Loc 2-R-25") + \s* + (?:loc|location) # 'Loc' or 'Location' + \.? # optional trailing period after Loc + \s* + (?:no\.?\s*)? # optional "No." or "No " before the digit + # (e.g. "Loc No. 
3", "Loc No 5") + \#? # optional '#' + \s* + \d+ # required digit + \b """, re.IGNORECASE | re.VERBOSE, )