feat(sfm): strip "- Loc N" suffix from operator-typed project names

Operators sometimes bake location identifiers into the project string for email-readability — "Fay - Locks & Dam No3 - Loc 2 - 735 Bunola" where "Fay - Locks & Dam No3" is the actual project and "- Loc 2 - 735 Bunola" is location info that already lives in sensor_location. Without stripping, every "- Loc N" variant became a separate project, fragmenting what should be one project with several locations.

Backend:
- New _extract_project_root() helper. Regex matches " - Loc N" / "-Loc3" / " - Location #5" / etc. with case-insensitive multi-dash support; strips from that marker forward and cleans up dangling separators. Strings without a Loc-marker pass through unchanged.
- Cluster dataclass adds project_root field alongside project_raw. project_raw stays the operator-typed string for display ("hover to see what was actually typed"). project_root is what gets normalised for matching and used as the suggested project name.
- _ensure_project + _ensure_location now do normalisation-aware dedup before creating: a cluster of "SR81" and a cluster of "SR 81" (which normalise to the same string) collapse into one project on apply, even when applied in the same bulk operation. Avoids UNIQUE constraint collisions and duplicate-named-by-spacing projects.

Frontend:
- Wizard cluster cards show "↳ stripped trailing 'Loc N' suffix; operator typed: <raw>" when project_root differs from project_raw, so the operator can see at a glance what the parser did to the string.

Real-data results: against the same 10,055 SFM events, confidence distribution improved from 37/14/8 (high/med/low) to 43/9/7. "Fay - Locks & Dam No3" now appears as ONE project across 6 cluster instances spanning 3 serials and 6 different locations — exactly the "one project, many locations" model the user described.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
2026-05-12 16:49:14 +00:00
parent 42de06f441
commit 6ebbe28308
3 changed files with 109 additions and 8 deletions
@@ -90,6 +90,56 @@ def _normalise(s: Optional[str]) -> str:
    return s


+# Match a "Loc N" / "Location #N" suffix preceded by a separator.  Operators
+# often type project names like "Fay - Locks & Dam No3 - Loc 2 - 735 Bunola"
+# where the leading "Fay - Locks & Dam No3" is the actual project and the
+# trailing "- Loc 2 - ..." is location info that already lives in the
+# sensor_location field.  We strip the trailing junk so projects with the
+# same root get clustered together.
+#
+# Matches:
+#   "- Loc 2", "-Loc3", "- Location #5", " — Location.5", "- LOC #07"
+# Doesn't match strings without an obvious Loc N marker — those keep
+# their full project_raw and the operator can edit them in the wizard.
+_PROJECT_LOC_SUFFIX = re.compile(
+    r"""
+    \s*               # any leading whitespace
+    [-–—]             # hyphen, en dash, or em dash (separator before the Loc marker)
+    \s*               # optional spaces
+    (?:loc|location)  # 'Loc' or 'Location'
+    \.?               # optional period
+    \s*               # optional space
+    \#?               # optional '#'
+    \s*               # optional space
+    \d+               # one or more digits (required)
+    \b                # word boundary
+    """,
+    re.IGNORECASE | re.VERBOSE,
+)
+
+
+def _extract_project_root(project_raw: str) -> str:
+    """Return the leading 'project root' portion of an operator-typed string.
+
+    Strips everything from the first " - Loc N" (or similar) marker forward,
+    so 'Fay - Locks & Dam No3 - Loc 2 - 735 Bunola' becomes
+    'Fay - Locks & Dam No3'.  Strings without a Loc-marker pass through
+    unchanged.
+
+    Trailing whitespace and dangling separators (hyphens, en dashes, em
+    dashes) left in front of the marker are cleaned up.
+    """
+    if not project_raw:
+        return ""
+    m = _PROJECT_LOC_SUFFIX.search(project_raw)
+    if m is None:
+        return project_raw.strip()
+    root = project_raw[: m.start()]
+    # Strip trailing whitespace + dangling separators left behind
+    # (e.g. "Fay - Locks & Dam No3 -" → "Fay - Locks & Dam No3").
+    root = re.sub(r"[\s\-–—]+$", "", root)
+    return root.strip()
+
+
 # Min length of the SHORTER input before a fuzzy match is accepted.
 # rapidfuzz.WRatio is generous with partial_ratio on short strings — e.g.
 # 'demo' vs 'bridge demo project' scores 0.90 (false positive).  Requiring
@@ -130,9 +180,17 @@ class Cluster:
    event_count:           int
    sample_event_id:       str

-    # Display values — the mode (most common) of each field.
+    # project_raw is the FULL operator-typed string (e.g.
+    # "Fay - Locks & Dam No3 - Loc 5 Synthomer").  Kept for display so
+    # the operator can sanity-check what they typed.
    project_raw:           str
+    # project_root is project_raw with everything from the first
+    # "- Loc N" marker onward stripped — what we actually use for
+    # matching and as the suggested project name
+    # (e.g. "Fay - Locks & Dam No3").  Same as project_raw, apart from
+    # surrounding whitespace, if no Loc marker was found.
+    project_root:          str
    project_norm:          str
+
    location_raw:          str
    location_norm:         str
    client_raw:            str
@@ -386,8 +444,16 @@ def _build_cluster(serial: str, events: list[dict]) -> Cluster:
    client_raw   = _pick_display("client",          _normalise(events[0].get("client")))
    operator_raw = _pick_display("operator",        _normalise(events[0].get("operator")))

+    # Strip trailing "- Loc N" location info that operators sometimes bake
+    # into the project string for email-readability ("I-80 - Loc 2 - 543 W
+    # Plant Rd" → "I-80").  The sensor_location field already has the
+    # authoritative location identifier.  Use project_root for matching
+    # and as the suggested project name; keep project_raw for display.
+    project_root = _extract_project_root(project_raw)
+    project_norm_for_matching = _normalise(project_root)
+
    consistency = min(project_consistency, location_consistency)
-    is_blank = (not project_mode_norm) or (not location_mode_norm)
+    is_blank = (not project_norm_for_matching) or (not location_mode_norm)

    return Cluster(
        cluster_id            = _build_cluster_id(serial, first_ts, last_ts),
@@ -397,7 +463,8 @@ def _build_cluster(serial: str, events: list[dict]) -> Cluster:
        event_count           = len(events),
        sample_event_id       = events[0]["id"],
        project_raw           = project_raw,
-        project_norm          = project_mode_norm,
+        project_root          = project_root,
+        project_norm          = project_norm_for_matching,
        location_raw          = location_raw,
        location_norm         = location_mode_norm,
        client_raw            = client_raw,
@@ -638,7 +705,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion:

    project_suggested_name = (
        project_existing.name if project_existing and project_match == "exact"
-        else cluster.project_raw or f"Project {cluster.serial}"
+        else cluster.project_root or cluster.project_raw or f"Project {cluster.serial}"
    )

    # Match location ONLY within the matched project's existing locations.
@@ -818,15 +885,32 @@ def _ensure_auto_imported_project_type(db: Session) -> str:


 def _ensure_project(db: Session, suggestion: Suggestion) -> tuple[Project, bool]:
-    """Return (project, created_flag)."""
+    """Return (project, created_flag).
+
+    Dedup is normalisation-aware: "SR81" and "SR 81" collapse to the same
+    project (both normalise to "sr 81"), as do "Fay - Locks & Dam No3"
+    and "Fay-Locks-&-Dam-No3".  Important when applying many clusters in
+    one bulk operation — the first creates the project, subsequent
+    clusters with normalisation-equivalent names attach to it instead
+    of triggering a UNIQUE constraint violation.
+    """
    if suggestion.project_existing_id:
        p = db.query(Project).filter_by(id=suggestion.project_existing_id).first()
        if p is not None:
            return p, False

-    # Need to create.  But: Project.name has UNIQUE constraint.  Check for
-    # a case-insensitive existing project before creating.
    candidate_name = suggestion.project_suggested_name.strip() or f"Auto-imported project ({suggestion.cluster.serial})"
+    candidate_norm = _normalise(candidate_name)
+
+    # Pre-flight normalised lookup: avoids creating duplicates that
+    # differ only in punctuation/spacing.
+    if candidate_norm:
+        for p in db.query(Project).filter(Project.status != "deleted").all():
+            if _normalise(p.name) == candidate_norm:
+                return p, False
+
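+    # Final fallback: case-insensitive name match.  Not redundant with
+    # the normalised loop above: this query has no status filter (so it
+    # also finds projects whose status is "deleted") and it still runs
+    # when candidate_norm is empty.
+    # NOTE(review): ilike() treats "%" and "_" in candidate_name as
+    # wildcards — confirm operator-typed names cannot contain them.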
    existing = db.query(Project).filter(Project.name.ilike(candidate_name)).first()
    if existing is not None:
        return existing, False
@@ -882,7 +966,20 @@ def _ensure_location(
            return l, False

    candidate_name = suggestion.location_suggested_name.strip() or "Unnamed location"
-    # Check for case-insensitive existing within this project.
+    candidate_norm = _normalise(candidate_name)
+
+    # Normalisation-aware lookup within this project — same dedup
+    # principle as _ensure_project.
+    if candidate_norm:
+        for existing in (
+            db.query(MonitoringLocation)
+            .filter(MonitoringLocation.project_id == project.id)
+            .all()
+        ):
+            if _normalise(existing.name) == candidate_norm:
+                return existing, False
+
+    # Fallback to case-insensitive exact.
    existing = (
        db.query(MonitoringLocation)
        .filter(MonitoringLocation.project_id == project.id)