diff --git a/backend/routers/metadata_backfill.py b/backend/routers/metadata_backfill.py index bf97520..3e77a6d 100644 --- a/backend/routers/metadata_backfill.py +++ b/backend/routers/metadata_backfill.py @@ -46,6 +46,7 @@ def _serialise_suggestion(s: svc.Suggestion) -> dict: "event_count": c.event_count, "sample_event_id": c.sample_event_id, "project_raw": c.project_raw, + "project_root": c.project_root, "location_raw": c.location_raw, "client_raw": c.client_raw, "operator_raw": c.operator_raw, diff --git a/backend/services/metadata_backfill.py b/backend/services/metadata_backfill.py index 5a86476..8117493 100644 --- a/backend/services/metadata_backfill.py +++ b/backend/services/metadata_backfill.py @@ -90,6 +90,56 @@ def _normalise(s: Optional[str]) -> str: return s +# Match a "Loc N" / "Location #N" suffix preceded by a separator. Operators +# often type project names like "Fay - Locks & Dam No3 - Loc 2 - 735 Bunola" +# where the leading "Fay - Locks & Dam No3" is the actual project and the +# trailing "- Loc 2 - ..." is location info that already lives in the +# sensor_location field. We strip the trailing junk so projects with the +# same root get clustered together. +# +# Matches: +# "- Loc 2", "-Loc3", "- Location #5", " — Location.5", "- LOC #07" +# Doesn't match strings without an obvious Loc N marker — those keep +# their full project_raw and the operator can edit them in the wizard. +_PROJECT_LOC_SUFFIX = re.compile( + r""" + \s* # any leading whitespace + [-–—] # hyphen or em-dash (separator before the Loc marker) + \s* # optional spaces + (?:loc|location) # 'Loc' or 'Location' + \.? # optional period + \s* # optional space + \#? # optional '#' + \s* # optional space + \d+ # required digit + \b # word boundary + """, + re.IGNORECASE | re.VERBOSE, +) + + +def _extract_project_root(project_raw: str) -> str: + """Return the leading 'project root' portion of an operator-typed string. + + Strips everything from the first " - Loc N" (or similar) marker forward, + so 'Fay - Locks & Dam No3 - Loc 2 - 735 Bunola' becomes + 'Fay - Locks & Dam No3'. Strings without a Loc-marker pass through + unchanged. + + Trailing whitespace and dangling hyphens are cleaned up. + """ + if not project_raw: + return "" + m = _PROJECT_LOC_SUFFIX.search(project_raw) + if m is None: + return project_raw.strip() + root = project_raw[: m.start()] + # Strip trailing whitespace + dangling separators left behind + # (e.g. "Fay - Locks & Dam No3 -" → "Fay - Locks & Dam No3"). + root = re.sub(r"[\s\-–—]+$", "", root) + return root.strip() + + # Min length of the SHORTER input before a fuzzy match is accepted. # rapidfuzz.WRatio is generous with partial_ratio on short strings — e.g. # 'demo' vs 'bridge demo project' scores 0.90 (false positive). Requiring @@ -130,9 +180,17 @@ class Cluster: event_count: int sample_event_id: str - # Display values — the mode (most common) of each field. + # project_raw is the FULL operator-typed string (e.g. + # "Fay - Locks & Dam No3 - Loc 5 Synthomer"). Kept for display so + # operator can sanity-check what they typed. project_raw: str + # project_root is project_raw with any trailing "- Loc N" suffix + # stripped — what we actually use for matching and as the suggested + # project name. (e.g. "Fay - Locks & Dam No3"). Same as project_raw + # if no Loc marker was found. + project_root: str project_norm: str + location_raw: str location_norm: str client_raw: str @@ -386,8 +444,16 @@ def _build_cluster(serial: str, events: list[dict]) -> Cluster: client_raw = _pick_display("client", _normalise(events[0].get("client"))) operator_raw = _pick_display("operator", _normalise(events[0].get("operator"))) + # Strip trailing "- Loc N" location info that operators sometimes bake + # into the project string for email-readability ("I-80 - Loc 2 - 543 W + # Plant Rd" → "I-80"). The sensor_location field already has the + # authoritative location identifier. Use project_root for matching + # and as the suggested project name; keep project_raw for display. + project_root = _extract_project_root(project_raw) + project_norm_for_matching = _normalise(project_root) + consistency = min(project_consistency, location_consistency) - is_blank = (not project_mode_norm) or (not location_mode_norm) + is_blank = (not project_norm_for_matching) or (not location_mode_norm) return Cluster( cluster_id = _build_cluster_id(serial, first_ts, last_ts), @@ -397,7 +463,8 @@ def _build_cluster(serial: str, events: list[dict]) -> Cluster: event_count = len(events), sample_event_id = events[0]["id"], project_raw = project_raw, - project_norm = project_mode_norm, + project_root = project_root, + project_norm = project_norm_for_matching, location_raw = location_raw, location_norm = location_mode_norm, client_raw = client_raw, @@ -638,7 +705,7 @@ def _build_suggestion(db: Session, cluster: Cluster) -> Suggestion: project_suggested_name = ( project_existing.name if project_existing and project_match == "exact" - else cluster.project_raw or f"Project {cluster.serial}" + else cluster.project_root or cluster.project_raw or f"Project {cluster.serial}" ) # Match location ONLY within the matched project's existing locations. @@ -818,15 +885,32 @@ def _ensure_auto_imported_project_type(db: Session) -> str: def _ensure_project(db: Session, suggestion: Suggestion) -> tuple[Project, bool]: - """Return (project, created_flag).""" + """Return (project, created_flag). + + Dedup is normalisation-aware: "SR81" and "SR 81" collapse to the same + project (both normalise to "sr 81"), as do "Fay - Locks & Dam No3" + and "Fay-Locks-&-Dam-No3". Important when applying many clusters in + one bulk operation — the first creates the project, subsequent + clusters with normalisation-equivalent names attach to it instead + of triggering a UNIQUE constraint violation. + """ if suggestion.project_existing_id: p = db.query(Project).filter_by(id=suggestion.project_existing_id).first() if p is not None: return p, False - # Need to create. But: Project.name has UNIQUE constraint. Check for - # a case-insensitive existing project before creating. candidate_name = suggestion.project_suggested_name.strip() or f"Auto-imported project ({suggestion.cluster.serial})" + candidate_norm = _normalise(candidate_name) + + # Pre-flight normalised lookup: avoids creating duplicates that + # differ only in punctuation/spacing. + if candidate_norm: + for p in db.query(Project).filter(Project.status != "deleted").all(): + if _normalise(p.name) == candidate_norm: + return p, False + + # Final fallback: case-insensitive exact (cheap, catches the same + # things normalised lookup would but it's harmless to keep). existing = db.query(Project).filter(Project.name.ilike(candidate_name)).first() if existing is not None: return existing, False @@ -882,7 +966,20 @@ def _ensure_location( return l, False candidate_name = suggestion.location_suggested_name.strip() or "Unnamed location" - # Check for case-insensitive existing within this project. + candidate_norm = _normalise(candidate_name) + + # Normalisation-aware lookup within this project — same dedup + # principle as _ensure_project. + if candidate_norm: + for existing in ( + db.query(MonitoringLocation) + .filter(MonitoringLocation.project_id == project.id) + .all() + ): + if _normalise(existing.name) == candidate_norm: + return existing, False + + # Fallback to case-insensitive exact. existing = ( db.query(MonitoringLocation) .filter(MonitoringLocation.project_id == project.id) diff --git a/templates/admin/metadata_backfill.html b/templates/admin/metadata_backfill.html index adb0177..3c42541 100644 --- a/templates/admin/metadata_backfill.html +++ b/templates/admin/metadata_backfill.html @@ -288,6 +288,9 @@ function _renderCluster(s) {