fix: consolidation no longer stalls or breaks the live chat turn

Two bugs were surfacing in the log during live play:

- SUMMARY_BACKEND=mi50 (llama.cpp, 32B) was fed 24k-char chunks → "Context size has been exceeded". The chunk budget is now backend-aware (cloud 24k, local/mi50 8k), and the merge step recurses so merged partials never overflow either.
- maybe_summarize ran inline in the chat turn and retried 4× with backoff (~30s), stalling the reply and surfacing the error. It now runs in a background daemon thread, logs and swallows errors (consolidation is best-effort maintenance), and dedupes so at most one summary per session runs at a time.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-20 04:37:17 +00:00
parent 5e9f3efeec
commit 5c41bd48d1
2 changed files with 44 additions and 7 deletions
@@ -10,6 +10,7 @@ big imported conversation doesn't blow the local model's context window.
 from __future__ import annotations

 import sys
+import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed

@@ -20,8 +21,15 @@ _RETRIES = 4

 # Re-summarize a session once it has accumulated this many new raw exchanges.
 SUMMARIZE_AFTER = 20
-# Transcript budget per LLM call; longer sessions are chunked + merged.
+# Transcript budget per LLM call; longer sessions are chunked + merged. Cloud has
+# a large context window; the local llama.cpp/Ollama servers have small ones, so a
+# 24k-char chunk overflows them ("Context size has been exceeded") — keep local small.
 MAX_TRANSCRIPT_CHARS = 24000
+LOCAL_TRANSCRIPT_CHARS = 8000
+
+
+def _budget(backend: Backend) -> int:
+    return MAX_TRANSCRIPT_CHARS if backend == "cloud" else LOCAL_TRANSCRIPT_CHARS

 _PROMPT = """You are compacting a conversation into a long-term memory record \
 (not replying to anyone). Write a concise gist of the session below: what was \
@@ -66,11 +74,14 @@ def _summarize_text(text: str, backend: Backend) -> str:


 def _summarize_transcript(transcript: str, backend: Backend) -> str:
-    """Transcript -> gist (LLM only, no DB). Chunks + merges if oversized."""
-    if len(transcript) <= MAX_TRANSCRIPT_CHARS:
+    """Transcript -> gist (LLM only, no DB). Chunks + merges if oversized, and
+    recurses so even the merged partials never exceed the backend's window."""
+    budget = _budget(backend)
+    if len(transcript) <= budget:
        return _summarize_text(transcript, backend)
-    partials = [_summarize_text(c, backend) for c in _chunk(transcript, MAX_TRANSCRIPT_CHARS)]
-    return _summarize_text("Partial summaries to merge:\n\n" + "\n\n".join(partials), backend)
+    partials = [_summarize_text(c, backend) for c in _chunk(transcript, budget)]
+    merged = "Partial summaries to merge:\n\n" + "\n\n".join(partials)
+    return _summarize_transcript(merged, backend)


 def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
@@ -91,6 +102,32 @@ def maybe_summarize(session_id: str, backend: Backend | None = None) -> None:
        summarize_session(session_id, backend=backend)


+_inflight: set[str] = set()
+_inflight_lock = threading.Lock()
+
+
+def maybe_summarize_async(session_id: str, backend: Backend | None = None) -> None:
+    """Run maybe_summarize off the chat turn's critical path. Consolidation is
+    background maintenance — it must never stall the reply or surface an error to
+    the user (a slow/oversized local model would otherwise block the turn). At most
+    one summary per session runs at a time."""
+    with _inflight_lock:
+        if session_id in _inflight:
+            return
+        _inflight.add(session_id)
+
+    def _run() -> None:
+        try:
+            maybe_summarize(session_id, backend=backend)
+        except Exception as exc:
+            logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
+        finally:
+            with _inflight_lock:
+                _inflight.discard(session_id)
+
+    threading.Thread(target=_run, daemon=True, name="summarize").start()
+
+
 def summarize_all(
    backend: Backend | None = None, limit: int | None = None, workers: int = 8
 ) -> dict: