fix: consolidation no longer stalls or breaks the live chat turn

Two bugs surfacing in the log during live play:

- SUMMARY_BACKEND=mi50 (llama.cpp, 32B) was fed 24k-char chunks → "Context size has been exceeded". Chunk budget is now backend-aware: cloud 24k, local/mi50 8k, and the merge step recurses so merged partials never overflow either.
- maybe_summarize ran inline in the chat turn and retried 4× with backoff (~30s), stalling the reply and surfacing the error. It now runs in a background daemon thread, swallows errors (consolidation is best-effort maintenance), and dedupes so at most one summary per session runs at a time.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-20 04:37:17 +00:00
parent 5e9f3efeec
commit 5c41bd48d1
2 changed files with 44 additions and 7 deletions
@@ -200,7 +200,7 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
    memory.remember(session_id, "assistant", reply)
    # Compact this session once enough new turns have piled up.
-    summary.maybe_summarize(session_id)
+    summary.maybe_summarize_async(session_id)
    return reply
@@ -259,5 +259,5 @@ def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
-    summary.maybe_summarize(session_id)
+    summary.maybe_summarize_async(session_id)
    yield ("done", reply)
@@ -10,6 +10,7 @@ big imported conversation doesn't blow the local model's context window.
 from __future__ import annotations
 import sys
 import threading
 import time
 from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,8 +21,15 @@ _RETRIES = 4
 # Re-summarize a session once it has accumulated this many new raw exchanges.
 SUMMARIZE_AFTER = 20
-# Transcript budget per LLM call; longer sessions are chunked + merged.
+# Transcript budget per LLM call; longer sessions are chunked + merged. Cloud has
 # a large context window; the local llama.cpp/Ollama servers have small ones, so a
 # 24k-char chunk overflows them ("Context size has been exceeded") — keep local small.
 MAX_TRANSCRIPT_CHARS = 24000
 LOCAL_TRANSCRIPT_CHARS = 8000
 def _budget(backend: Backend) -> int:
    """Return the per-call transcript character budget for *backend*."""
    if backend == "cloud":
        return MAX_TRANSCRIPT_CHARS
    # Every non-cloud backend gets the smaller local budget.
    return LOCAL_TRANSCRIPT_CHARS
 _PROMPT = """You are compacting a conversation into a long-term memory record \
 (not replying to anyone). Write a concise gist of the session below: what was \
@@ -66,11 +74,14 @@ def _summarize_text(text: str, backend: Backend) -> str:
 def _summarize_transcript(transcript: str, backend: Backend) -> str:
-    """Transcript -> gist (LLM only, no DB). Chunks + merges if oversized."""
+    """Transcript -> gist (LLM only, no DB). Chunks + merges if oversized, and
-    if len(transcript) <= MAX_TRANSCRIPT_CHARS:
+    recurses so even the merged partials never exceed the backend's window."""
    budget = _budget(backend)
    if len(transcript) <= budget:
        return _summarize_text(transcript, backend)
-    partials = [_summarize_text(c, backend) for c in _chunk(transcript, MAX_TRANSCRIPT_CHARS)]
+    partials = [_summarize_text(c, backend) for c in _chunk(transcript, budget)]
-    return _summarize_text("Partial summaries to merge:\n\n" + "\n\n".join(partials), backend)
+    merged = "Partial summaries to merge:\n\n" + "\n\n".join(partials)
    # NOTE(review): this recursion only terminates if `merged` ends up shorter
    # than `transcript`, i.e. presumably each partial is much shorter than its
    # chunk. Nothing here enforces that — TODO confirm, or add a depth cap.
    return _summarize_transcript(merged, backend)
 def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
@@ -91,6 +102,32 @@ def maybe_summarize(session_id: str, backend: Backend | None = None) -> None:
        summarize_session(session_id, backend=backend)
 _inflight: set[str] = set()
 _inflight_lock = threading.Lock()
 def maybe_summarize_async(session_id: str, backend: Backend | None = None) -> None:
    """Run maybe_summarize off the chat turn's critical path.

    Consolidation is background maintenance — it must never stall the reply or
    surface an error to the user (a slow/oversized local model would otherwise
    block the turn). At most one summary per session runs at a time: a call
    made while a summary for the same session is in flight returns immediately.

    Errors raised by maybe_summarize, and a failure to start the worker thread,
    are logged through logbus and never propagated to the caller.
    """
    # Claim the per-session slot under the lock so two concurrent turns
    # cannot both start a worker for the same session.
    with _inflight_lock:
        if session_id in _inflight:
            return
        _inflight.add(session_id)
    def _run() -> None:
        try:
            maybe_summarize(session_id, backend=backend)
        except Exception as exc:
            logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
        finally:
            # Always release the slot so the next turn can retry.
            with _inflight_lock:
                _inflight.discard(session_id)
    try:
        threading.Thread(target=_run, daemon=True, name="summarize").start()
    except RuntimeError as exc:
        # start() raises RuntimeError when a new thread cannot be created. _run
        # never executes in that case, so release the slot here; otherwise the
        # session would stay "in flight" forever and never be summarized again.
        with _inflight_lock:
            _inflight.discard(session_id)
        logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
 def summarize_all(
    backend: Backend | None = None, limit: int | None = None, workers: int = 8
 ) -> dict: