fix: consolidation no longer stalls or breaks the live chat turn

Two bugs surfacing in the log during live play:
- SUMMARY_BACKEND=mi50 (llama.cpp, 32B) was fed 24k-char chunks → "Context size
  has been exceeded". Chunk budget is now backend-aware: cloud 24k, local/mi50 8k,
  and the merge step recurses so merged partials never overflow either.
- maybe_summarize ran inline in the chat turn and retried 4× with backoff (~30s),
  stalling the reply and surfacing the error. It now runs in a background daemon
  thread, swallows errors (consolidation is best-effort maintenance), and dedupes
  so at most one summary per session runs at a time.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-20 04:37:17 +00:00
parent 5e9f3efeec
commit 5c41bd48d1
2 changed files with 44 additions and 7 deletions
+42 -5
View File
@@ -10,6 +10,7 @@ big imported conversation doesn't blow the local model's context window.
from __future__ import annotations
import sys
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -20,8 +21,15 @@ _RETRIES = 4
# Re-summarize a session once it has accumulated this many new raw exchanges.
SUMMARIZE_AFTER = 20
# Transcript budget per LLM call; longer sessions are chunked + merged. Cloud has
# a large context window; the local llama.cpp/Ollama servers have small ones, so a
# 24k-char chunk overflows them ("Context size has been exceeded") — keep local small.
# Per-call character budget that _budget() returns for the "cloud" backend.
MAX_TRANSCRIPT_CHARS = 24000
# Per-call character budget that _budget() returns for every other backend.
LOCAL_TRANSCRIPT_CHARS = 8000
def _budget(backend: Backend) -> int:
    """Return the transcript budget, in characters, for one LLM call on *backend*.

    Only the "cloud" backend gets the large budget; any other value falls back
    to the smaller local one.
    """
    if backend == "cloud":
        return MAX_TRANSCRIPT_CHARS
    return LOCAL_TRANSCRIPT_CHARS
_PROMPT = """You are compacting a conversation into a long-term memory record \
(not replying to anyone). Write a concise gist of the session below: what was \
@@ -66,11 +74,14 @@ def _summarize_text(text: str, backend: Backend) -> str:
def _summarize_transcript(transcript: str, backend: Backend) -> str:
    """Transcript -> gist (LLM only, no DB). Chunks + merges if oversized.

    A transcript that fits the backend's budget is summarized in one call.
    Otherwise it is split with ``_chunk`` into budget-sized pieces, each piece
    is summarized on its own, and the joined partial summaries are fed back
    through this function so that the merge input is itself held to the budget.

    Args:
        transcript: Text to condense.
        backend: Backend the calls are sent to; it also selects the budget.

    Returns:
        The summary text produced by ``_summarize_text``.
    """
    budget = _budget(backend)
    if len(transcript) <= budget:
        return _summarize_text(transcript, backend)

    partials = [_summarize_text(piece, backend) for piece in _chunk(transcript, budget)]
    merged = "Partial summaries to merge:\n\n" + "\n\n".join(partials)

    if len(merged) >= len(transcript):
        # The partials did not compress the input, so recursing would re-chunk
        # text that never gets shorter (issuing LLM calls at every level) until
        # RecursionError. Consolidation is best-effort: do one final pass over
        # what fits in the budget instead of looping.
        return _summarize_text(merged[:budget], backend)

    # Strictly shorter than the input, so the recursion is guaranteed to end.
    return _summarize_transcript(merged, backend)
def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
@@ -91,6 +102,32 @@ def maybe_summarize(session_id: str, backend: Backend | None = None) -> None:
summarize_session(session_id, backend=backend)
# Ids of sessions that have a background summary scheduled or running.
# maybe_summarize_async adds an id before starting its worker thread, and the
# worker discards it when it finishes.
_inflight: set[str] = set()
# Held around every check and mutation of _inflight in maybe_summarize_async.
_inflight_lock = threading.Lock()
def maybe_summarize_async(session_id: str, backend: Backend | None = None) -> None:
    """Run maybe_summarize on a background daemon thread, off the chat turn.

    Consolidation is background maintenance — it must never stall the reply or
    surface an error to the user (a slow/oversized local model would otherwise
    block the turn). At most one summary per session runs at a time: a call for
    a session that already has one in flight returns immediately.

    Args:
        session_id: Session to consolidate.
        backend: Passed through to ``maybe_summarize`` unchanged.
    """
    with _inflight_lock:
        if session_id in _inflight:
            return
        _inflight.add(session_id)

    def _run() -> None:
        try:
            maybe_summarize(session_id, backend=backend)
        except Exception as exc:
            # Best-effort: record the failure and drop it rather than propagate.
            logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
        finally:
            with _inflight_lock:
                _inflight.discard(session_id)

    try:
        threading.Thread(target=_run, daemon=True, name="summarize").start()
    except Exception as exc:
        # start() can fail (RuntimeError when no new thread can be created).
        # _run never ran in that case, so its finally block will not clear the
        # marker; without this rollback the session would be skipped by every
        # later call for the rest of the process's life. The failure is logged
        # instead of raised so the chat turn is not affected.
        with _inflight_lock:
            _inflight.discard(session_id)
        logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
def summarize_all(
backend: Backend | None = None, limit: int | None = None, workers: int = 8
) -> dict: