fix: consolidation no longer stalls or breaks the live chat turn
Two bugs surfaced in the log during live play:

- SUMMARY_BACKEND=mi50 (llama.cpp, 32B) was fed 24k-char chunks → "Context size has been exceeded". The chunk budget is now backend-aware (cloud 24k, local/mi50 8k), and the merge step recurses so the merged partials never overflow the window either.
- maybe_summarize ran inline in the chat turn and retried 4× with backoff (~30s), stalling the reply and surfacing the error to the user. It now runs in a background daemon thread, logs and swallows errors (consolidation is best-effort maintenance), and dedupes so that at most one summary per session runs at a time.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+2
-2
@@ -200,7 +200,7 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
|
|||||||
memory.remember(session_id, "assistant", reply)
|
memory.remember(session_id, "assistant", reply)
|
||||||
|
|
||||||
# Compact this session once enough new turns have piled up.
|
# Compact this session once enough new turns have piled up.
|
||||||
summary.maybe_summarize(session_id)
|
summary.maybe_summarize_async(session_id)
|
||||||
return reply
|
return reply
|
||||||
|
|
||||||
|
|
||||||
@@ -259,5 +259,5 @@ def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
|
|||||||
|
|
||||||
memory.remember(session_id, "user", user_msg)
|
memory.remember(session_id, "user", user_msg)
|
||||||
memory.remember(session_id, "assistant", reply)
|
memory.remember(session_id, "assistant", reply)
|
||||||
summary.maybe_summarize(session_id)
|
summary.maybe_summarize_async(session_id)
|
||||||
yield ("done", reply)
|
yield ("done", reply)
|
||||||
|
|||||||
+42
-5
@@ -10,6 +10,7 @@ big imported conversation doesn't blow the local model's context window.
|
|||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import sys
|
import sys
|
||||||
|
import threading
|
||||||
import time
|
import time
|
||||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||||
|
|
||||||
@@ -20,8 +21,15 @@ _RETRIES = 4
|
|||||||
|
|
||||||
# Re-summarize a session once it has accumulated this many new raw exchanges.
|
# Re-summarize a session once it has accumulated this many new raw exchanges.
|
||||||
SUMMARIZE_AFTER = 20
|
SUMMARIZE_AFTER = 20
|
||||||
# Transcript budget per LLM call; longer sessions are chunked + merged.
|
# Transcript budget per LLM call; longer sessions are chunked + merged. Cloud has
|
||||||
|
# a large context window; the local llama.cpp/Ollama servers have small ones, so a
|
||||||
|
# 24k-char chunk overflows them ("Context size has been exceeded") — keep local small.
|
||||||
MAX_TRANSCRIPT_CHARS = 24000
|
MAX_TRANSCRIPT_CHARS = 24000
|
||||||
|
LOCAL_TRANSCRIPT_CHARS = 8000
|
||||||
|
|
||||||
|
|
||||||
|
def _budget(backend: Backend) -> int:
|
||||||
|
return MAX_TRANSCRIPT_CHARS if backend == "cloud" else LOCAL_TRANSCRIPT_CHARS
|
||||||
|
|
||||||
_PROMPT = """You are compacting a conversation into a long-term memory record \
|
_PROMPT = """You are compacting a conversation into a long-term memory record \
|
||||||
(not replying to anyone). Write a concise gist of the session below: what was \
|
(not replying to anyone). Write a concise gist of the session below: what was \
|
||||||
@@ -66,11 +74,14 @@ def _summarize_text(text: str, backend: Backend) -> str:
|
|||||||
|
|
||||||
|
|
||||||
def _summarize_transcript(transcript: str, backend: Backend) -> str:
|
def _summarize_transcript(transcript: str, backend: Backend) -> str:
|
||||||
"""Transcript -> gist (LLM only, no DB). Chunks + merges if oversized."""
|
"""Transcript -> gist (LLM only, no DB). Chunks + merges if oversized, and
|
||||||
if len(transcript) <= MAX_TRANSCRIPT_CHARS:
|
recurses so even the merged partials never exceed the backend's window."""
|
||||||
|
budget = _budget(backend)
|
||||||
|
if len(transcript) <= budget:
|
||||||
return _summarize_text(transcript, backend)
|
return _summarize_text(transcript, backend)
|
||||||
partials = [_summarize_text(c, backend) for c in _chunk(transcript, MAX_TRANSCRIPT_CHARS)]
|
partials = [_summarize_text(c, backend) for c in _chunk(transcript, budget)]
|
||||||
return _summarize_text("Partial summaries to merge:\n\n" + "\n\n".join(partials), backend)
|
merged = "Partial summaries to merge:\n\n" + "\n\n".join(partials)
|
||||||
|
return _summarize_transcript(merged, backend)
|
||||||
|
|
||||||
|
|
||||||
def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
|
def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
|
||||||
@@ -91,6 +102,32 @@ def maybe_summarize(session_id: str, backend: Backend | None = None) -> None:
|
|||||||
summarize_session(session_id, backend=backend)
|
summarize_session(session_id, backend=backend)
|
||||||
|
|
||||||
|
|
||||||
|
_inflight: set[str] = set()
|
||||||
|
_inflight_lock = threading.Lock()
|
||||||
|
|
||||||
|
|
||||||
|
def maybe_summarize_async(session_id: str, backend: Backend | None = None) -> None:
|
||||||
|
"""Run maybe_summarize off the chat turn's critical path. Consolidation is
|
||||||
|
background maintenance — it must never stall the reply or surface an error to
|
||||||
|
the user (a slow/oversized local model would otherwise block the turn). At most
|
||||||
|
one summary per session runs at a time."""
|
||||||
|
with _inflight_lock:
|
||||||
|
if session_id in _inflight:
|
||||||
|
return
|
||||||
|
_inflight.add(session_id)
|
||||||
|
|
||||||
|
def _run() -> None:
|
||||||
|
try:
|
||||||
|
maybe_summarize(session_id, backend=backend)
|
||||||
|
except Exception as exc:
|
||||||
|
logbus.log("error", "summary skipped", session=session_id, error=str(exc)[:120])
|
||||||
|
finally:
|
||||||
|
with _inflight_lock:
|
||||||
|
_inflight.discard(session_id)
|
||||||
|
|
||||||
|
threading.Thread(target=_run, daemon=True, name="summarize").start()
|
||||||
|
|
||||||
|
|
||||||
def summarize_all(
|
def summarize_all(
|
||||||
backend: Backend | None = None, limit: int | None = None, workers: int = 8
|
backend: Backend | None = None, limit: int | None = None, workers: int = 8
|
||||||
) -> dict:
|
) -> dict:
|
||||||
|
|||||||
Reference in New Issue
Block a user