fix: make summarize-all resilient to backend hiccups

The MI50 llama.cpp server was OOM-killed mid-run (LXC RAM limit + 8GB prompt cache),
and summarize_all had no error handling, so a single APIConnectionError killed the
whole batch. Add retry-with-backoff around the summarization LLM call, and wrap each
session in summarize_all in try/except (log + skip; sessions that stay unsummarized
are retried on the next run). Server-side: CT202 RAM raised and prompt cache disabled.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-16 06:31:28 +00:00
parent aae95bfa6c
commit 34392e4097
+22 -3
View File
@@ -10,10 +10,13 @@ big imported conversation doesn't blow the local model's context window.
from __future__ import annotations from __future__ import annotations
import sys import sys
import time
from lyra import config, llm, logbus, memory from lyra import config, llm, logbus, memory
from lyra.llm import Backend, Message from lyra.llm import Backend, Message
_RETRIES = 4
# Re-summarize a session once it has accumulated this many new raw exchanges. # Re-summarize a session once it has accumulated this many new raw exchanges.
SUMMARIZE_AFTER = 20 SUMMARIZE_AFTER = 20
# Transcript budget per LLM call; longer sessions are chunked + merged. # Transcript budget per LLM call; longer sessions are chunked + merged.
@@ -49,7 +52,16 @@ def _summarize_text(text: str, backend: Backend) -> str:
{"role": "system", "content": _PROMPT}, {"role": "system", "content": _PROMPT},
{"role": "user", "content": text}, {"role": "user", "content": text},
] ]
# Retry transient backend errors (e.g. the GPU server restarting) with backoff.
for attempt in range(_RETRIES):
try:
return llm.complete(messages, backend=backend) return llm.complete(messages, backend=backend)
except Exception as exc:
if attempt == _RETRIES - 1:
raise
logbus.log("debug", "summary retry", attempt=attempt + 1, error=str(exc)[:80])
time.sleep(5 * (attempt + 1))
raise RuntimeError("unreachable")
def summarize_session(session_id: str, backend: Backend | None = None) -> str | None: def summarize_session(session_id: str, backend: Backend | None = None) -> str | None:
@@ -85,19 +97,26 @@ def summarize_all(backend: Backend | None = None, limit: int | None = None) -> d
with an up-to-date summary are skipped, so re-running continues where it left off. with an up-to-date summary are skipped, so re-running continues where it left off.
""" """
sessions = memory.list_sessions() sessions = memory.list_sessions()
done, skipped = 0, 0 done, skipped, failed = 0, 0, 0
for s in sessions: for s in sessions:
sid = s["id"] sid = s["id"]
if memory.get_summary(sid) and memory.unsummarized_count(sid) == 0: if memory.get_summary(sid) and memory.unsummarized_count(sid) == 0:
skipped += 1 skipped += 1
continue continue
try:
summarize_session(sid, backend=backend) summarize_session(sid, backend=backend)
except Exception as exc:
# Don't let one bad session kill the batch; log and move on (it'll
# be retried on the next run, since it stays unsummarized).
failed += 1
logbus.log("error", "summarize failed", session=sid, error=str(exc)[:120])
continue
done += 1 done += 1
if done % 25 == 0: if done % 25 == 0:
logbus.log("info", "summarize-all progress", summarized=done, skipped=skipped) logbus.log("info", "summarize-all progress", summarized=done, skipped=skipped, failed=failed)
if limit is not None and done >= limit: if limit is not None and done >= limit:
break break
report = {"summarized": done, "skipped": skipped, "total": len(sessions)} report = {"summarized": done, "skipped": skipped, "failed": failed, "total": len(sessions)}
logbus.log("info", "summarize-all complete", **report) logbus.log("info", "summarize-all complete", **report)
return report return report