feat(web): stream chat replies token-by-token (M3)

- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON,
  OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction,
  yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a
  worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render
  (no full re-parse per token) and instant scroll during stream — fixes iOS
  Safari ghosting from per-token smooth-scroll. Falls back to the blocking
  endpoint only if nothing streamed (no double-persist).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-19 00:06:51 +00:00
parent fa168271e1
commit 5dc3fa17d7
5 changed files with 281 additions and 9 deletions
+57
View File
@@ -162,3 +162,60 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
# Compact this session once enough new turns have piled up.
summary.maybe_summarize(session_id)
return reply
def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
                   model_override: str | None = None):
    """Generator counterpart of `respond` that streams the reply as it arrives.

    Events are yielded as 2-tuples:

    * ``("delta", text)`` -- a chunk of reply text, forwarded as soon as
      `llm.chat_call_stream` produces it.
    * ``("tool", name)``  -- emitted after the named tool has been dispatched
      and its result appended to the conversation.
    * ``("done", reply)`` -- final event carrying the full reply text.

    Before the final event the user message and the reply are stored with
    `memory.remember` and `summary.maybe_summarize` is invoked for the
    session (intended to mirror what `respond` does).

    `model_override` is honoured only when `backend` is ``"cloud"``. A
    `backend` value that is not one of the known keys is used as the model
    name itself.
    """
    cfg = config.load()

    # Map the backend label to its configured model; unknown labels fall
    # through unchanged and are treated as a literal model name.
    model_by_backend = {
        "local": cfg.local_model,
        "cloud": cfg.chat_model,
        "mi50": cfg.mi50_model,
    }
    model = model_by_backend.get(backend, backend)
    if model_override and backend == "cloud":
        model = model_override

    logbus.log(
        "info", "chat request (stream)", session=session_id, backend=backend,
        model=model, embed=cfg.embed_backend,
    )

    messages = build_messages(session_id, user_msg)

    # Tool specs are only offered to backends listed in TOOL_BACKENDS.
    if backend in TOOL_BACKENDS:
        tool_specs = toolkit.specs()
    else:
        tool_specs = None

    dispatch_ctx = {"session_id": session_id, "backend": backend}
    chunks: list[str] = []

    for _round in range(MAX_TOOL_ROUNDS):
        assistant_msg = None
        pending_calls = None

        stream = llm.chat_call_stream(
            messages, backend=backend, model=model, tools=tool_specs
        )
        for kind, data in stream:
            if kind == "delta":
                # Keep the text for the persisted reply and forward it live.
                chunks.append(data)
                yield ("delta", data)
            elif kind == "message":
                assistant_msg = data
            elif kind == "tool_calls":
                pending_calls = data

        # No tool requests in this round means the model has finished.
        if not pending_calls:
            break

        # Record the assistant's tool-call request before the tool results.
        messages.append(assistant_msg)
        for call in pending_calls:
            tool_name = call["name"]
            result = toolkit.dispatch(tool_name, call["arguments"], dispatch_ctx)
            logbus.log("info", "tool call", session=session_id, tool=tool_name, result=result[:80])
            messages.append({"role": "tool", "tool_call_id": call["id"], "content": result})
            yield ("tool", tool_name)

    streamed_text = "".join(chunks)
    if streamed_text:
        reply = streamed_text
    else:
        # Nothing was streamed at all: send a canned fallback as one delta.
        reply = "(I got tangled using my tools there — say that again?)"
        yield ("delta", reply)

    logbus.log("info", "reply", session=session_id, chars=len(reply))

    # Persist the exchange, then give the summariser a chance to compact.
    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
    summary.maybe_summarize(session_id)

    yield ("done", reply)