feat(web): stream chat replies token-by-token (M3)
- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON,
OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction,
yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a
worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render
(no full re-parse per token) and instant scroll during stream — fixes iOS
Safari ghosting from per-token smooth-scroll. Falls back to the blocking
endpoint only if nothing streamed (no double-persist).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -162,3 +162,60 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
|
||||
# Compact this session once enough new turns have piled up.
|
||||
summary.maybe_summarize(session_id)
|
||||
return reply
|
||||
|
||||
|
||||
def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
                   model_override: str | None = None):
    """Stream a chat reply for `user_msg` as a generator of event tuples.

    Events are yielded in this order:

    * ``("delta", text)`` for every content fragment the model streams back;
    * ``("tool", name)`` after each tool call has been dispatched;
    * ``("done", reply)`` exactly once at the end, carrying the full reply.

    The model is looked up from the config by backend name; an unrecognised
    backend string is used as the model name itself. `model_override` is only
    honoured for the "cloud" backend. Tool specs are only offered to backends
    listed in ``TOOL_BACKENDS``, and at most ``MAX_TOOL_ROUNDS`` model calls
    are made. Text streamed in every round is concatenated into the reply.

    Before the final event the user message and the reply are stored via
    ``memory.remember`` and ``summary.maybe_summarize`` is invoked for the
    session.
    """
    settings = config.load()
    model_by_backend = {
        "local": settings.local_model,
        "cloud": settings.chat_model,
        "mi50": settings.mi50_model,
    }
    # Fall back to treating the backend string as a literal model name.
    model = model_by_backend.get(backend, backend)
    if backend == "cloud" and model_override:
        model = model_override
    logbus.log(
        "info",
        "chat request (stream)",
        session=session_id,
        backend=backend,
        model=model,
        embed=settings.embed_backend,
    )

    messages = build_messages(session_id, user_msg)
    if backend in TOOL_BACKENDS:
        tool_specs = toolkit.specs()
    else:
        tool_specs = None
    tool_ctx = {"session_id": session_id, "backend": backend}

    chunks: list[str] = []
    for _round in range(MAX_TOOL_ROUNDS):
        assistant_msg = None
        requested_calls = None
        event_stream = llm.chat_call_stream(
            messages, backend=backend, model=model, tools=tool_specs
        )
        for kind, data in event_stream:
            if kind == "message":
                assistant_msg = data
            elif kind == "tool_calls":
                requested_calls = data
            elif kind == "delta":
                # Forward text to the caller as soon as it arrives.
                chunks.append(data)
                yield ("delta", data)

        if not requested_calls:
            # No tools requested: this round produced the final answer.
            break

        # Record the assistant's tool-call request before the tool results.
        messages.append(assistant_msg)
        for call in requested_calls:
            tool_name = call["name"]
            result = toolkit.dispatch(tool_name, call["arguments"], tool_ctx)
            logbus.log(
                "info", "tool call",
                session=session_id, tool=tool_name, result=result[:80],
            )
            messages.append(
                {"role": "tool", "tool_call_id": call["id"], "content": result}
            )
            yield ("tool", tool_name)

    streamed = "".join(chunks)
    if streamed:
        reply = streamed
    else:
        # Nothing was streamed at all: emit the canned apology as a single
        # delta so the caller still receives visible text.
        reply = "(I got tangled using my tools there — say that again?)"
        yield ("delta", reply)
    logbus.log("info", "reply", session=session_id, chars=len(reply))

    for role, text in (("user", user_msg), ("assistant", reply)):
        memory.remember(session_id, role, text)
    summary.maybe_summarize(session_id)
    yield ("done", reply)
|
||||
|
||||
Reference in New Issue
Block a user