feat(web): stream chat replies token-by-token (M3)

- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON,
  OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction,
  yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a
  worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render
  (no full re-parse per token) and instant scroll during stream — fixes iOS
  Safari ghosting from per-token smooth-scroll. Falls back to the blocking
  endpoint only if nothing streamed (no double-persist).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-19 00:06:51 +00:00
parent fa168271e1
commit 5dc3fa17d7
5 changed files with 281 additions and 9 deletions
+84 -1
View File
@@ -1,7 +1,8 @@
"""LLM router: local (Ollama) chat, cloud (OpenAI) chat + embeddings."""
from __future__ import annotations
from typing import Literal, TypedDict
import json
from typing import Iterator, Literal, TypedDict
import httpx
from openai import OpenAI
@@ -80,6 +81,88 @@ def chat_call(
return {"role": "assistant", "content": complete(messages, backend=backend, model=model)}, None
def chat_call_stream(
    messages: list, backend: Backend = "cloud", model: str | None = None,
    tools: list | None = None,
) -> Iterator[tuple[str, object]]:
    """Streaming variant of `chat_call`.

    Yields ("delta", text) for each content chunk as it arrives, then exactly
    two terminal events:

      ("message", assistant_dict)  — the full assistant turn, to append back
      ("tool_calls", calls | None) — list of {id,name,arguments} or None

    `cloud` and `mi50` both go through the OpenAI client with `stream=True`;
    tool-call fragments are accumulated by their `index`. `local` (Ollama)
    streams NDJSON, ignores `tools` and never returns tool calls.

    This is a generator, so nothing runs (including the API-key check) until
    the first item is requested.

    Args:
        messages: chat history passed through to the backend unchanged.
        backend: "cloud", "mi50", or anything else for the local Ollama path.
        model: overrides the configured model for the chosen backend.
        tools: tool definitions, forwarded only on the OpenAI-style backends.

    Raises:
        RuntimeError: if `backend` is "cloud" and no OpenAI API key is
            configured, or if Ollama emits an error object mid-stream.
        httpx.HTTPStatusError: if the Ollama endpoint answers with an error
            status (raised by `raise_for_status`).
    """
    cfg = load()
    parts: list[str] = []

    if backend in ("cloud", "mi50"):
        if backend == "cloud":
            if not cfg.openai_api_key:
                raise RuntimeError("OPENAI_API_KEY is not set")
            client = OpenAI(api_key=cfg.openai_api_key)
            mdl = model or cfg.cloud_model
        else:
            client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url)
            mdl = model or cfg.mi50_model

        kwargs: dict = {"model": mdl, "messages": messages, "stream": True}
        if tools:
            kwargs["tools"] = tools

        frags: dict[int, dict] = {}  # tool-call fragments accumulated by index
        # Use the stream as a context manager so the underlying HTTP response
        # is released even when the consumer stops iterating early (the
        # generator is closed at a `yield`) or an exception propagates.
        with client.chat.completions.create(**kwargs) as stream:
            for chunk in stream:
                if not chunk.choices:
                    continue
                delta = chunk.choices[0].delta
                if getattr(delta, "content", None):
                    parts.append(delta.content)
                    yield ("delta", delta.content)
                for tc in getattr(delta, "tool_calls", None) or []:
                    slot = frags.setdefault(
                        tc.index, {"id": "", "name": "", "arguments": ""}
                    )
                    # id and name overwrite the slot when present; the
                    # arguments string arrives in pieces and is concatenated.
                    if tc.id:
                        slot["id"] = tc.id
                    if tc.function and tc.function.name:
                        slot["name"] = tc.function.name
                    if tc.function and tc.function.arguments:
                        slot["arguments"] += tc.function.arguments

        content = "".join(parts)
        if frags:
            calls = [frags[i] for i in sorted(frags)]
            assistant = {
                "role": "assistant",
                # A tool-call turn without text carries None, not "".
                "content": content or None,
                "tool_calls": [
                    {"id": c["id"], "type": "function",
                     "function": {"name": c["name"], "arguments": c["arguments"]}}
                    for c in calls
                ],
            }
            yield ("message", assistant)
            yield ("tool_calls", [
                {"id": c["id"], "name": c["name"], "arguments": c["arguments"]}
                for c in calls
            ])
        else:
            yield ("message", {"role": "assistant", "content": content})
            yield ("tool_calls", None)
        return

    # local (Ollama): stream NDJSON, no tools.
    with httpx.stream(
        "POST", f"{cfg.local_base_url}/api/chat",
        json={"model": model or cfg.local_model, "messages": messages, "stream": True},
        timeout=120,
    ) as resp:
        resp.raise_for_status()
        for line in resp.iter_lines():
            if not line:
                continue
            data = json.loads(line)
            # A failure after the 200 response has started arrives as an
            # {"error": ...} object in the stream; surface it instead of
            # returning a truncated reply as if it had succeeded.
            if data.get("error"):
                raise RuntimeError(f"Ollama stream error: {data['error']}")
            piece = (data.get("message") or {}).get("content", "")
            if piece:
                parts.append(piece)
                yield ("delta", piece)
            if data.get("done"):
                break
    yield ("message", {"role": "assistant", "content": "".join(parts)})
    yield ("tool_calls", None)
def embed(texts: list[str]) -> list[list[float]]:
"""Embed texts using the configured backend (EMBED_BACKEND: "cloud" or "local").