feat(web): stream chat replies token-by-token (M3)

- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON, OpenAI/MI50 SSE), accumulating tool-call fragments by index. - chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction, yielding ("delta", text) / ("tool", name) / ("done", reply). - POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a worker thread + asyncio.Queue. Old completions endpoint kept as fallback. - Client streams into a live bubble with a blinking caret; rAF-throttled render (no full re-parse per token) and instant scroll during stream — fixes iOS Safari ghosting from per-token smooth-scroll. Falls back to the blocking endpoint only if nothing streamed (no double-persist). Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 00:06:51 +00:00
parent fa168271e1
commit 5dc3fa17d7
5 changed files with 281 additions and 9 deletions
@@ -1,7 +1,8 @@
 """LLM router: local (Ollama) chat, cloud (OpenAI) chat + embeddings."""
 from __future__ import annotations

-from typing import Literal, TypedDict
+import json
+from typing import Iterator, Literal, TypedDict

 import httpx
 from openai import OpenAI
@@ -80,6 +81,88 @@ def chat_call(
    return {"role": "assistant", "content": complete(messages, backend=backend, model=model)}, None


+def chat_call_stream(
+    messages: list, backend: Backend = "cloud", model: str | None = None,
+    tools: list | None = None,
+) -> Iterator[tuple[str, object]]:
+    """Streaming variant of `chat_call`. Yields ("delta", text) for each content
+    chunk as it arrives, then exactly two terminal events:
+      ("message", assistant_dict)  — the full assistant turn, to append back
+      ("tool_calls", calls | None) — list of {id,name,arguments} or None
+
+    `local` (Ollama) streams NDJSON and never returns tool calls.
+    """
+    cfg = load()
+    if backend in ("cloud", "mi50"):
+        if backend == "cloud":
+            if not cfg.openai_api_key:
+                raise RuntimeError("OPENAI_API_KEY is not set")
+            client = OpenAI(api_key=cfg.openai_api_key)
+            mdl = model or cfg.cloud_model
+        else:
+            client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url)
+            mdl = model or cfg.mi50_model
+        kwargs: dict = {"model": mdl, "messages": messages, "stream": True}
+        if tools:
+            kwargs["tools"] = tools
+        parts: list[str] = []
+        frags: dict[int, dict] = {}  # tool-call fragments accumulated by index
+        for chunk in client.chat.completions.create(**kwargs):
+            if not chunk.choices:
+                continue
+            delta = chunk.choices[0].delta
+            if getattr(delta, "content", None):
+                parts.append(delta.content)
+                yield ("delta", delta.content)
+            for tc in getattr(delta, "tool_calls", None) or []:
+                slot = frags.setdefault(tc.index, {"id": "", "name": "", "arguments": ""})
+                if tc.id:
+                    slot["id"] = tc.id
+                if tc.function and tc.function.name:
+                    slot["name"] = tc.function.name
+                if tc.function and tc.function.arguments:
+                    slot["arguments"] += tc.function.arguments
+        content = "".join(parts)
+        if frags:
+            calls = [frags[i] for i in sorted(frags)]
+            assistant = {
+                "role": "assistant",
+                "content": content or None,
+                "tool_calls": [
+                    {"id": c["id"], "type": "function",
+                     "function": {"name": c["name"], "arguments": c["arguments"]}}
+                    for c in calls
+                ],
+            }
+            yield ("message", assistant)
+            yield ("tool_calls", [{"id": c["id"], "name": c["name"], "arguments": c["arguments"]} for c in calls])
+        else:
+            yield ("message", {"role": "assistant", "content": content})
+            yield ("tool_calls", None)
+        return
+
+    # local (Ollama): stream NDJSON, no tools.
+    parts = []
+    with httpx.stream(
+        "POST", f"{cfg.local_base_url}/api/chat",
+        json={"model": model or cfg.local_model, "messages": messages, "stream": True},
+        timeout=120,
+    ) as resp:
+        resp.raise_for_status()
+        for line in resp.iter_lines():
+            if not line:
+                continue
+            data = json.loads(line)
+            piece = (data.get("message") or {}).get("content", "")
+            if piece:
+                parts.append(piece)
+                yield ("delta", piece)
+            if data.get("done"):
+                break
+    yield ("message", {"role": "assistant", "content": "".join(parts)})
+    yield ("tool_calls", None)
+
+
 def embed(texts: list[str]) -> list[list[float]]:
    """Embed texts using the configured backend (EMBED_BACKEND: "cloud" or "local").