feat(web): stream chat replies token-by-token (M3)

- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON, OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction, yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render (no full re-parse per token) and instant scroll during stream — fixes iOS Safari ghosting from per-token smooth-scroll. Falls back to the blocking endpoint only if nothing streamed (no double-persist).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-19 00:06:51 +00:00
parent fa168271e1
commit 5dc3fa17d7
5 changed files with 281 additions and 9 deletions
@@ -180,6 +180,7 @@
  <script>
    const RELAY_BASE = "";  // same-origin: served by lyra.web.server
    const API_URL = `${RELAY_BASE}/v1/chat/completions`;
+    const STREAM_URL = `${RELAY_BASE}/v1/chat/stream`;

 	function generateSessionId() {
      return "sess-" + Math.random().toString(36).substring(2, 10);
@@ -308,21 +309,101 @@
        body.model = cloudModel;
      }

+      // Stream the reply token-by-token (SSE). Fall back to the blocking
+      // endpoint only if nothing streamed (e.g. streaming unavailable).
+      const div = createAssistantBubble();
+      let full = "";
      try {
-        const resp = await fetch(API_URL, {
+        const resp = await fetch(STREAM_URL, {
          method: "POST",
          headers: { "Content-Type": "application/json" },
          body: JSON.stringify(body)
        });
+        if (!resp.ok || !resp.body) throw new Error("HTTP " + resp.status);

-        const data = await resp.json();
-        const reply = data.choices?.[0]?.message?.content || "(no reply)";
-        addMessage("assistant", reply);
-        history.push({ role: "assistant", content: reply });
-		await saveSession();
+        const reader = resp.body.getReader();
+        const decoder = new TextDecoder();
+        let buf = "";
+        for (;;) {
+          const { value, done } = await reader.read();
+          if (done) break;
+          buf += decoder.decode(value, { stream: true });
+          let i;
+          while ((i = buf.indexOf("\n\n")) !== -1) {
+            const frame = buf.slice(0, i).trim();
+            buf = buf.slice(i + 2);
+            if (!frame.startsWith("data:")) continue;
+            let evt;
+            try { evt = JSON.parse(frame.slice(5).trim()); } catch (e) { continue; }
+            if (evt.type === "delta") {
+              full += evt.payload;
+              updateAssistantBubble(div, full);
+            } else if (evt.type === "done") {
+              if (evt.payload) full = evt.payload;
+            } else if (evt.type === "error") {
+              throw new Error(evt.payload);
+            }
+          }
+        }
      } catch (err) {
-        addMessage("system", "Error: " + err.message);
+        if (!full) {
+          div.remove();
+          try {
+            const resp = await fetch(API_URL, {
+              method: "POST",
+              headers: { "Content-Type": "application/json" },
+              body: JSON.stringify(body)
+            });
+            const data = await resp.json();
+            const reply = data.choices?.[0]?.message?.content || "(no reply)";
+            addMessage("assistant", reply);
+            history.push({ role: "assistant", content: reply });
+            await saveSession();
+          } catch (err2) {
+            addMessage("system", "Error: " + err2.message);
+          }
+          return;
+        }
+        // Partial content arrived before the error — keep what we streamed.
      }
+
+      finalizeAssistantBubble(div, full || "(no reply)");
+      history.push({ role: "assistant", content: full || "(no reply)" });
+      await saveSession();
+    }
+
+    function createAssistantBubble() {  // Appends an empty assistant bubble to #messages and returns the new <div>.
+      const messagesEl = document.getElementById("messages");
+      const div = document.createElement("div");
+      div.className = "msg assistant streaming";  // "streaming" is removed again in finalizeAssistantBubble
+      messagesEl.appendChild(div);
+      messagesEl.scrollTop = messagesEl.scrollHeight;  // instant jump to the bottom — no smooth chasing
+      return div;
+    }
+
+    // Coalesce token updates to at most one render per animation frame (avoids re-parsing
+    // the whole message on every token, and the iOS ghosting from rapid repaints).
+    function updateAssistantBubble(div, text) {  // div: bubble element; text: full reply accumulated so far (not just the new token)
+      div._pending = text;  // always remember the newest text; the queued frame reads it when it runs
+      if (div._raf) return;  // a render is already scheduled and will pick up _pending
+      div._raf = requestAnimationFrame(() => {
+        div._raf = 0;  // clear the handle so the next update can schedule a new frame
+        const messagesEl = document.getElementById("messages");
+        const stick = messagesEl.scrollHeight - messagesEl.scrollTop - messagesEl.clientHeight < 90;  // within 90px of the bottom, measured before re-rendering
+        div.innerHTML = renderMarkdown(div._pending);  // NOTE(review): assumes renderMarkdown escapes raw HTML — confirm
+        div.dataset.raw = div._pending;  // keep the unrendered text on the element
+        if (stick) messagesEl.scrollTop = messagesEl.scrollHeight;  // follow only if near bottom
+      });
+    }
+
+    function finalizeAssistantBubble(div, text) {  // Renders the final reply text into the bubble and ends its streaming state.
+      if (div._raf) { cancelAnimationFrame(div._raf); div._raf = 0; }  // drop any queued render so it cannot overwrite the final text
+      div.classList.remove("streaming");
+      div.innerHTML = renderMarkdown(text);  // NOTE(review): assumes renderMarkdown escapes raw HTML — confirm
+      div.dataset.raw = text;  // keep the unrendered text on the element
+      addRateBar(div);  // defined outside this view; presumably attaches rating controls — verify
+      const messagesEl = document.getElementById("messages");
+      requestAnimationFrame(() => messagesEl.scrollTo({ top: messagesEl.scrollHeight, behavior: "smooth" }));  // one smooth scroll to the bottom, on the next frame
    }

 	function renderMarkdown(text) {