feat(web): stream chat replies token-by-token (M3)
- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON,
OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction,
yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a
worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render
(no full re-parse per token) and instant scroll during stream — fixes iOS
Safari ghosting from per-token smooth-scroll. Falls back to the blocking
endpoint only if nothing streamed (no double-persist).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -180,6 +180,7 @@
|
||||
<script>
|
||||
const RELAY_BASE = ""; // same-origin: served by lyra.web.server
|
||||
const API_URL = `${RELAY_BASE}/v1/chat/completions`;
|
||||
const STREAM_URL = `${RELAY_BASE}/v1/chat/stream`;
|
||||
|
||||
function generateSessionId() {
|
||||
return "sess-" + Math.random().toString(36).substring(2, 10);
|
||||
@@ -308,21 +309,101 @@
|
||||
body.model = cloudModel;
|
||||
}
|
||||
|
||||
// Stream the reply token-by-token (SSE). Fall back to the blocking
|
||||
// endpoint only if nothing streamed (e.g. streaming unavailable).
|
||||
const div = createAssistantBubble();
|
||||
let full = "";
|
||||
try {
|
||||
const resp = await fetch(API_URL, {
|
||||
const resp = await fetch(STREAM_URL, {
|
||||
method: "POST",
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify(body)
|
||||
});
|
||||
if (!resp.ok || !resp.body) throw new Error("HTTP " + resp.status);
|
||||
|
||||
const data = await resp.json();
|
||||
const reply = data.choices?.[0]?.message?.content || "(no reply)";
|
||||
addMessage("assistant", reply);
|
||||
history.push({ role: "assistant", content: reply });
|
||||
await saveSession();
|
||||
const reader = resp.body.getReader();
|
||||
const decoder = new TextDecoder();
|
||||
let buf = "";
|
||||
for (;;) {
|
||||
const { value, done } = await reader.read();
|
||||
if (done) break;
|
||||
buf += decoder.decode(value, { stream: true });
|
||||
let i;
|
||||
while ((i = buf.indexOf("\n\n")) !== -1) {
|
||||
const frame = buf.slice(0, i).trim();
|
||||
buf = buf.slice(i + 2);
|
||||
if (!frame.startsWith("data:")) continue;
|
||||
let evt;
|
||||
try { evt = JSON.parse(frame.slice(5).trim()); } catch (e) { continue; }
|
||||
if (evt.type === "delta") {
|
||||
full += evt.payload;
|
||||
updateAssistantBubble(div, full);
|
||||
} else if (evt.type === "done") {
|
||||
if (evt.payload) full = evt.payload;
|
||||
} else if (evt.type === "error") {
|
||||
throw new Error(evt.payload);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
addMessage("system", "Error: " + err.message);
|
||||
if (!full) {
|
||||
div.remove();
|
||||
try {
|
||||
const resp = await fetch(API_URL, {
|
||||
method: "POST",
|
||||
headers: { "Content-Type": "application/json" },
|
||||
body: JSON.stringify(body)
|
||||
});
|
||||
const data = await resp.json();
|
||||
const reply = data.choices?.[0]?.message?.content || "(no reply)";
|
||||
addMessage("assistant", reply);
|
||||
history.push({ role: "assistant", content: reply });
|
||||
await saveSession();
|
||||
} catch (err2) {
|
||||
addMessage("system", "Error: " + err2.message);
|
||||
}
|
||||
return;
|
||||
}
|
||||
// Partial content arrived before the error — keep what we streamed.
|
||||
}
|
||||
|
||||
finalizeAssistantBubble(div, full || "(no reply)");
|
||||
history.push({ role: "assistant", content: full || "(no reply)" });
|
||||
await saveSession();
|
||||
}
|
||||
|
||||
/**
 * Create an empty assistant bubble in its "streaming" state and append it to
 * the `#messages` container.
 *
 * The container is jumped (not smooth-scrolled) to its bottom so the new
 * bubble is visible as soon as it is inserted.
 *
 * @returns {HTMLDivElement} The freshly appended bubble element.
 */
function createAssistantBubble() {
  const bubble = document.createElement("div");
  bubble.className = "msg assistant streaming";

  const list = document.getElementById("messages");
  list.appendChild(bubble);
  // Direct scrollTop assignment: an instant jump rather than an animated scroll.
  list.scrollTop = list.scrollHeight;

  return bubble;
}
|
||||
|
||||
/**
 * Queue a re-render of a streaming assistant bubble with the latest text.
 *
 * Calls are coalesced: the newest text is stashed on the element and at most
 * one animation-frame callback is outstanding per bubble, so a burst of tokens
 * produces a single markdown render per frame instead of one per token.
 *
 * @param {HTMLElement} div  Bubble element being streamed into.
 * @param {string} text      Full accumulated reply text so far (not a delta).
 */
function updateAssistantBubble(div, text) {
  // Always remember the newest text; the queued frame renders whatever is
  // latest at the time it runs.
  div._pending = text;

  if (!div._raf) {
    const paint = () => {
      div._raf = 0;

      const list = document.getElementById("messages");
      // Measure distance from the bottom BEFORE the content grows, so the
      // decision reflects where the reader was, not where the new text puts them.
      const gapToBottom = list.scrollHeight - list.scrollTop - list.clientHeight;

      const latest = div._pending;
      // NOTE(review): assumes renderMarkdown returns HTML that is safe to
      // assign to innerHTML — confirm against its implementation.
      div.innerHTML = renderMarkdown(latest);
      div.dataset.raw = latest;

      // Follow the stream only when the reader is within 90px of the bottom.
      if (gapToBottom < 90) {
        list.scrollTop = list.scrollHeight;
      }
    };

    div._raf = requestAnimationFrame(paint);
  }
}
|
||||
|
||||
/**
 * Switch a streaming assistant bubble to its final state.
 *
 * Cancels any render still queued on the bubble, removes the "streaming"
 * class, renders the final text, records the raw text in `data-raw`, and
 * attaches the rating bar. The closing smooth scroll to the bottom only
 * happens when the reader was already within 90px of the bottom before the
 * final render — the same threshold the per-frame streaming render uses — so
 * a reader who scrolled up mid-stream is not pulled back down.
 *
 * @param {HTMLElement} div  Bubble element that was being streamed into.
 * @param {string} text      Final reply text to render.
 */
function finalizeAssistantBubble(div, text) {
  // Drop any queued frame so it cannot overwrite the final render.
  if (div._raf) {
    cancelAnimationFrame(div._raf);
    div._raf = 0;
  }

  const messagesEl = document.getElementById("messages");
  // Measure before the DOM changes: the final render and the rating bar add
  // height, which would otherwise make a bottom-pinned reader look "scrolled up".
  const stick = messagesEl.scrollHeight - messagesEl.scrollTop - messagesEl.clientHeight < 90;

  div.classList.remove("streaming");
  div.innerHTML = renderMarkdown(text);
  div.dataset.raw = text;
  addRateBar(div);

  if (!stick) return;

  // One-shot smooth scroll on the next frame, after the new content is in place.
  requestAnimationFrame(() => messagesEl.scrollTo({ top: messagesEl.scrollHeight, behavior: "smooth" }));
}
|
||||
|
||||
function renderMarkdown(text) {
|
||||
|
||||
@@ -139,7 +139,9 @@ button:hover, select:hover {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 8px;
|
||||
scroll-behavior: smooth;
|
||||
/* No CSS smooth-scroll: during streaming, per-token smooth scrolls pile up and
|
||||
iOS Safari leaves ghost paint frames. Smooth is applied explicitly in JS where
|
||||
it's a one-shot (load/finalize). */
|
||||
}
|
||||
|
||||
/* Messages */
|
||||
@@ -1090,6 +1092,16 @@ select:hover {
|
||||
}
|
||||
.msg.assistant pre code { background: none; padding: 0; font-size: 0.85em; }
|
||||
|
||||
/* Streaming: a blinking caret while tokens arrive (and a min-size while empty). */
|
||||
.msg.assistant.streaming { min-width: 1.4em; min-height: 1.1em; }
|
||||
.msg.assistant.streaming::after {
|
||||
content: "▋";
|
||||
margin-left: 1px;
|
||||
color: var(--accent);
|
||||
animation: caretBlink 1s steps(1) infinite;
|
||||
}
|
||||
@keyframes caretBlink { 0%, 50% { opacity: 0.85; } 50.01%, 100% { opacity: 0; } }
|
||||
|
||||
/* Behind-the-scenes 👍/👎 feedback (fine-tune signal) — subtle until hovered. */
|
||||
.rate-bar { display: flex; gap: 6px; margin-top: 7px; opacity: 0.3; transition: opacity .15s; }
|
||||
.msg.assistant:hover .rate-bar { opacity: 0.85; }
|
||||
|
||||
Reference in New Issue
Block a user