feat(web): stream chat replies token-by-token (M3)
- llm.chat_call_stream: streaming generator for all 3 backends (Ollama NDJSON,
OpenAI/MI50 SSE), accumulating tool-call fragments by index.
- chat.respond_stream: mirrors respond()'s tool loop and persistence/compaction,
yielding ("delta", text) / ("tool", name) / ("done", reply).
- POST /v1/chat/stream: SSE endpoint; blocking generator bridged to async via a
worker thread + asyncio.Queue. Old completions endpoint kept as fallback.
- Client streams into a live bubble with a blinking caret; rAF-throttled render
(no full re-parse per token) and instant scroll during stream — fixes iOS
Safari ghosting from per-token smooth-scroll. Falls back to the blocking
endpoint only if nothing streamed (no double-persist).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -111,6 +111,45 @@ def create_app() -> FastAPI:
|
||||
],
|
||||
}
|
||||
|
||||
@app.post("/v1/chat/stream")
async def chat_stream(request: Request) -> StreamingResponse:
    """Server-Sent Events: stream Lyra's reply token-by-token.

    `chat.respond_stream` is a blocking generator (httpx/openai), so it runs in
    a worker thread and bridges chunks to this async generator via a queue.

    The JSON request body is read for ``sessionId`` (falls back to
    ``"default"``), ``backend``, ``messages`` and ``model``. Each event the
    generator produces is emitted as one SSE frame of the form
    ``data: {"type": <event>, "payload": <payload>}``. An exception raised in
    the worker is logged and reported to the client as a last
    ``{"type": "error", "payload": <message>}`` frame instead of leaving the
    stream hanging.
    """
    body = await request.json()
    session_id = body.get("sessionId") or "default"
    backend = _backend_for(body.get("backend"))
    user_msg = _last_user_message(body.get("messages", []))
    model_override = body.get("model") or None
    memory.ensure_session(session_id)

    async def gen():
        loop = asyncio.get_running_loop()
        q: asyncio.Queue = asyncio.Queue()
        # Unique end-of-stream marker; compared by identity so it can never
        # collide with a real event.
        done = object()

        def produce():
            # Runs in an executor thread. asyncio.Queue is not thread-safe, so
            # every put is scheduled onto the event loop instead of called
            # directly.
            try:
                for event in chat.respond_stream(session_id, user_msg, backend, model_override):
                    loop.call_soon_threadsafe(q.put_nowait, event)
            except Exception as exc:  # surface to the client stream, don't hang
                logbus.log("error", "chat stream failed", session=session_id, error=str(exc))
                loop.call_soon_threadsafe(q.put_nowait, ("error", str(exc)))
            finally:
                # Always release the consumer loop below, on success or failure.
                loop.call_soon_threadsafe(q.put_nowait, done)

        loop.run_in_executor(None, produce)
        while True:
            item = await q.get()
            if item is done:
                break
            ev, payload = item
            yield f"data: {json.dumps({'type': ev, 'payload': payload})}\n\n"

    return StreamingResponse(
        gen(),
        media_type="text/event-stream",
        headers={
            # An event stream must never be cached or replayed by an
            # intermediary.
            "Cache-Control": "no-cache",
            # Ask nginx-style reverse proxies not to buffer the body; with
            # buffering on, tokens reach the client in one burst at the end.
            "X-Accel-Buffering": "no",
        },
    )
|
||||
|
||||
@app.get("/logs")
|
||||
async def logs_page() -> FileResponse:
|
||||
"""Full-page, mobile-friendly live log viewer (separate from the chat UI)."""
|
||||
|
||||
Reference in New Issue
Block a user