"""The chat turn: assemble the prompt (lyra.mind) then speak + persist.

`mind.assemble()` runs the society of parts (perceive → route → compose →
deliberate) and hands back a ready message list + the active mode. Then:

  - the MIND (the chat backend/model) runs the tool/generation loop — decide,
    reason, run tools — and produces a draft.
  - the MOUTH (a separate character model, if configured) re-voices that draft
    in her own voice.

Default: no mouth configured → the mind's draft IS the reply (bit-for-bit the
old behavior). The mouth slot is where a fine-tuned voice lands.
"""
from __future__ import annotations

from collections.abc import Iterator

from lyra import config, llm, logbus, memory, mind, modes, summary
from lyra import tools as toolkit
from lyra.llm import Backend

MAX_TOOL_ROUNDS = 5  # cap tool-call iterations per turn

# Backends that support function-calling. The MI50's llama.cpp server only does
# tools when launched with --jinja; until it is, keep tools to cloud so MI50 chat
# doesn't 500 on the tools param. Add "mi50" here once that flag is set.
TOOL_BACKENDS = {"cloud"}

# Fallback reply used whenever a turn ends without any text to say.
_TANGLED = "(I got tangled using my tools there — say that again?)"


def _resolve_model(backend: Backend, model_override: str | None, cfg) -> str:
    """Pick the model name for this turn.

    Live chat uses the stronger chat_model on cloud; local/mi50 use their own.
    The UI's cloud-model picker (`model_override`) only applies on the cloud
    backend. A backend that is not one of the three known keys falls back to
    the backend value itself.
    """
    model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get(
        backend, backend
    )
    if model_override and backend == "cloud":
        model = model_override
    return model


def _mouth_target(
    cfg, mind_backend: Backend, mind_model: str | None
) -> tuple[Backend, str | None] | None:
    """Return the mouth `(backend, model)` if configured AND different from the mind.

    Returns None when neither `cfg.mouth_backend` nor `cfg.mouth_model` is set,
    or when the resolved pair equals the mind's pair (mouth == mind → no
    separate voice pass). A missing mouth backend inherits the mind's backend;
    a missing mouth model is passed on as None.
    """
    if not cfg.mouth_backend and not cfg.mouth_model:
        return None
    backend = cfg.mouth_backend or mind_backend
    model = cfg.mouth_model or None
    if backend == mind_backend and model == mind_model:
        return None
    return backend, model


def _maybe_switch_mode(session_id: str, tool_name: str) -> None:
    """Opening a poker session auto-flips this chat into Poker mode.

    Only the `start_session` tool triggers the switch. Manual UI switching
    still overrides anytime.
    """
    if tool_name == "start_session":
        memory.set_session_mode(session_id, modes.CASH.key)
        logbus.log("info", "mode auto-switch", session=session_id, mode=modes.CASH.key)


def _dispatch_tools(messages, tool_calls, ctx: dict, session_id: str) -> Iterator[str]:
    """Run the requested tool calls in order, yielding each tool name after it runs.

    For every call: dispatch it, log a truncated result, append the tool result
    message to `messages` (mutated in place), and apply any mode auto-switch.
    This is a generator so the streaming path can surface a ("tool", name)
    event between calls; the work for a call happens only as it is consumed.
    """
    for tc in tool_calls:
        name = tc["name"]
        result = toolkit.dispatch(name, tc["arguments"], ctx)
        logbus.log("info", "tool call", session=session_id, tool=name, result=result[:80])
        messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
        _maybe_switch_mode(session_id, name)
        yield name


def _finish_turn(session_id: str, user_msg: str, reply: str, *, voiced: bool) -> None:
    """Log the final reply, persist both sides of the exchange, and kick off summarizing."""
    logbus.log("info", "reply", session=session_id, chars=len(reply), voiced=voiced)
    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
    summary.maybe_summarize_async(session_id)  # compact once enough new turns pile up


def _mind_loop(
    messages,
    backend: Backend,
    model: str | None,
    tool_specs,
    ctx: dict,
    session_id: str,
) -> tuple[str, list[str]]:
    """Run the tool/generation loop on the MIND model (non-streaming).

    Mutates `messages` with tool calls/results. Stops at the first round that
    requests no tools, or after MAX_TOOL_ROUNDS rounds.

    Returns (draft_reply, tool_names_run). The draft is "" when every round
    asked for tools and the cap was hit.
    """
    tools_run: list[str] = []
    reply = ""
    for _ in range(MAX_TOOL_ROUNDS):
        assistant_msg, tool_calls = llm.chat_call(
            messages, backend=backend, model=model, tools=tool_specs
        )
        if not tool_calls:
            reply = assistant_msg.get("content") or ""
            break
        messages.append(assistant_msg)
        tools_run.extend(_dispatch_tools(messages, tool_calls, ctx, session_id))
    return reply, tools_run


def _voice_pass(messages, draft: str, backend: Backend, model: str | None) -> str:
    """Mouth: re-render the mind's draft in her voice.

    Falls back to the draft when the mouth call raises or returns blank text.
    """
    try:
        out = llm.complete(mind.voice_messages(messages, draft), backend=backend, model=model)
        return (out or "").strip() or draft
    except Exception as exc:
        # Best-effort: a broken mouth must never cost the user the mind's answer.
        logbus.log("error", "voice pass failed", error=str(exc)[:160])
        return draft


def respond(
    session_id: str,
    user_msg: str,
    backend: Backend = "cloud",
    model_override: str | None = None,
) -> str:
    """Produce Lyra's reply to a single user message and persist the exchange.

    Returns the final reply text: the voiced draft when a mouth is configured,
    the mind's draft otherwise, or the `_TANGLED` fallback when the mind
    produced no text.
    """
    cfg = config.load()
    model = _resolve_model(backend, model_override, cfg)
    logbus.log("info", "chat request", session=session_id, backend=backend, model=model,
               embed=cfg.embed_backend)

    turn = mind.assemble(session_id, user_msg, backend, model)
    messages = turn.messages
    tool_specs = toolkit.specs(turn.mode.tools) if backend in TOOL_BACKENDS else None
    ctx = {"session_id": session_id, "backend": backend}

    reply, _ = _mind_loop(messages, backend, model, tool_specs, ctx, session_id)

    mouth = _mouth_target(cfg, backend, model)
    if mouth and reply:
        reply = _voice_pass(messages, reply, *mouth)

    if not reply:
        reply = _TANGLED

    _finish_turn(session_id, user_msg, reply, voiced=bool(mouth))
    return reply


def respond_stream(
    session_id: str,
    user_msg: str,
    backend: Backend = "cloud",
    model_override: str | None = None,
) -> Iterator[tuple[str, str]]:
    """Streaming generator version of `respond`.

    Yields ("delta", text), ("tool", name), and a final ("done", reply).
    Same side effects as `respond`.
    """
    cfg = config.load()
    model = _resolve_model(backend, model_override, cfg)
    logbus.log("info", "chat request (stream)", session=session_id, backend=backend,
               model=model, embed=cfg.embed_backend)

    turn = mind.assemble(session_id, user_msg, backend, model)
    messages = turn.messages
    tool_specs = toolkit.specs(turn.mode.tools) if backend in TOOL_BACKENDS else None
    ctx = {"session_id": session_id, "backend": backend}
    mouth = _mouth_target(cfg, backend, model)

    if mouth is None:
        # No separate voice: stream the mind directly (the original path, unchanged).
        parts: list[str] = []
        for _ in range(MAX_TOOL_ROUNDS):
            assistant_msg = None
            tool_calls = None
            for ev, payload in llm.chat_call_stream(
                messages, backend=backend, model=model, tools=tool_specs
            ):
                if ev == "delta":
                    parts.append(payload)
                    yield ("delta", payload)
                elif ev == "message":
                    assistant_msg = payload
                elif ev == "tool_calls":
                    tool_calls = payload
            if not tool_calls:
                break
            messages.append(assistant_msg)
            for name in _dispatch_tools(messages, tool_calls, ctx, session_id):
                yield ("tool", name)
        reply = "".join(parts)
        if not reply:
            reply = _TANGLED
            yield ("delta", reply)
    else:
        # Mind decides + runs tools (non-streamed); mouth re-voices, streamed.
        draft, tools_run = _mind_loop(messages, backend, model, tool_specs, ctx, session_id)
        for name in tools_run:
            yield ("tool", name)
        parts = []
        # Like `respond`, only voice a non-empty draft: with nothing to re-voice
        # the turn should fall through to the _TANGLED fallback.
        if draft:
            try:
                for ev, payload in llm.chat_call_stream(
                    mind.voice_messages(messages, draft),
                    backend=mouth[0],
                    model=mouth[1],
                    tools=None,
                ):
                    if ev == "delta":
                        parts.append(payload)
                        yield ("delta", payload)
            except Exception as exc:
                # Best-effort: keep whatever was already streamed, or fall back below.
                logbus.log("error", "voice stream failed", error=str(exc)[:160])
        voiced_text = "".join(parts).strip()
        reply = voiced_text or draft or _TANGLED
        # Emit the fallback whenever the mouth produced no visible text — including
        # whitespace-only deltas — so the client shows what gets persisted.
        if not voiced_text:
            yield ("delta", reply)

    _finish_turn(session_id, user_msg, reply, voiced=bool(mouth))
    yield ("done", reply)