feat(P3): mind/mouth split — separate voice model for the final reply (seam, default off)

The mind (chat backend/model) decides, reasons, and runs tools → a draft; the mouth re-voices that draft in her character. Default: no mouth configured → the mind's draft IS the reply, bit-for-bit the old behavior (and old streaming path untouched).

- config: MOUTH_BACKEND / MOUTH_MODEL. The slot for an eventual fine-tuned voice.
- chat: _mind_loop (tool/generation loop, non-stream, returns draft + tools_run), _voice_pass / mind.voice_messages (re-voice the draft, keep every fact/number), _mouth_target (active only when configured AND != mind). respond + respond_stream branch: mouth off = stream the mind directly (unchanged); mouth on = mind decides + runs tools, then the mouth streams the re-voiced reply. Falls back to the draft on any mouth failure (chat never breaks).
- Key payoff: the mouth needs no tool support (the mind handles tools), so it can be a non-tool character model (Dolphin / Claude / fine-tune). Makes the fine-tune easy: teach a small model to *sound* like Lyra, not to be smart.
- tests: mouth target on/off, voice_messages shape, voice_pass revoice+fallback. Suite 96 green, ruff clean.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 06:08:06 +00:00
parent a7af461cdb
commit 03aceec6fa
5 changed files with 183 additions and 58 deletions
@@ -1,9 +1,12 @@
 """The chat turn: assemble the prompt (lyra.mind) then speak + persist.

 `mind.assemble()` runs the society of parts (perceive → route → compose →
-deliberate) and hands back a ready message list + the active mode; `chat` runs the
-tool/generation loop (the "speak" part) and persists the exchange. Keeping speak
-here (not in mind) is deliberate — it's tangled with streaming and tool dispatch.
+deliberate) and hands back a ready message list + the active mode. Then:
+  - the MIND (the chat backend/model) runs the tool/generation loop — decide,
+    reason, run tools — and produces a draft.
+  - the MOUTH (a separate character model, if configured) re-voices that draft in
+    her own voice. Default: no mouth configured → the mind's draft IS the reply
+    (bit-for-bit the old behavior). The mouth slot is where a fine-tuned voice lands.
 """
 from __future__ import annotations

@@ -16,6 +19,7 @@ MAX_TOOL_ROUNDS = 5  # cap tool-call iterations per turn
 # tools when launched with --jinja; until it is, keep tools to cloud so MI50 chat
 # doesn't 500 on the tools param. Add "mi50" here once that flag is set.
 TOOL_BACKENDS = {"cloud"}
+_TANGLED = "(I got tangled using my tools there — say that again?)"


 def _resolve_model(backend: Backend, model_override: str | None, cfg) -> str:
@@ -29,15 +33,59 @@ def _resolve_model(backend: Backend, model_override: str | None, cfg) -> str:
    return model


+def _mouth_target(cfg, mind_backend: Backend, mind_model: str | None):
+    """The mouth (backend, model) if configured AND different from the mind; else None
+    (mouth == mind → no separate voice pass)."""
+    if not cfg.mouth_backend and not cfg.mouth_model:
+        return None
+    backend = cfg.mouth_backend or mind_backend
+    model = cfg.mouth_model or None
+    if backend == mind_backend and model == mind_model:
+        return None
+    return backend, model
+
+
 def _maybe_switch_mode(session_id: str, tool_name: str) -> None:
-    """Keep the chat framing aligned with the live data: opening a poker session
-    auto-flips this chat into Poker mode (next turn gets the card + full live tools).
-    Manual UI switching still overrides anytime."""
+    """Opening a poker session auto-flips this chat into Poker mode. Manual UI switching
+    still overrides anytime."""
    if tool_name == "start_session":
        memory.set_session_mode(session_id, modes.CASH.key)
        logbus.log("info", "mode auto-switch", session=session_id, mode=modes.CASH.key)


+def _mind_loop(messages, backend: Backend, model: str | None, tool_specs,
+               ctx: dict, session_id: str) -> tuple[str, list[str]]:
+    """Run the tool/generation loop on the MIND model (non-streaming). Mutates
+    `messages` with tool calls/results. Returns (draft_reply, tool_names_run)."""
+    tools_run: list[str] = []
+    reply = ""
+    for _ in range(MAX_TOOL_ROUNDS):
+        assistant_msg, tool_calls = llm.chat_call(
+            messages, backend=backend, model=model, tools=tool_specs
+        )
+        if not tool_calls:
+            reply = assistant_msg.get("content") or ""
+            break
+        messages.append(assistant_msg)
+        for tc in tool_calls:
+            result = toolkit.dispatch(tc["name"], tc["arguments"], ctx)
+            logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80])
+            messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
+            _maybe_switch_mode(session_id, tc["name"])
+            tools_run.append(tc["name"])
+    return reply, tools_run
+
+
+def _voice_pass(messages, draft: str, backend: Backend, model: str | None) -> str:
+    """Mouth: re-render the mind's draft in her voice. Falls back to the draft on failure."""
+    try:
+        out = llm.complete(mind.voice_messages(messages, draft), backend=backend, model=model)
+        return (out or "").strip() or draft
+    except Exception as exc:
+        logbus.log("error", "voice pass failed", error=str(exc)[:160])
+        return draft
+
+
 def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
            model_override: str | None = None) -> str:
    """Produce Lyra's reply to a single user message and persist the exchange."""
@@ -48,28 +96,16 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",

    turn = mind.assemble(session_id, user_msg, backend, model)
    messages = turn.messages
-
-    # Tool loop (speak): offer her tools (scoped to the mode); run any she calls and
-    # feed results back until she returns a text reply.
    tool_specs = toolkit.specs(turn.mode.tools) if backend in TOOL_BACKENDS else None
    ctx = {"session_id": session_id, "backend": backend}
-    reply = ""
-    for _ in range(MAX_TOOL_ROUNDS):
-        assistant_msg, tool_calls = llm.chat_call(
-            messages, backend=backend, model=model, tools=tool_specs
-        )
-        if not tool_calls:
-            reply = assistant_msg.get("content") or ""
-            break
-        messages.append(assistant_msg)  # her tool-call request
-        for tc in tool_calls:
-            result = toolkit.dispatch(tc["name"], tc["arguments"], ctx)
-            logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80])
-            messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
-            _maybe_switch_mode(session_id, tc["name"])
+
+    reply, _ = _mind_loop(messages, backend, model, tool_specs, ctx, session_id)
+    mouth = _mouth_target(cfg, backend, model)
+    if mouth and reply:
+        reply = _voice_pass(messages, reply, *mouth)
    if not reply:
-        reply = "(I got tangled using my tools there — say that again?)"
-    logbus.log("info", "reply", session=session_id, chars=len(reply))
+        reply = _TANGLED
+    logbus.log("info", "reply", session=session_id, chars=len(reply), voiced=bool(mouth))

    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
@@ -79,11 +115,8 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud",

 def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
                   model_override: str | None = None):
-    """Streaming generator version of `respond`.
-
-    Yields ("delta", text) as content streams in, ("tool", name) when a tool runs,
-    and a final ("done", reply). Persists the exchange — same side effects as `respond`.
-    """
+    """Streaming generator version of `respond`. Yields ("delta", text), ("tool", name),
+    and a final ("done", reply). Same side effects as `respond`."""
    cfg = config.load()
    model = _resolve_model(backend, model_override, cfg)
    logbus.log("info", "chat request (stream)", session=session_id, backend=backend,
@@ -93,36 +126,57 @@ def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
    messages = turn.messages
    tool_specs = toolkit.specs(turn.mode.tools) if backend in TOOL_BACKENDS else None
    ctx = {"session_id": session_id, "backend": backend}
-    parts: list[str] = []
-    for _ in range(MAX_TOOL_ROUNDS):
-        assistant_msg = None
-        tool_calls = None
-        for ev, payload in llm.chat_call_stream(
-            messages, backend=backend, model=model, tools=tool_specs
-        ):
-            if ev == "delta":
-                parts.append(payload)
-                yield ("delta", payload)
-            elif ev == "message":
-                assistant_msg = payload
-            elif ev == "tool_calls":
-                tool_calls = payload
-        if not tool_calls:
-            break
-        messages.append(assistant_msg)  # her tool-call request
-        for tc in tool_calls:
-            result = toolkit.dispatch(tc["name"], tc["arguments"], ctx)
-            logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80])
-            messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
-            _maybe_switch_mode(session_id, tc["name"])
-            yield ("tool", tc["name"])
+    mouth = _mouth_target(cfg, backend, model)

-    reply = "".join(parts)
-    if not reply:
-        reply = "(I got tangled using my tools there — say that again?)"
-        yield ("delta", reply)
-    logbus.log("info", "reply", session=session_id, chars=len(reply))
+    if mouth is None:
+        # No separate voice: stream the mind directly (the original path, unchanged).
+        parts: list[str] = []
+        for _ in range(MAX_TOOL_ROUNDS):
+            assistant_msg = None
+            tool_calls = None
+            for ev, payload in llm.chat_call_stream(
+                messages, backend=backend, model=model, tools=tool_specs
+            ):
+                if ev == "delta":
+                    parts.append(payload)
+                    yield ("delta", payload)
+                elif ev == "message":
+                    assistant_msg = payload
+                elif ev == "tool_calls":
+                    tool_calls = payload
+            if not tool_calls:
+                break
+            messages.append(assistant_msg)
+            for tc in tool_calls:
+                result = toolkit.dispatch(tc["name"], tc["arguments"], ctx)
+                logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80])
+                messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
+                _maybe_switch_mode(session_id, tc["name"])
+                yield ("tool", tc["name"])
+        reply = "".join(parts)
+        if not reply:
+            reply = _TANGLED
+            yield ("delta", reply)
+    else:
+        # Mind decides + runs tools (non-streamed); mouth re-voices, streamed.
+        draft, tools_run = _mind_loop(messages, backend, model, tool_specs, ctx, session_id)
+        for name in tools_run:
+            yield ("tool", name)
+        parts = []
+        try:
+            for ev, payload in llm.chat_call_stream(
+                mind.voice_messages(messages, draft), backend=mouth[0], model=mouth[1], tools=None
+            ):
+                if ev == "delta":
+                    parts.append(payload)
+                    yield ("delta", payload)
+        except Exception as exc:
+            logbus.log("error", "voice stream failed", error=str(exc)[:160])
+        reply = "".join(parts).strip() or draft or _TANGLED
+        if not parts:
+            yield ("delta", reply)

+    logbus.log("info", "reply", session=session_id, chars=len(reply), voiced=bool(mouth))
    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
    summary.maybe_summarize_async(session_id)