project-lyra/lyra/chat.py

"""The chat turn: assemble the prompt (lyra.mind) then speak + persist.

`mind.assemble()` runs the society of parts (perceive → route → compose →
deliberate) and hands back a ready message list + the active mode. Then:
  - the MIND (the chat backend/model) runs the tool/generation loop — decide,
    reason, run tools — and produces a draft.
  - the MOUTH (a separate character model, if configured) re-voices that draft in
    her own voice. Default: no mouth configured → the mind's draft IS the reply
    (bit-for-bit the old behavior). The mouth slot is where a fine-tuned voice lands.
"""
from __future__ import annotations

from lyra import config, llm, logbus, memory, mind, modes, summary
from lyra import tools as toolkit
from lyra.llm import Backend

# Upper bound on how many times one turn may go around the
# "model asks for tools -> tools run -> model is called again" loop.
MAX_TOOL_ROUNDS = 5  # cap tool-call iterations per turn
# Backends that support function-calling. The MI50's llama.cpp server only does
# tools when launched with --jinja; until it is, keep tools to cloud so MI50 chat
# doesn't 500 on the tools param. Add "mi50" here once that flag is set.
# Backends not listed here are called with tools=None.
TOOL_BACKENDS = {"cloud"}
# Fallback reply used whenever a turn ends with no reply text at all.
_TANGLED = "(I got tangled using my tools there — say that again?)"


def _resolve_model(backend: Backend, model_override: str | None, cfg) -> str:
    """Pick the model name for this turn.

    Each known backend maps to its configured model (cloud uses the stronger
    `chat_model`); an unknown backend falls back to the backend value itself.
    A non-empty `model_override` (the UI's cloud-model picker) wins, but only
    when the backend is "cloud".
    """
    # Built up front so all three config attributes are always read.
    per_backend = {
        "local": cfg.local_model,
        "cloud": cfg.chat_model,
        "mi50": cfg.mi50_model,
    }
    if backend == "cloud" and model_override:
        return model_override
    return per_backend.get(backend, backend)


def _mouth_target(cfg, mind_backend: Backend, mind_model: str | None):
    """Return the mouth's (backend, model) pair, or None when no separate voice
    pass should run.

    None is returned when neither `cfg.mouth_backend` nor `cfg.mouth_model` is
    set, or when the resolved pair equals the mind's own (backend, model).
    A missing mouth backend defaults to the mind's backend; a missing mouth
    model is reported as None.
    """
    configured_backend = cfg.mouth_backend
    configured_model = cfg.mouth_model
    if not (configured_backend or configured_model):
        return None

    voice_backend = configured_backend or mind_backend
    voice_model = configured_model or None
    if voice_backend != mind_backend or voice_model != mind_model:
        return voice_backend, voice_model
    # Mouth resolves to exactly the mind: nothing to re-voice with.
    return None


def _maybe_switch_mode(session_id: str, tool_name: str) -> None:
    """Flip the chat into Poker (cash) mode when the `start_session` tool ran.

    Any other tool name is a no-op. Manual UI switching still overrides anytime.
    """
    if tool_name != "start_session":
        return
    memory.set_session_mode(session_id, modes.CASH.key)
    logbus.log("info", "mode auto-switch", session=session_id, mode=modes.CASH.key)


def _mind_loop(messages, backend: Backend, model: str | None, tool_specs,
               ctx: dict, session_id: str) -> tuple[str, list[str]]:
    """Drive the MIND model through its tool/generation rounds (non-streaming).

    Each round calls the model once. If it asks for tools, the assistant message
    and every tool result are appended to `messages` (mutated in place) and the
    next round begins. The first round with no tool calls ends the loop.

    Returns (draft_reply, tool_names_run). The draft is "" when the model gave
    no content, or when MAX_TOOL_ROUNDS ran out while it was still asking for
    tools.
    """
    executed: list[str] = []
    for _round in range(MAX_TOOL_ROUNDS):
        answer, requested = llm.chat_call(
            messages, backend=backend, model=model, tools=tool_specs
        )
        if not requested:
            # Plain answer: this is the draft.
            return answer.get("content") or "", executed

        messages.append(answer)
        for call in requested:
            tool_name = call["name"]
            output = toolkit.dispatch(tool_name, call["arguments"], ctx)
            logbus.log("info", "tool call", session=session_id, tool=tool_name, result=output[:80])
            messages.append({"role": "tool", "tool_call_id": call["id"], "content": output})
            _maybe_switch_mode(session_id, tool_name)
            executed.append(tool_name)

    # Round cap reached with tools still being requested: no draft.
    return "", executed


def _voice_pass(messages, draft: str, backend: Backend, model: str | None) -> str:
    """Mouth: ask the voice model to re-render `draft` and return the result.

    Best-effort: if the call raises, or the output is empty/whitespace-only,
    the original draft is returned instead. Failures are logged.
    """
    try:
        prompt = mind.voice_messages(messages, draft)
        voiced = llm.complete(prompt, backend=backend, model=model)
        cleaned = (voiced or "").strip()
        return cleaned if cleaned else draft
    except Exception as exc:
        logbus.log("error", "voice pass failed", error=str(exc)[:160])
        return draft


def respond(session_id: str, user_msg: str, backend: Backend = "cloud",
            model_override: str | None = None) -> str:
    """Answer one user message (non-streaming) and record the exchange.

    Steps: resolve the model, assemble the prompt via `mind.assemble`, run the
    mind's tool loop, optionally re-voice the draft through the mouth, then
    store both sides of the exchange and kick off summarization. Tools are only
    offered on backends listed in TOOL_BACKENDS. An empty draft becomes the
    `_TANGLED` fallback and is never sent to the mouth.
    """
    settings = config.load()
    chosen_model = _resolve_model(backend, model_override, settings)
    logbus.log("info", "chat request", session=session_id, backend=backend,
               model=chosen_model, embed=settings.embed_backend)

    assembled = mind.assemble(session_id, user_msg, backend, chosen_model)
    history = assembled.messages
    if backend in TOOL_BACKENDS:
        specs = toolkit.specs(assembled.mode.tools)
    else:
        specs = None
    tool_ctx = {"session_id": session_id, "backend": backend}

    answer, _tools_run = _mind_loop(history, backend, chosen_model, specs, tool_ctx, session_id)
    voice = _mouth_target(settings, backend, chosen_model)
    if voice and answer:
        voice_backend, voice_model = voice
        answer = _voice_pass(history, answer, voice_backend, voice_model)
    answer = answer or _TANGLED
    logbus.log("info", "reply", session=session_id, chars=len(answer), voiced=bool(voice))

    # Persist the user's message first, then the reply.
    for role, text in (("user", user_msg), ("assistant", answer)):
        memory.remember(session_id, role, text)
    summary.maybe_summarize_async(session_id)  # compact once enough new turns pile up
    return answer


def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud",
                   model_override: str | None = None):
    """Streaming generator version of `respond`.

    Yields ("delta", text) for each chunk of reply text, ("tool", name) for each
    tool that ran, and a final ("done", reply). Same side effects as `respond`:
    both sides of the exchange are stored and summarization is kicked off.

    Two paths:
      - No mouth configured: the mind is streamed directly, tool rounds included.
      - Mouth configured: the mind runs non-streamed via `_mind_loop`, then the
        mouth's re-voicing of the draft is streamed. As in `respond`, an empty
        draft is never sent to the mouth; the `_TANGLED` fallback is used.

    Whenever the final reply is a fallback (draft or `_TANGLED`) rather than the
    streamed text, it is also emitted as a "delta".
    """
    cfg = config.load()
    model = _resolve_model(backend, model_override, cfg)
    logbus.log("info", "chat request (stream)", session=session_id, backend=backend,
               model=model, embed=cfg.embed_backend)

    turn = mind.assemble(session_id, user_msg, backend, model)
    messages = turn.messages
    tool_specs = toolkit.specs(turn.mode.tools) if backend in TOOL_BACKENDS else None
    ctx = {"session_id": session_id, "backend": backend}
    mouth = _mouth_target(cfg, backend, model)

    if mouth is None:
        # No separate voice: stream the mind directly (the original path, unchanged).
        parts: list[str] = []
        for _ in range(MAX_TOOL_ROUNDS):
            assistant_msg = None
            tool_calls = None
            for ev, payload in llm.chat_call_stream(
                messages, backend=backend, model=model, tools=tool_specs
            ):
                if ev == "delta":
                    parts.append(payload)
                    yield ("delta", payload)
                elif ev == "message":
                    assistant_msg = payload
                elif ev == "tool_calls":
                    tool_calls = payload
            if not tool_calls:
                break
            messages.append(assistant_msg)
            for tc in tool_calls:
                result = toolkit.dispatch(tc["name"], tc["arguments"], ctx)
                logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80])
                messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result})
                _maybe_switch_mode(session_id, tc["name"])
                yield ("tool", tc["name"])
        reply = "".join(parts)
        if not reply:
            reply = _TANGLED
            yield ("delta", reply)
    else:
        # Mind decides + runs tools (non-streamed); mouth re-voices, streamed.
        draft, tools_run = _mind_loop(messages, backend, model, tool_specs, ctx, session_id)
        for name in tools_run:
            yield ("tool", name)
        parts = []
        if draft:
            # Only voice a real draft: `respond` skips the mouth on an empty one,
            # and voicing nothing would let the mouth invent a reply.
            try:
                for ev, payload in llm.chat_call_stream(
                    mind.voice_messages(messages, draft), backend=mouth[0], model=mouth[1], tools=None
                ):
                    if ev == "delta":
                        parts.append(payload)
                        yield ("delta", payload)
            except Exception as exc:
                # Best-effort: keep whatever was streamed, else fall back below.
                logbus.log("error", "voice stream failed", error=str(exc)[:160])
        voiced = "".join(parts).strip()
        if voiced:
            reply = voiced
        else:
            # Nothing usable from the mouth (no output, whitespace only, or it
            # was skipped): stream the fallback so the deltas carry the reply.
            reply = draft or _TANGLED
            yield ("delta", reply)

    logbus.log("info", "reply", session=session_id, chars=len(reply), voiced=bool(mouth))
    memory.remember(session_id, "user", user_msg)
    memory.remember(session_id, "assistant", reply)
    summary.maybe_summarize_async(session_id)
    yield ("done", reply)