"""The chat turn loop: persona + tiered memory + recent context -> reply. Context is assembled in tiers (oldest/most-compacted first): 1. persona 2. long-term gist — relevant *summaries* of other sessions 3. sharp details — a few raw cross-session exchanges (so specifics survive) 4. recent raw turns of the current session (full fidelity) 5. the new user message After replying, the session is compacted if enough new turns have accumulated. """ from __future__ import annotations from lyra import clock, config, llm, logbus, memory, persona, self_state, summary from lyra import tools as toolkit from lyra.llm import Backend, Message RECALL_K = 3 # raw cross-session "sharp detail" hits RECENT_N = 10 # raw turns of the current session SUMMARY_K = 3 # other-session gists MAX_TOOL_ROUNDS = 5 # cap tool-call iterations per turn # Backends that support function-calling. The MI50's llama.cpp server only does # tools when launched with --jinja; until it is, keep tools to cloud so MI50 chat # doesn't 500 on the tools param. Add "mi50" here once that flag is set. TOOL_BACKENDS = {"cloud"} def _summary_note(summaries: list[memory.Summary]) -> Message: lines = [f"- ({(s.session_started_at or s.created_at)[:10]}) {s.content}" for s in summaries] body = "Gist of earlier sessions (compacted — ask if you need specifics):\n" + "\n".join(lines) return {"role": "system", "content": body} def _detail_note(exchanges: list[memory.Exchange]) -> Message: lines = [f"- ({ex.created_at[:10]}, {ex.role}) {ex.content}" for ex in exchanges] body = "Specific things you recall from past conversations:\n" + "\n".join(lines) return {"role": "system", "content": body} def _now_note() -> Message: """Current wall-clock time + how long since Brian last said anything. Stated as plain fact — she has no clock otherwise, so without this 'now' and the gap since the last turn are invisible to her. """ line = f"The current date and time is {clock.stamp()}." gap = clock.humanize_gap(memory.last_exchange_at()) line += ( f" It has been {gap} since Brian last spoke with you." if gap else " This is the first thing Brian has ever said to you." ) return {"role": "system", "content": line} def _render(messages: list[Message]) -> str: """Human-readable dump of the exact prompt, for the live-log inspector.""" return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages) def build_messages(session_id: str, user_msg: str) -> list[Message]: """Assemble the full, tiered message list for one turn.""" messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}] # Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes # right after the persona — her sense of self before her model of the world. messages.append({"role": "system", "content": self_state.render_for_context(self_state.load())}) # When she is: current time + the gap since Brian last spoke (she has no clock). messages.append(_now_note()) # Semantic memory: the distilled profile (who Brian is) — answers identity # questions that raw recall can't. Always in context when it exists. profile = memory.get_profile() if profile: messages.append( {"role": "system", "content": "What you know about Brian:\n" + profile} ) # Time-aware memory: the current narrative (recent arc, trends, callbacks). narrative = memory.get_narrative() if narrative: messages.append( {"role": "system", "content": "What's going on with Brian lately:\n" + narrative} ) recent = memory.recent(session_id, n=RECENT_N) recent_ids = {ex.id for ex in recent} # Tier 1: compacted gists of *other* sessions (long-term, general idea). summaries = memory.recall_summaries(user_msg, k=SUMMARY_K, exclude_session=session_id) if summaries: messages.append(_summary_note(summaries)) # Tier 2: a few sharp raw details from other sessions (so specifics survive # compaction). Skip the current session (its raw turns are in `recent`). recalled = [ ex for ex in memory.recall(user_msg, k=RECALL_K) if ex.id not in recent_ids and ex.session_id != session_id ] if recalled: messages.append(_detail_note(recalled)) # Tier 3: current session, full fidelity. for ex in recent: messages.append({"role": ex.role, "content": ex.content}) messages.append({"role": "user", "content": user_msg}) logbus.log( "debug", "context built", recent=len(recent), summaries=len(summaries), details=len(recalled), chars=sum(len(m["content"]) for m in messages), detail=_render(messages), ) return messages def respond(session_id: str, user_msg: str, backend: Backend = "cloud", model_override: str | None = None) -> str: """Produce Lyra's reply to a single user message and persist the exchange. `model_override` (from the UI's cloud-model picker) only applies on the cloud backend; local/mi50 keep their own configured models. """ cfg = config.load() # Live chat uses the stronger chat_model on cloud (bulk consolidation keeps # cloud_model). local/mi50 use their own configured model. model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get( backend, backend ) if model_override and backend == "cloud": model = model_override logbus.log( "info", "chat request", session=session_id, backend=backend, model=model, embed=cfg.embed_backend, ) messages = build_messages(session_id, user_msg) # Tool loop: offer Lyra her tools; if she calls one, run it and feed the # result back so she can continue, until she returns a normal text reply. tool_specs = toolkit.specs() if backend in TOOL_BACKENDS else None ctx = {"session_id": session_id, "backend": backend} reply = "" for _ in range(MAX_TOOL_ROUNDS): assistant_msg, tool_calls = llm.chat_call( messages, backend=backend, model=model, tools=tool_specs ) if not tool_calls: reply = assistant_msg.get("content") or "" break messages.append(assistant_msg) # her tool-call request for tc in tool_calls: result = toolkit.dispatch(tc["name"], tc["arguments"], ctx) logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80]) messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result}) if not reply: reply = "(I got tangled using my tools there — say that again?)" logbus.log("info", "reply", session=session_id, chars=len(reply)) memory.remember(session_id, "user", user_msg) memory.remember(session_id, "assistant", reply) # Compact this session once enough new turns have piled up. summary.maybe_summarize(session_id) return reply def respond_stream(session_id: str, user_msg: str, backend: Backend = "cloud", model_override: str | None = None): """Streaming generator version of `respond`. Yields ("delta", text) as content streams in, and ("tool", name) when a tool runs. Persists the full exchange and yields a final ("done", reply) — matching `respond`'s side effects (memory + compaction) exactly. """ cfg = config.load() model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get( backend, backend ) if model_override and backend == "cloud": model = model_override logbus.log( "info", "chat request (stream)", session=session_id, backend=backend, model=model, embed=cfg.embed_backend, ) messages = build_messages(session_id, user_msg) tool_specs = toolkit.specs() if backend in TOOL_BACKENDS else None ctx = {"session_id": session_id, "backend": backend} parts: list[str] = [] for _ in range(MAX_TOOL_ROUNDS): assistant_msg = None tool_calls = None for ev, payload in llm.chat_call_stream( messages, backend=backend, model=model, tools=tool_specs ): if ev == "delta": parts.append(payload) yield ("delta", payload) elif ev == "message": assistant_msg = payload elif ev == "tool_calls": tool_calls = payload if not tool_calls: break messages.append(assistant_msg) # her tool-call request for tc in tool_calls: result = toolkit.dispatch(tc["name"], tc["arguments"], ctx) logbus.log("info", "tool call", session=session_id, tool=tc["name"], result=result[:80]) messages.append({"role": "tool", "tool_call_id": tc["id"], "content": result}) yield ("tool", tc["name"]) reply = "".join(parts) if not reply: reply = "(I got tangled using my tools there — say that again?)" yield ("delta", reply) logbus.log("info", "reply", session=session_id, chars=len(reply)) memory.remember(session_id, "user", user_msg) memory.remember(session_id, "assistant", reply) summary.maybe_summarize(session_id) yield ("done", reply)