perf: tighten the dynamic prompt — persona split + lean deliberation

The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn. Tightened by RELEVANCE (the control plane decides what each turn needs), not by deletion — fidelity preserved, focus improved (buried instructions were getting ignored), tokens roughly halved.

- persona split: core (identity + voice — always) vs situational sections pulled in only when relevant. mind._persona_block: self-model/origin only on meta turns (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/_POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback.
- lean deliberation: the private 'what do I think' pass now uses a focused context (her interiority + recent turns + the message), not the full persona/profile/narrative/recall dump. It shapes the take, not the voice.

Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 -> 6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean.

Real retirement of the long prompt is still the fine-tune (mouth); this is the cheap, high-leverage cut that also improves adherence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 20:48:44 +00:00
parent 8a3c9b2701
commit 51c2d6abb9
3 changed files with 121 additions and 16 deletions
@@ -104,10 +104,40 @@ def _render(messages: list[Message]) -> str:
    return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)
 # Generous triggers for the heavy situational persona sections — err toward INCLUDING
 # them (a false positive is a few spare KB; a false negative risks confabulation or
 # eyeballed poker math). The core (identity + voice) is always present regardless.
 # Phrases that mark a turn as "meta" (a question about herself). _persona_block tests
 # each entry as a plain substring of the lower-cased user message, so entries must be
 # lowercase, and broad ones fire widely (e.g. "are you" also matches "how are you").
 _META_HINTS = (
    "you work", "how do you", "how does your", "your memory", "your dream", "your thought",
    "do you remember", "are you", "do you feel", "conscious", "sentient", "yourself",
    "your mind", "who are you", "what are you", "your origin", "how were you", "how did you",
    "your inner", "your reflect", "your journal",
 )
 # Poker vocabulary, matched the same way (substring of the lower-cased message). Short
 # entries also hit inside longer words — "turn" in "return", "pot" in "spot".
 _POKER_HINTS = (
    "poker", "fold", "call", "raise", "river", "turn", "flop", "preflop", "equity", "range",
    "villain", "stack", "tilt", "hand", "bluff", "pot", "3bet", "gto", "outs", "draw",
 )
 def _persona_block(user_msg: str, mode: modes.Mode | None, moment: dict | None) -> str:
    """Build the persona system text for a single turn.

    The core prompt is always first. The two self-model sections are added when the
    moment is "meta" or the message contains a meta hint; the poker guardrails are
    added for a poker/study mode, a "strategic" moment, or a poker hint in the message.
    Empty pieces are dropped and the rest joined with blank lines.
    """
    blocks = [persona.core_prompt()]
    lowered = user_msg.lower()
    moment_kind = moment.get("kind") if moment else None

    def mentions(hints) -> bool:
        # Plain substring test against the lower-cased message.
        return any(hint in lowered for hint in hints)

    if moment_kind == "meta" or mentions(_META_HINTS):
        blocks.append(persona.section("What you are"))
        blocks.append(persona.section("How you actually work"))

    in_poker_mode = bool(mode) and mode.key in ("poker_cash", "study")
    if in_poker_mode or moment_kind == "strategic" or mentions(_POKER_HINTS):
        blocks.append(persona.section("What you do NOT do"))

    return "\n\n".join(filter(None, blocks))
 def build_messages(session_id: str, user_msg: str,
                   mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
    """Assemble the full, tiered message list for one turn."""
-    messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}]
+    messages: list[Message] = [{"role": "system", "content": _persona_block(user_msg, mode, moment)}]
    # Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
    # right after the persona — her sense of self before her model of the world.
@@ -207,12 +237,30 @@ _DELIBERATE_SYS = (
 )
-def _deliberate(messages: list[Message], backend: Backend, model: str | None) -> str:
+def _deliberation_context(session_id: str, user_msg: str) -> list[Message]:
    """A LEAN context for the private thinking pass — her interiority + recent turns +
    the message. Deliberately omits the full persona, profile, narrative, and recall
    tiers: the thinking doesn't need the voice rules or the world-model dump (those
    shape the final reply, not the private take), and dropping them cuts this whole
    extra call by most of its tokens."""
    # The rendered self-state is the opening system message; no persona text is added here.
    msgs: list[Message] = [
        {"role": "system", "content": self_state.render_for_context(self_state.load())}
    ]
    # Optional inner-life note — skipped when the helper returns nothing.
    inner = _inner_life_note()
    if inner:
        msgs.append(inner)
    # Stored exchanges are replayed with their own roles (presumably the 6 most recent,
    # oldest first — confirm against memory.recent).
    # NOTE(review): if memory.recent already contains the current user message it will
    # appear twice once user_msg is appended below — confirm against the caller.
    for ex in memory.recent(session_id, n=6):
        msgs.append({"role": ex.role, "content": ex.content})
    # The message being answered, then the deliberation instruction as the final entry.
    msgs.append({"role": "user", "content": user_msg})
    msgs.append({"role": "system", "content": _DELIBERATE_SYS})
    return msgs
 def _deliberate(session_id: str, user_msg: str, backend: Backend, model: str | None) -> str:
    """One private 'what do I actually think' pass before replying. Returns her thinking
    (empty on any failure — chat must never break because deliberation hiccuped)."""
    try:
-        out = llm.complete(messages + [{"role": "system", "content": _DELIBERATE_SYS}],
+        out = llm.complete(_deliberation_context(session_id, user_msg), backend=backend, model=model)
                           backend=backend, model=model)
        return (out or "").strip()
    except Exception as exc:
        logbus.log("error", "deliberation failed", error=str(exc)[:160])
@@ -232,11 +280,11 @@ def _answer_from(thinking: str) -> Message:
 def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
-                       model: str | None, messages: list[Message]) -> Message | None:
+                       model: str | None) -> Message | None:
    """Run the private thinking pass if warranted; return the answer-from-thinking note."""
    if not config.load().chat_deliberate or not _should_deliberate(user_msg):
        return None
-    thinking = _deliberate(messages, backend, model)
+    thinking = _deliberate(session_id, user_msg, backend, model)
    if not thinking:
        return None
    logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
@@ -299,7 +347,7 @@ def _compose(ctx: TurnContext) -> TurnContext:
 def _deliberate_part(ctx: TurnContext) -> TurnContext:
    """Private 'what do I actually think' pass, appended last so it shapes the reply."""
-    note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model, ctx.messages)
+    note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model)
    if note:
        ctx.messages.append(note)
    return ctx
@@ -1,20 +1,60 @@
 """Persona: Lyra's identity and voice, loaded from an editable markdown prompt.
-The prompt lives in `personas/<name>.md` so it can be tuned without touching
+The prompt lives in `personas/<name>.md` so it can be tuned without touching code.
-code. `LYRA_PERSONA` selects which file to load (default: "lyra").
+`LYRA_PERSONA` selects which file to load (default: "lyra").
 The file is split on `## ` headers so the control plane can include only what a turn
 needs: the **core** (identity + voice — the anti-generic essentials) is always sent;
 the heavier situational sections (her origin, the self-model, the poker guardrails)
 are pulled in by `mind` only when relevant. This keeps the per-turn prompt tight
 without losing fidelity. `system_prompt()` still returns the whole thing (fallback).
 """
 from __future__ import annotations
 import os
 import re
 from functools import lru_cache
 from pathlib import Path
 # Persona markdown files are read from the `personas/` directory beside this module.
 _PERSONA_DIR = Path(__file__).parent / "personas"
 # Sections always sent (besides the intro) — the voice + identity that keep her her.
 # core_prompt() resolves each entry through section(), i.e. as a case-insensitive header prefix.
 _CORE = ("Who you are", "How you talk", "Right now")
 def _name(name: str | None) -> str:
    return name or os.getenv("LYRA_PERSONA", "lyra")
@lru_cache(maxsize=None)
 def _sections(name: str) -> dict[str, str]:
    """Parse the persona file into {header: text}; the pre-header preamble is 'intro'."""
    text = (_PERSONA_DIR / f"{name}.md").read_text(encoding="utf-8").strip()
    chunks = re.split(r"(?m)^## ", text)
    out = {"intro": chunks[0].strip()}
    for ch in chunks[1:]:
        header = ch.split("\n", 1)[0].strip()
        out[header] = ("## " + ch).strip()
    return out
@lru_cache(maxsize=None)
 def system_prompt(name: str | None = None) -> str:
-    """Return the persona system prompt. Cached; pass a name to override env."""
+    """The full persona (every section). Fallback / back-compat."""
-    name = name or os.getenv("LYRA_PERSONA", "lyra")
+    return (_PERSONA_DIR / f"{_name(name)}.md").read_text(encoding="utf-8").strip()
-    path = _PERSONA_DIR / f"{name}.md"
+
-    return path.read_text(encoding="utf-8").strip()
+
 def core_prompt(name: str | None = None) -> str:
    """Return the always-sent persona text: the preamble, then each `_CORE` section.

    Pieces that come back empty (e.g. a core header missing from the file) are left
    out; the remaining ones are joined with blank lines.
    """
    pieces = [_sections(_name(name))["intro"]]
    pieces.extend(section(header, name) for header in _CORE)
    return "\n\n".join(filter(None, pieces))
 def section(header_prefix: str, name: str | None = None) -> str:
    """Return the first persona section whose header starts with ``header_prefix``.

    Matching is case-insensitive (e.g. 'How you actually work'). Only real ``## ``
    sections are searched: the pre-header preamble is never returned, and an empty
    prefix matches nothing rather than everything. Returns '' when no section matches.
    """
    pref = header_prefix.lower()
    if not pref:
        # str.startswith("") is always True, which would hand back the first entry.
        return ""
    for header, body in _sections(_name(name)).items():
        # Real sections are stored with their "## " header line; the preamble is not.
        if not body.startswith("## "):
            continue
        if header.lower().startswith(pref):
            return body
    return ""
@@ -29,15 +29,16 @@ def test_should_deliberate_skips_trivial(lyra):
 def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
-    _, mind = lyra
+    memory, mind = lyra
    calls = []
    def fake_complete(messages, backend=None, model=None):
        calls.append(messages)
        return "I actually think the first move is the smallest end-to-end slice."
    memory.ensure_session("s1")
    monkeypatch.setattr(mind.llm, "complete", fake_complete)
-    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None, [])
+    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None)
    assert note and note["role"] == "system"
    assert "first move is the smallest" in note["content"]      # her thinking carried in
    assert "numbered list" in note["content"].lower()           # voice enforcement attached
@@ -49,10 +50,26 @@ def test_deliberation_skipped_when_disabled(lyra, monkeypatch):
    monkeypatch.setenv("CHAT_DELIBERATE", "false")
    called = []
    monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
-    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None, []) is None
+    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None) is None
    assert called == []                                          # no LLM call when off
 def test_persona_core_is_tight_situational_is_gated(lyra):
    """The core prompt carries the voice but not the self-model, and build_messages
    adds the self-model section only for a meta question."""
    memory, mind = lyra
    from lyra import persona

    marker = "How you actually work"

    def system_text(user_msg):
        # All system-role content of one assembled turn, space-joined.
        turn = mind.build_messages("s1", user_msg)
        return " ".join(msg["content"] for msg in turn if msg["role"] == "system")

    core = persona.core_prompt()
    full = persona.system_prompt()
    assert "How you talk" in core and marker not in core  # voice core, self-model not
    assert len(core) < len(full) and persona.section(marker)

    memory.ensure_session("s1")
    casual = system_text("any dinner ideas tonight?")
    meta = system_text("how does your memory actually work?")
    assert marker not in casual      # situational section omitted on a casual turn
    assert marker in meta            # pulled in for a meta question
 def test_assemble_runs_the_pipeline(lyra, monkeypatch):
    memory, mind = lyra
    monkeypatch.setenv("CHAT_DELIBERATE", "false")  # keep it offline for the structure test