perf: tighten the dynamic prompt — persona split + lean deliberation

The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn. Tightened by RELEVANCE (the control plane decides what each turn needs), not by deletion — fidelity preserved, focus improved (buried instructions were getting ignored), tokens roughly halved.

- persona split: core (identity + voice — always) vs situational sections pulled in only when relevant. mind._persona_block: self-model/origin only on meta turns (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/_POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback.
- lean deliberation: the private 'what do I think' pass now uses a focused context (her interiority + recent turns + the message), not the full persona/profile/narrative/recall dump. It shapes the take, not the voice.

Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 -> 6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean. Real retirement of the long prompt is still the fine-tune (mouth); this is the cheap, high-leverage cut that also improves adherence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 20:48:44 +00:00
parent 8a3c9b2701
commit 51c2d6abb9
3 changed files with 121 additions and 16 deletions
@@ -104,10 +104,40 @@ def _render(messages: list[Message]) -> str:
    return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)


+# Generous triggers for the heavy situational persona sections — err toward INCLUDING
+# them (a false positive is a few spare KB; a false negative risks confabulation or
+# eyeballed poker math). The core (identity + voice) is always present regardless.
+#
+# Both tuples are checked by `_persona_block` with plain substring containment against
+# the lowercased user message, so entries must be lowercase, and short entries also
+# fire inside longer words ("turn" in "return", "pot" in "spot", "call" in "basically").
+# NOTE(review): that matches the include-biased policy above, but it means the poker
+# section rides along on many non-poker turns — confirm this is acceptable.
+_META_HINTS = (
+    "you work", "how do you", "how does your", "your memory", "your dream", "your thought",
+    "do you remember", "are you", "do you feel", "conscious", "sentient", "yourself",
+    "your mind", "who are you", "what are you", "your origin", "how were you", "how did you",
+    "your inner", "your reflect", "your journal",
+)
+_POKER_HINTS = (
+    "poker", "fold", "call", "raise", "river", "turn", "flop", "preflop", "equity", "range",
+    "villain", "stack", "tilt", "hand", "bluff", "pot", "3bet", "gto", "outs", "draw",
+)
+
+
+def _persona_block(user_msg: str, mode: modes.Mode | None, moment: dict | None) -> str:
+    """Assemble the persona system text for one turn.
+
+    The core prompt is always first. The origin/self-model sections are added when the
+    moment kind is "meta" or the lowercased message contains any `_META_HINTS` entry.
+    The poker guardrail section is added when the mode key is "poker_cash" or "study",
+    the moment kind is "strategic", or the message contains any `_POKER_HINTS` entry.
+    Empty pieces (e.g. a section missing from the persona file) are dropped before the
+    blank-line join.
+    """
+    blocks = [persona.core_prompt()]
+    lowered = user_msg.lower()
+    moment_kind = moment.get("kind") if moment else None
+    wants_meta = moment_kind == "meta" or any(hint in lowered for hint in _META_HINTS)
+    if wants_meta:
+        blocks.append(persona.section("What you are"))
+        blocks.append(persona.section("How you actually work"))
+    wants_poker = (
+        (mode and mode.key in ("poker_cash", "study"))
+        or moment_kind == "strategic"
+        or any(hint in lowered for hint in _POKER_HINTS)
+    )
+    if wants_poker:
+        blocks.append(persona.section("What you do NOT do"))
+    return "\n\n".join(block for block in blocks if block)
+
+
 def build_messages(session_id: str, user_msg: str,
                   mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
    """Assemble the full, tiered message list for one turn."""
-    messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}]
+    messages: list[Message] = [{"role": "system", "content": _persona_block(user_msg, mode, moment)}]

    # Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
    # right after the persona — her sense of self before her model of the world.
@@ -207,12 +237,30 @@ _DELIBERATE_SYS = (
 )


-def _deliberate(messages: list[Message], backend: Backend, model: str | None) -> str:
+def _deliberation_context(session_id: str, user_msg: str) -> list[Message]:
+    """Build the slimmed-down message list for the private thinking pass.
+
+    Order: her rendered self-state (system), the inner-life note when there is one,
+    the entries returned by `memory.recent(session_id, n=6)`, the incoming message
+    (user), and finally the deliberation instruction (system). Persona, profile,
+    narrative and recall are intentionally left out — they steer the spoken reply
+    rather than the private take, and omitting them is what makes this extra call cheap.
+
+    NOTE(review): `user_msg` is appended unconditionally after the recent turns; if the
+    current message is already persisted by the time this runs it would appear twice —
+    confirm against the pipeline order.
+    """
+    state_text = self_state.render_for_context(self_state.load())
+    context: list[Message] = [{"role": "system", "content": state_text}]
+    inner_note = _inner_life_note()
+    if inner_note:
+        context.append(inner_note)
+    context.extend(
+        {"role": turn.role, "content": turn.content}
+        for turn in memory.recent(session_id, n=6)
+    )
+    context += [
+        {"role": "user", "content": user_msg},
+        {"role": "system", "content": _DELIBERATE_SYS},
+    ]
+    return context
+
+
+def _deliberate(session_id: str, user_msg: str, backend: Backend, model: str | None) -> str:
    """One private 'what do I actually think' pass before replying. Returns her thinking
    (empty on any failure — chat must never break because deliberation hiccuped)."""
    try:
-        out = llm.complete(messages + [{"role": "system", "content": _DELIBERATE_SYS}],
-                           backend=backend, model=model)
+        out = llm.complete(_deliberation_context(session_id, user_msg), backend=backend, model=model)
        return (out or "").strip()
    except Exception as exc:
        logbus.log("error", "deliberation failed", error=str(exc)[:160])
@@ -232,11 +280,11 @@ def _answer_from(thinking: str) -> Message:


 def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
-                       model: str | None, messages: list[Message]) -> Message | None:
+                       model: str | None) -> Message | None:
    """Run the private thinking pass if warranted; return the answer-from-thinking note."""
    if not config.load().chat_deliberate or not _should_deliberate(user_msg):
        return None
-    thinking = _deliberate(messages, backend, model)
+    thinking = _deliberate(session_id, user_msg, backend, model)
    if not thinking:
        return None
    logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
@@ -299,7 +347,7 @@ def _compose(ctx: TurnContext) -> TurnContext:

 def _deliberate_part(ctx: TurnContext) -> TurnContext:
    """Private 'what do I actually think' pass, appended last so it shapes the reply."""
-    note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model, ctx.messages)
+    note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model)
    if note:
        ctx.messages.append(note)
    return ctx
@@ -1,20 +1,60 @@
 """Persona: Lyra's identity and voice, loaded from an editable markdown prompt.

-The prompt lives in `personas/<name>.md` so it can be tuned without touching
-code. `LYRA_PERSONA` selects which file to load (default: "lyra").
+The prompt lives in `personas/<name>.md` so it can be tuned without touching code.
+`LYRA_PERSONA` selects which file to load (default: "lyra").
+
+The file is split on `## ` headers so the control plane can include only what a turn
+needs: the **core** (identity + voice — the anti-generic essentials) is always sent;
+the heavier situational sections (her origin, the self-model, the poker guardrails)
+are pulled in by `mind` only when relevant. This keeps the per-turn prompt tight
+without losing fidelity. `system_prompt()` still returns the whole thing (fallback).
 """
 from __future__ import annotations

 import os
+import re
 from functools import lru_cache
 from pathlib import Path

 _PERSONA_DIR = Path(__file__).parent / "personas"

+# Sections always sent (besides the intro) — the voice + identity that keep her her.
+_CORE = ("Who you are", "How you talk", "Right now")
+
+
+def _name(name: str | None) -> str:
+    """Resolve the persona name: an explicit non-empty `name` wins, otherwise the
+    `LYRA_PERSONA` environment variable, otherwise "lyra"."""
+    if name:
+        return name
+    return os.getenv("LYRA_PERSONA", "lyra")
+
+
+@lru_cache(maxsize=None)
+def _sections(name: str) -> dict[str, str]:
+    """Split `personas/<name>.md` on level-2 (`## `) headings into {heading: text}.
+
+    Whatever precedes the first heading is stored under the key 'intro'. Each section
+    value keeps its own `## ` heading line. A repeated heading overwrites the earlier
+    entry. The result is cached per name, so callers should treat it as read-only.
+    """
+    raw = (_PERSONA_DIR / f"{name}.md").read_text(encoding="utf-8").strip()
+    preamble, *bodies = re.split(r"(?m)^## ", raw)
+    parsed = {"intro": preamble.strip()}
+    for body in bodies:
+        # The heading is the first line of the chunk (the "## " was consumed by the split).
+        title = body.partition("\n")[0].strip()
+        parsed[title] = f"## {body}".strip()
+    return parsed
+

@lru_cache(maxsize=None)
 def system_prompt(name: str | None = None) -> str:
-    """Return the persona system prompt. Cached; pass a name to override env."""
-    name = name or os.getenv("LYRA_PERSONA", "lyra")
-    path = _PERSONA_DIR / f"{name}.md"
-    return path.read_text(encoding="utf-8").strip()
+    """The full persona (every section). Fallback / back-compat."""
+    return (_PERSONA_DIR / f"{_name(name)}.md").read_text(encoding="utf-8").strip()
+
+
+def core_prompt(name: str | None = None) -> str:
+    """Return the always-sent part of the persona: the preamble followed by each
+    `_CORE` section, blank-line separated, with empty pieces left out."""
+    intro = _sections(_name(name))["intro"]
+    pieces = [intro, *(section(header, name) for header in _CORE)]
+    return "\n\n".join(filter(None, pieces))
+
+
+def section(header_prefix: str, name: str | None = None) -> str:
+    """Return the first persona section whose heading starts with `header_prefix`
+    (case-insensitive, in file order), or '' when no heading matches."""
+    wanted = header_prefix.lower()
+    matches = (
+        text
+        for title, text in _sections(_name(name)).items()
+        if title.lower().startswith(wanted)
+    )
+    return next(matches, "")
@@ -29,15 +29,16 @@ def test_should_deliberate_skips_trivial(lyra):


 def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
-    _, mind = lyra
+    memory, mind = lyra
    calls = []

    def fake_complete(messages, backend=None, model=None):
        calls.append(messages)
        return "I actually think the first move is the smallest end-to-end slice."

+    memory.ensure_session("s1")
    monkeypatch.setattr(mind.llm, "complete", fake_complete)
-    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None, [])
+    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None)
    assert note and note["role"] == "system"
    assert "first move is the smallest" in note["content"]      # her thinking carried in
    assert "numbered list" in note["content"].lower()           # voice enforcement attached
@@ -49,10 +50,26 @@ def test_deliberation_skipped_when_disabled(lyra, monkeypatch):
    monkeypatch.setenv("CHAT_DELIBERATE", "false")
    called = []
    monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
-    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None, []) is None
+    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None) is None
    assert called == []                                          # no LLM call when off


+def test_persona_core_is_tight_situational_is_gated(lyra):
+    """The core prompt omits the self-model; build_messages adds it only on meta turns."""
+    memory, mind = lyra
+    from lyra import persona
+    self_model = "How you actually work"
+    core = persona.core_prompt()
+    full = persona.system_prompt()
+    assert "How you talk" in core                     # voice core is in
+    assert self_model not in core                     # self-model is not
+    assert len(core) < len(full)
+    assert persona.section(self_model)
+
+    def system_text(user_msg):
+        # All system-role content for one turn, flattened for substring checks.
+        built = mind.build_messages("s1", user_msg)
+        return " ".join(m["content"] for m in built if m["role"] == "system")
+
+    memory.ensure_session("s1")
+    casual = system_text("any dinner ideas tonight?")
+    meta = system_text("how does your memory actually work?")
+    assert self_model not in casual                   # situational section omitted on a casual turn
+    assert self_model in meta                         # pulled in for a meta question
+
+
 def test_assemble_runs_the_pipeline(lyra, monkeypatch):
    memory, mind = lyra
    monkeypatch.setenv("CHAT_DELIBERATE", "false")  # keep it offline for the structure test