perf: tighten the dynamic prompt — persona split + lean deliberation

The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn.
Tightened by RELEVANCE (the control plane decides what each turn needs), not by
deletion — fidelity preserved, focus improved (buried instructions were getting
ignored), tokens roughly halved.

- persona split: core (identity + voice — always) vs situational sections pulled
  in only when relevant. mind._persona_block: self-model/origin only on meta turns
  (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/
  _POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback.
- lean deliberation: the private 'what do I think' pass now uses a focused context
  (her interiority + recent turns + the message), not the full persona/profile/
  narrative/recall dump. It shapes the take, not the voice.

Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 ->
6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean.

Real retirement of the long prompt is still the fine-tune (mouth); this is the
cheap, high-leverage cut that also improves adherence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-24 20:48:44 +00:00
parent 8a3c9b2701
commit 51c2d6abb9
3 changed files with 121 additions and 16 deletions
+55 -7
View File
@@ -104,10 +104,40 @@ def _render(messages: list[Message]) -> str:
return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)
# Trigger phrases for the heavier, situational persona sections. The lists are
# deliberately broad — err toward INCLUDING a section: a false positive only costs a
# few spare KB, while a false negative risks confabulation or eyeballed poker math.
# The core persona (identity + voice) is always present regardless of these.

# Phrases suggesting the user is asking about her own nature or inner workings.
_META_HINTS = (
    "you work",
    "how do you",
    "how does your",
    "your memory",
    "your dream",
    "your thought",
    "do you remember",
    "are you",
    "do you feel",
    "conscious",
    "sentient",
    "yourself",
    "your mind",
    "who are you",
    "what are you",
    "your origin",
    "how were you",
    "how did you",
    "your inner",
    "your reflect",
    "your journal",
)

# Poker vocabulary; any of these pulls the poker guardrails into the prompt.
_POKER_HINTS = (
    "poker",
    "fold",
    "call",
    "raise",
    "river",
    "turn",
    "flop",
    "preflop",
    "equity",
    "range",
    "villain",
    "stack",
    "tilt",
    "hand",
    "bluff",
    "pot",
    "3bet",
    "gto",
    "outs",
    "draw",
)
def _persona_block(user_msg: str, mode: modes.Mode | None, moment: dict | None) -> str:
    """Build the persona text for one turn: the core plus whatever the turn calls for.

    The core persona is always included. The "What you are" and "How you actually
    work" sections are added when the moment's kind is ``"meta"`` or the lower-cased
    message contains any ``_META_HINTS`` phrase. The "What you do NOT do" section is
    added when the mode key is ``"poker_cash"`` or ``"study"``, the moment's kind is
    ``"strategic"``, or the message contains any ``_POKER_HINTS`` phrase. Sections
    that come back empty are dropped; the rest are joined with blank lines.
    """
    blocks = [persona.core_prompt()]
    text = user_msg.lower()
    moment_kind = moment.get("kind") if moment else None

    def mentions(hints) -> bool:
        # Plain substring test against the lower-cased message.
        return any(hint in text for hint in hints)

    if moment_kind == "meta" or mentions(_META_HINTS):
        blocks.append(persona.section("What you are"))
        blocks.append(persona.section("How you actually work"))

    in_poker_mode = bool(mode) and mode.key in ("poker_cash", "study")
    if in_poker_mode or moment_kind == "strategic" or mentions(_POKER_HINTS):
        blocks.append(persona.section("What you do NOT do"))

    return "\n\n".join(block for block in blocks if block)
def build_messages(session_id: str, user_msg: str,
mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
"""Assemble the full, tiered message list for one turn."""
messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}]
messages: list[Message] = [{"role": "system", "content": _persona_block(user_msg, mode, moment)}]
# Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
# right after the persona — her sense of self before her model of the world.
@@ -207,12 +237,30 @@ _DELIBERATE_SYS = (
)
def _deliberate(messages: list[Message], backend: Backend, model: str | None) -> str:
def _deliberation_context(session_id: str, user_msg: str) -> list[Message]:
    """Assemble the slimmed-down message list for the private thinking pass.

    The list holds, in order: a system message rendered from her current self-state,
    the inner-life note when there is one, the six most recent exchanges of the
    session, the incoming user message, and finally the deliberation instructions.
    The full persona, profile, narrative and recall tiers are left out on purpose:
    they shape the final reply rather than the private take, and omitting them
    removes most of the tokens from this extra call.
    """
    context: list[Message] = [
        {"role": "system", "content": self_state.render_for_context(self_state.load())}
    ]

    inner_note = _inner_life_note()
    if inner_note:
        context.append(inner_note)

    context.extend(
        {"role": turn.role, "content": turn.content}
        for turn in memory.recent(session_id, n=6)
    )

    # The message being answered comes after the history, and the thinking
    # instructions go last.
    context += [
        {"role": "user", "content": user_msg},
        {"role": "system", "content": _DELIBERATE_SYS},
    ]
    return context
def _deliberate(session_id: str, user_msg: str, backend: Backend, model: str | None) -> str:
"""One private 'what do I actually think' pass before replying. Returns her thinking
(empty on any failure — chat must never break because deliberation hiccuped)."""
try:
out = llm.complete(messages + [{"role": "system", "content": _DELIBERATE_SYS}],
backend=backend, model=model)
out = llm.complete(_deliberation_context(session_id, user_msg), backend=backend, model=model)
return (out or "").strip()
except Exception as exc:
logbus.log("error", "deliberation failed", error=str(exc)[:160])
@@ -232,11 +280,11 @@ def _answer_from(thinking: str) -> Message:
def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
model: str | None, messages: list[Message]) -> Message | None:
model: str | None) -> Message | None:
"""Run the private thinking pass if warranted; return the answer-from-thinking note."""
if not config.load().chat_deliberate or not _should_deliberate(user_msg):
return None
thinking = _deliberate(messages, backend, model)
thinking = _deliberate(session_id, user_msg, backend, model)
if not thinking:
return None
logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
@@ -299,7 +347,7 @@ def _compose(ctx: TurnContext) -> TurnContext:
def _deliberate_part(ctx: TurnContext) -> TurnContext:
"""Private 'what do I actually think' pass, appended last so it shapes the reply."""
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model, ctx.messages)
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model)
if note:
ctx.messages.append(note)
return ctx
+46 -6
View File
@@ -1,20 +1,60 @@
"""Persona: Lyra's identity and voice, loaded from an editable markdown prompt.
The prompt lives in `personas/<name>.md` so it can be tuned without touching
code. `LYRA_PERSONA` selects which file to load (default: "lyra").
The prompt lives in `personas/<name>.md` so it can be tuned without touching code.
`LYRA_PERSONA` selects which file to load (default: "lyra").
The file is split on `## ` headers so the control plane can include only what a turn
needs: the **core** (identity + voice — the anti-generic essentials) is always sent;
the heavier situational sections (her origin, the self-model, the poker guardrails)
are pulled in by `mind` only when relevant. This keeps the per-turn prompt tight
without losing fidelity. `system_prompt()` still returns the whole thing (fallback).
"""
from __future__ import annotations
import os
import re
from functools import lru_cache
from pathlib import Path
_PERSONA_DIR = Path(__file__).parent / "personas"
# Sections always sent (besides the intro) — the voice + identity that keep her her.
_CORE = ("Who you are", "How you talk", "Right now")
def _name(name: str | None) -> str:
    """Resolve which persona file to load.

    An explicit, non-empty ``name`` wins. Otherwise the ``LYRA_PERSONA`` environment
    variable is used, and ``"lyra"`` is the final fallback. An empty ``LYRA_PERSONA``
    is treated the same as an unset one, so the result is never the empty string
    (which would resolve to the nameless file ``.md``).
    """
    # `os.getenv(key, default)` only applies the default when the variable is
    # missing, not when it is set to "", hence the explicit second `or`.
    return name or os.getenv("LYRA_PERSONA") or "lyra"
@lru_cache(maxsize=None)
def _sections(name: str) -> dict[str, str]:
    """Split the persona file ``<name>.md`` into ``{header: text}``.

    The file is cut at every line that starts with ``## ``. Whatever precedes the
    first such line is stored under the key ``'intro'``; every other entry is keyed
    by its header line (stripped) and holds the section text including its ``## ``
    header. The result is cached per persona name.
    """
    raw = (_PERSONA_DIR / f"{name}.md").read_text(encoding="utf-8").strip()
    preamble, *rest = re.split(r"(?m)^## ", raw)
    parsed = {"intro": preamble.strip()}
    for chunk in rest:
        # The split consumed the "## " marker, so put it back on the stored text.
        title = chunk.partition("\n")[0].strip()
        parsed[title] = f"## {chunk}".strip()
    return parsed
@lru_cache(maxsize=None)
def system_prompt(name: str | None = None) -> str:
"""Return the persona system prompt. Cached; pass a name to override env."""
name = name or os.getenv("LYRA_PERSONA", "lyra")
path = _PERSONA_DIR / f"{name}.md"
return path.read_text(encoding="utf-8").strip()
"""The full persona (every section). Fallback / back-compat."""
return (_PERSONA_DIR / f"{_name(name)}.md").read_text(encoding="utf-8").strip()
def core_prompt(name: str | None = None) -> str:
    """Return the always-on part of the persona: the intro plus each ``_CORE`` section.

    Pieces that are empty (for example a core section missing from the file) are
    skipped; the remainder is joined with blank lines.
    """
    pieces = [_sections(_name(name))["intro"]]
    pieces.extend(section(header, name) for header in _CORE)
    return "\n\n".join(filter(None, pieces))
def section(header_prefix: str, name: str | None = None) -> str:
    """Return one persona section, looked up by case-insensitive header prefix.

    ``header_prefix`` is the start of a section header (e.g. 'How you actually
    work'); ``name`` optionally selects the persona file. The first section whose
    header starts with the prefix is returned, including its ``## `` header line.
    Returns ``''`` when nothing matches or when the prefix is blank.
    """
    wanted = header_prefix.lower()
    if not wanted:
        # Every string starts with "", so without this guard a blank prefix would
        # "match" the first entry (the intro preamble) instead of reporting absence.
        return ""
    return next(
        (
            body
            for header, body in _sections(_name(name)).items()
            if header.lower().startswith(wanted)
        ),
        "",
    )
+20 -3
View File
@@ -29,15 +29,16 @@ def test_should_deliberate_skips_trivial(lyra):
def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
_, mind = lyra
memory, mind = lyra
calls = []
def fake_complete(messages, backend=None, model=None):
calls.append(messages)
return "I actually think the first move is the smallest end-to-end slice."
memory.ensure_session("s1")
monkeypatch.setattr(mind.llm, "complete", fake_complete)
note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None, [])
note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None)
assert note and note["role"] == "system"
assert "first move is the smallest" in note["content"] # her thinking carried in
assert "numbered list" in note["content"].lower() # voice enforcement attached
@@ -49,10 +50,26 @@ def test_deliberation_skipped_when_disabled(lyra, monkeypatch):
monkeypatch.setenv("CHAT_DELIBERATE", "false")
called = []
monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None, []) is None
assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None) is None
assert called == [] # no LLM call when off
def test_persona_core_is_tight_situational_is_gated(lyra):
    """The core persona stays lean; the self-model section only shows up on meta turns."""
    memory, mind = lyra
    from lyra import persona

    gated = "How you actually work"

    # The core carries the voice section but not the self-model, is shorter than the
    # full prompt, and the gated section is still retrievable on demand.
    core = persona.core_prompt()
    full = persona.system_prompt()
    assert "How you talk" in core
    assert gated not in core
    assert len(core) < len(full)
    assert persona.section(gated)

    def system_text(user_msg):
        # All system-role content of the assembled turn, flattened into one string.
        turn = mind.build_messages("s1", user_msg)
        return " ".join(m["content"] for m in turn if m["role"] == "system")

    memory.ensure_session("s1")
    casual = system_text("any dinner ideas tonight?")
    meta = system_text("how does your memory actually work?")
    assert gated not in casual  # situational section omitted on a casual turn
    assert gated in meta  # pulled in for a meta question
def test_assemble_runs_the_pipeline(lyra, monkeypatch):
memory, mind = lyra
monkeypatch.setenv("CHAT_DELIBERATE", "false") # keep it offline for the structure test