perf: tighten the dynamic prompt — persona split + lean deliberation

The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn.
Tightened by RELEVANCE (the control plane decides what each turn needs), not by
deletion — fidelity preserved, focus improved (buried instructions were getting
ignored), tokens roughly halved.

- persona split: core (identity + voice — always) vs situational sections pulled
  in only when relevant. mind._persona_block: self-model/origin only on meta turns
  (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/
  _POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback.
- lean deliberation: the private 'what do I think' pass now uses a focused context
  (her interiority + recent turns + the message), not the full persona/profile/
  narrative/recall dump. It shapes the take, not the voice.

Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 ->
6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean.

Real retirement of the long prompt is still the fine-tune (mouth); this is the
cheap, high-leverage cut that also improves adherence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-24 20:48:44 +00:00
parent 8a3c9b2701
commit 51c2d6abb9
3 changed files with 121 additions and 16 deletions
+55 -7
View File
@@ -104,10 +104,40 @@ def _render(messages: list[Message]) -> str:
return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)
# Generous triggers for the heavy situational persona sections — err toward INCLUDING
# them (a false positive is a few spare KB; a false negative risks confabulation or
# eyeballed poker math). The core (identity + voice) is always present regardless.
#
# Matching semantics (see _persona_block): each hint is tested with `h in um` against
# the LOWERCASED user message, i.e. as a plain substring, not a whole word. So "turn"
# also fires on "return", "pot" on "spot", "call" on "basically", and "are you" on
# "how are you". That looseness is the intended bias described above. Entries must be
# lowercase or they can never match.

# Phrases that pull in the self-model/origin sections ("What you are",
# "How you actually work"). Note "who are you" / "what are you" are already covered
# by the shorter "are you"; they are kept for readability.
_META_HINTS = (
"you work", "how do you", "how does your", "your memory", "your dream", "your thought",
"do you remember", "are you", "do you feel", "conscious", "sentient", "yourself",
"your mind", "who are you", "what are you", "your origin", "how were you", "how did you",
"your inner", "your reflect", "your journal",
)
# Words that pull in the poker guardrails section ("What you do NOT do").
# "preflop" is already covered by "flop" under substring matching.
_POKER_HINTS = (
"poker", "fold", "call", "raise", "river", "turn", "flop", "preflop", "equity", "range",
"villain", "stack", "tilt", "hand", "bluff", "pot", "3bet", "gto", "outs", "draw",
)
def _persona_block(user_msg: str, mode: modes.Mode | None, moment: dict | None) -> str:
    """Build the persona system text for one turn.

    The core prompt (``persona.core_prompt()``) is always first. Situational
    sections are added only when the turn looks like it needs them:

    * "What you are" + "How you actually work" when ``moment["kind"]`` is
      ``"meta"`` or the message contains any ``_META_HINTS`` substring.
    * "What you do NOT do" when ``mode.key`` is ``"poker_cash"`` or ``"study"``,
      ``moment["kind"]`` is ``"strategic"``, or the message contains any
      ``_POKER_HINTS`` substring.

    Hint matching is a case-insensitive substring test on ``user_msg``.
    ``mode`` and ``moment`` may be ``None``. Falsy pieces (e.g. a section that
    comes back empty) are dropped; the rest are joined with a blank line.
    """
    pieces = [persona.core_prompt()]
    lowered = user_msg.lower()
    moment_kind = moment.get("kind") if moment else None

    def mentions(hints) -> bool:
        # True when any hint occurs anywhere inside the lowercased message.
        return any(hint in lowered for hint in hints)

    # Self-model / origin: only for meta turns or messages that look self-referential.
    if moment_kind == "meta" or mentions(_META_HINTS):
        pieces.append(persona.section("What you are"))
        pieces.append(persona.section("How you actually work"))

    # Poker guardrails: poker-ish mode, strategic moment, or poker vocabulary.
    in_poker_mode = mode and mode.key in ("poker_cash", "study")
    if in_poker_mode or moment_kind == "strategic" or mentions(_POKER_HINTS):
        pieces.append(persona.section("What you do NOT do"))

    return "\n\n".join(filter(None, pieces))
def build_messages(session_id: str, user_msg: str,
mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
"""Assemble the full, tiered message list for one turn."""
messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}]
messages: list[Message] = [{"role": "system", "content": _persona_block(user_msg, mode, moment)}]
# Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
# right after the persona — her sense of self before her model of the world.
@@ -207,12 +237,30 @@ _DELIBERATE_SYS = (
)
def _deliberate(messages: list[Message], backend: Backend, model: str | None) -> str:
def _deliberation_context(session_id: str, user_msg: str) -> list[Message]:
    """Assemble the slimmed-down message list for the private thinking pass.

    Contents, in order:

    1. a system message with the rendered self-state
       (``self_state.render_for_context(self_state.load())``),
    2. the inner-life note from ``_inner_life_note()``, if it returns one,
    3. the exchanges from ``memory.recent(session_id, n=6)``, each as a
       ``role``/``content`` message,
    4. the current ``user_msg`` as a user message,
    5. ``_DELIBERATE_SYS`` as a trailing system message.

    The full persona, profile, narrative and recall tiers are intentionally
    left out: they shape the final reply rather than the private take, and
    omitting them is what makes this extra model call cheap.
    """
    interiority = self_state.render_for_context(self_state.load())
    context: list[Message] = [{"role": "system", "content": interiority}]

    inner_note = _inner_life_note()
    if inner_note:
        context.append(inner_note)

    # Recent conversation, in the order memory.recent yields it.
    context.extend(
        {"role": turn.role, "content": turn.content}
        for turn in memory.recent(session_id, n=6)
    )

    # The message being answered, then the deliberation instruction last.
    context += [
        {"role": "user", "content": user_msg},
        {"role": "system", "content": _DELIBERATE_SYS},
    ]
    return context
def _deliberate(session_id: str, user_msg: str, backend: Backend, model: str | None) -> str:
"""One private 'what do I actually think' pass before replying. Returns her thinking
(empty on any failure — chat must never break because deliberation hiccuped)."""
try:
out = llm.complete(messages + [{"role": "system", "content": _DELIBERATE_SYS}],
backend=backend, model=model)
out = llm.complete(_deliberation_context(session_id, user_msg), backend=backend, model=model)
return (out or "").strip()
except Exception as exc:
logbus.log("error", "deliberation failed", error=str(exc)[:160])
@@ -232,11 +280,11 @@ def _answer_from(thinking: str) -> Message:
def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
model: str | None, messages: list[Message]) -> Message | None:
model: str | None) -> Message | None:
"""Run the private thinking pass if warranted; return the answer-from-thinking note."""
if not config.load().chat_deliberate or not _should_deliberate(user_msg):
return None
thinking = _deliberate(messages, backend, model)
thinking = _deliberate(session_id, user_msg, backend, model)
if not thinking:
return None
logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
@@ -299,7 +347,7 @@ def _compose(ctx: TurnContext) -> TurnContext:
def _deliberate_part(ctx: TurnContext) -> TurnContext:
"""Private 'what do I actually think' pass, appended last so it shapes the reply."""
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model, ctx.messages)
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model)
if note:
ctx.messages.append(note)
return ctx