perf: tighten the dynamic prompt — persona split + lean deliberation
The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn. Tightened by RELEVANCE (the control plane decides what each turn needs), not by deletion — fidelity preserved, focus improved (buried instructions were getting ignored), tokens roughly halved.

- persona split: core (identity + voice — always) vs situational sections pulled in only when relevant. mind._persona_block: self-model/origin only on meta turns (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/_POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback.
- lean deliberation: the private 'what do I think' pass now uses a focused context (her interiority + recent turns + the message), not the full persona/profile/narrative/recall dump. It shapes the take, not the voice.

Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 -> 6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean.

Real retirement of the long prompt is still the fine-tune (mouth); this is the cheap, high-leverage cut that also improves adherence.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+55
-7
@@ -104,10 +104,40 @@ def _render(messages: list[Message]) -> str:
|
|||||||
return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)
|
return "\n\n".join(f"[{m['role']}]\n{m['content']}" for m in messages)
|
||||||
|
|
||||||
|
|
||||||
|
# Generous triggers for the heavy situational persona sections — err toward INCLUDING
|
||||||
|
# them (a false positive is a few spare KB; a false negative risks confabulation or
|
||||||
|
# eyeballed poker math). The core (identity + voice) is always present regardless.
|
||||||
|
_META_HINTS = (
|
||||||
|
"you work", "how do you", "how does your", "your memory", "your dream", "your thought",
|
||||||
|
"do you remember", "are you", "do you feel", "conscious", "sentient", "yourself",
|
||||||
|
"your mind", "who are you", "what are you", "your origin", "how were you", "how did you",
|
||||||
|
"your inner", "your reflect", "your journal",
|
||||||
|
)
|
||||||
|
_POKER_HINTS = (
|
||||||
|
"poker", "fold", "call", "raise", "river", "turn", "flop", "preflop", "equity", "range",
|
||||||
|
"villain", "stack", "tilt", "hand", "bluff", "pot", "3bet", "gto", "outs", "draw",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _persona_block(user_msg: str, mode: modes.Mode | None, moment: dict | None) -> str:
    """Build the persona system text for a single turn.

    The core prompt is always included. The "What you are" and "How you
    actually work" sections are added when the moment's kind is "meta" or the
    lower-cased message contains any `_META_HINTS` phrase. The "What you do NOT
    do" section is added when the mode key is "poker_cash" or "study", the
    moment's kind is "strategic", or the message contains any `_POKER_HINTS`
    substring. Empty pieces are dropped and the rest joined with blank lines.
    """
    blocks = [persona.core_prompt()]
    lowered = user_msg.lower()
    moment_kind = moment.get("kind") if moment else None

    # Self-model / origin: only when the turn is about her own workings.
    if moment_kind == "meta" or any(hint in lowered for hint in _META_HINTS):
        blocks.append(persona.section("What you are"))
        blocks.append(persona.section("How you actually work"))

    # Poker guardrails: mode, moment kind, or a keyword hit each suffice.
    in_poker_mode = mode and mode.key in ("poker_cash", "study")
    if in_poker_mode or moment_kind == "strategic" or any(
        hint in lowered for hint in _POKER_HINTS
    ):
        blocks.append(persona.section("What you do NOT do"))

    return "\n\n".join(block for block in blocks if block)
|
||||||
|
|
||||||
|
|
||||||
def build_messages(session_id: str, user_msg: str,
|
def build_messages(session_id: str, user_msg: str,
|
||||||
mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
|
mode: modes.Mode | None = None, moment: dict | None = None) -> list[Message]:
|
||||||
"""Assemble the full, tiered message list for one turn."""
|
"""Assemble the full, tiered message list for one turn."""
|
||||||
messages: list[Message] = [{"role": "system", "content": persona.system_prompt()}]
|
messages: list[Message] = [{"role": "system", "content": _persona_block(user_msg, mode, moment)}]
|
||||||
|
|
||||||
# Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
|
# Autonomy Core: Lyra's own evolving interiority (mood, self-narrative). Comes
|
||||||
# right after the persona — her sense of self before her model of the world.
|
# right after the persona — her sense of self before her model of the world.
|
||||||
@@ -207,12 +237,30 @@ _DELIBERATE_SYS = (
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _deliberate(messages: list[Message], backend: Backend, model: str | None) -> str:
|
def _deliberation_context(session_id: str, user_msg: str) -> list[Message]:
    """Assemble the slim message list used by the private thinking pass.

    Order: the rendered self-state as a system message, the inner-life note
    when there is one, the entries returned by `memory.recent(session_id, n=6)`,
    the incoming user message, and finally the `_DELIBERATE_SYS` instruction.
    The full persona, profile, narrative and recall tiers are intentionally
    left out: they shape the spoken reply rather than the private take, and
    omitting them keeps this extra call small.
    """
    state_text = self_state.render_for_context(self_state.load())
    context: list[Message] = [{"role": "system", "content": state_text}]

    note = _inner_life_note()
    if note:
        context.append(note)

    context.extend(
        {"role": turn.role, "content": turn.content}
        for turn in memory.recent(session_id, n=6)
    )
    context += [
        {"role": "user", "content": user_msg},
        {"role": "system", "content": _DELIBERATE_SYS},
    ]
    return context
|
||||||
|
|
||||||
|
|
||||||
|
def _deliberate(session_id: str, user_msg: str, backend: Backend, model: str | None) -> str:
|
||||||
"""One private 'what do I actually think' pass before replying. Returns her thinking
|
"""One private 'what do I actually think' pass before replying. Returns her thinking
|
||||||
(empty on any failure — chat must never break because deliberation hiccuped)."""
|
(empty on any failure — chat must never break because deliberation hiccuped)."""
|
||||||
try:
|
try:
|
||||||
out = llm.complete(messages + [{"role": "system", "content": _DELIBERATE_SYS}],
|
out = llm.complete(_deliberation_context(session_id, user_msg), backend=backend, model=model)
|
||||||
backend=backend, model=model)
|
|
||||||
return (out or "").strip()
|
return (out or "").strip()
|
||||||
except Exception as exc:
|
except Exception as exc:
|
||||||
logbus.log("error", "deliberation failed", error=str(exc)[:160])
|
logbus.log("error", "deliberation failed", error=str(exc)[:160])
|
||||||
@@ -232,11 +280,11 @@ def _answer_from(thinking: str) -> Message:
|
|||||||
|
|
||||||
|
|
||||||
def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
|
def _deliberation_note(session_id: str, user_msg: str, backend: Backend,
|
||||||
model: str | None, messages: list[Message]) -> Message | None:
|
model: str | None) -> Message | None:
|
||||||
"""Run the private thinking pass if warranted; return the answer-from-thinking note."""
|
"""Run the private thinking pass if warranted; return the answer-from-thinking note."""
|
||||||
if not config.load().chat_deliberate or not _should_deliberate(user_msg):
|
if not config.load().chat_deliberate or not _should_deliberate(user_msg):
|
||||||
return None
|
return None
|
||||||
thinking = _deliberate(messages, backend, model)
|
thinking = _deliberate(session_id, user_msg, backend, model)
|
||||||
if not thinking:
|
if not thinking:
|
||||||
return None
|
return None
|
||||||
logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
|
logbus.log("info", "deliberated", session=session_id, chars=len(thinking), detail=thinking)
|
||||||
@@ -299,7 +347,7 @@ def _compose(ctx: TurnContext) -> TurnContext:
|
|||||||
|
|
||||||
def _deliberate_part(ctx: TurnContext) -> TurnContext:
|
def _deliberate_part(ctx: TurnContext) -> TurnContext:
|
||||||
"""Private 'what do I actually think' pass, appended last so it shapes the reply."""
|
"""Private 'what do I actually think' pass, appended last so it shapes the reply."""
|
||||||
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model, ctx.messages)
|
note = _deliberation_note(ctx.session_id, ctx.user_msg, ctx.backend, ctx.model)
|
||||||
if note:
|
if note:
|
||||||
ctx.messages.append(note)
|
ctx.messages.append(note)
|
||||||
return ctx
|
return ctx
|
||||||
|
|||||||
+46
-6
@@ -1,20 +1,60 @@
|
|||||||
"""Persona: Lyra's identity and voice, loaded from an editable markdown prompt.
|
"""Persona: Lyra's identity and voice, loaded from an editable markdown prompt.
|
||||||
|
|
||||||
The prompt lives in `personas/<name>.md` so it can be tuned without touching
|
The prompt lives in `personas/<name>.md` so it can be tuned without touching code.
|
||||||
code. `LYRA_PERSONA` selects which file to load (default: "lyra").
|
`LYRA_PERSONA` selects which file to load (default: "lyra").
|
||||||
|
|
||||||
|
The file is split on `## ` headers so the control plane can include only what a turn
|
||||||
|
needs: the **core** (identity + voice — the anti-generic essentials) is always sent;
|
||||||
|
the heavier situational sections (her origin, the self-model, the poker guardrails)
|
||||||
|
are pulled in by `mind` only when relevant. This keeps the per-turn prompt tight
|
||||||
|
without losing fidelity. `system_prompt()` still returns the whole thing (fallback).
|
||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
import os
|
import os
|
||||||
|
import re
|
||||||
from functools import lru_cache
|
from functools import lru_cache
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
_PERSONA_DIR = Path(__file__).parent / "personas"
|
_PERSONA_DIR = Path(__file__).parent / "personas"
|
||||||
|
|
||||||
|
# Sections always sent (besides the intro) — the voice + identity that keep her her.
|
||||||
|
_CORE = ("Who you are", "How you talk", "Right now")
|
||||||
|
|
||||||
|
|
||||||
|
def _name(name: str | None) -> str:
    """Resolve which persona file to load.

    Precedence: the explicit *name*, then the ``LYRA_PERSONA`` environment
    variable, then the default ``"lyra"``.

    A ``LYRA_PERSONA`` that is set but blank (e.g. ``LYRA_PERSONA=`` in an env
    file) is treated as unset; otherwise the lookup would resolve to an empty
    name and try to read ``personas/.md``.
    """
    return name or os.getenv("LYRA_PERSONA", "").strip() or "lyra"
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=None)
def _sections(name: str) -> dict[str, str]:
    """Split persona file *name* into a header -> text mapping (cached per name).

    Text before the first ``## `` heading is stored under the key ``"intro"``.
    Every later entry is keyed by its heading line and keeps the ``## `` marker
    at the start of its text.
    """
    raw = (_PERSONA_DIR / f"{name}.md").read_text(encoding="utf-8").strip()
    preamble, *bodies = re.split(r"(?m)^## ", raw)
    parsed = {"intro": preamble.strip()}
    parsed.update(
        (body.partition("\n")[0].strip(), f"## {body}".strip())
        for body in bodies
    )
    return parsed
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=None)
|
@lru_cache(maxsize=None)
|
||||||
def system_prompt(name: str | None = None) -> str:
|
def system_prompt(name: str | None = None) -> str:
|
||||||
"""Return the persona system prompt. Cached; pass a name to override env."""
|
"""The full persona (every section). Fallback / back-compat."""
|
||||||
name = name or os.getenv("LYRA_PERSONA", "lyra")
|
return (_PERSONA_DIR / f"{_name(name)}.md").read_text(encoding="utf-8").strip()
|
||||||
path = _PERSONA_DIR / f"{name}.md"
|
|
||||||
return path.read_text(encoding="utf-8").strip()
|
|
||||||
|
def core_prompt(name: str | None = None) -> str:
    """Return the always-sent persona text: the preamble plus each `_CORE` section.

    Empty or missing pieces are skipped; the rest are joined with blank lines.
    """
    pieces = [_sections(_name(name))["intro"]]
    pieces.extend(section(heading, name) for heading in _CORE)
    return "\n\n".join(filter(None, pieces))
|
||||||
|
|
||||||
|
|
||||||
|
def section(header_prefix: str, name: str | None = None) -> str:
    """Look up one persona section by a case-insensitive header prefix.

    Returns the text of the first parsed entry whose key starts with
    *header_prefix* (e.g. 'How you actually work'), or '' when nothing matches.
    """
    wanted = header_prefix.lower()
    parsed = _sections(_name(name))
    return next(
        (text for heading, text in parsed.items() if heading.lower().startswith(wanted)),
        "",
    )
|
||||||
|
|||||||
+20
-3
@@ -29,15 +29,16 @@ def test_should_deliberate_skips_trivial(lyra):
|
|||||||
|
|
||||||
|
|
||||||
def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
|
def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
|
||||||
_, mind = lyra
|
memory, mind = lyra
|
||||||
calls = []
|
calls = []
|
||||||
|
|
||||||
def fake_complete(messages, backend=None, model=None):
|
def fake_complete(messages, backend=None, model=None):
|
||||||
calls.append(messages)
|
calls.append(messages)
|
||||||
return "I actually think the first move is the smallest end-to-end slice."
|
return "I actually think the first move is the smallest end-to-end slice."
|
||||||
|
|
||||||
|
memory.ensure_session("s1")
|
||||||
monkeypatch.setattr(mind.llm, "complete", fake_complete)
|
monkeypatch.setattr(mind.llm, "complete", fake_complete)
|
||||||
note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None, [])
|
note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None)
|
||||||
assert note and note["role"] == "system"
|
assert note and note["role"] == "system"
|
||||||
assert "first move is the smallest" in note["content"] # her thinking carried in
|
assert "first move is the smallest" in note["content"] # her thinking carried in
|
||||||
assert "numbered list" in note["content"].lower() # voice enforcement attached
|
assert "numbered list" in note["content"].lower() # voice enforcement attached
|
||||||
@@ -49,10 +50,26 @@ def test_deliberation_skipped_when_disabled(lyra, monkeypatch):
|
|||||||
monkeypatch.setenv("CHAT_DELIBERATE", "false")
|
monkeypatch.setenv("CHAT_DELIBERATE", "false")
|
||||||
called = []
|
called = []
|
||||||
monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
|
monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
|
||||||
assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None, []) is None
|
assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None) is None
|
||||||
assert called == [] # no LLM call when off
|
assert called == [] # no LLM call when off
|
||||||
|
|
||||||
|
|
||||||
|
def test_persona_core_is_tight_situational_is_gated(lyra):
    """Core prompt omits the self-model; build_messages adds it only on a meta turn."""
    memory, mind = lyra
    from lyra import persona

    def system_text(user_msg):
        # All system-role content for one turn, flattened for substring checks.
        return " ".join(
            msg["content"]
            for msg in mind.build_messages("s1", user_msg)
            if msg["role"] == "system"
        )

    core = persona.core_prompt()
    full = persona.system_prompt()
    assert "How you talk" in core  # voice core is present
    assert "How you actually work" not in core  # self-model is not
    assert len(core) < len(full)
    assert persona.section("How you actually work")

    memory.ensure_session("s1")
    casual = system_text("any dinner ideas tonight?")
    meta = system_text("how does your memory actually work?")
    assert "How you actually work" not in casual  # situational section omitted on a casual turn
    assert "How you actually work" in meta  # pulled in for a meta question
|
||||||
|
|
||||||
|
|
||||||
def test_assemble_runs_the_pipeline(lyra, monkeypatch):
|
def test_assemble_runs_the_pipeline(lyra, monkeypatch):
|
||||||
memory, mind = lyra
|
memory, mind = lyra
|
||||||
monkeypatch.setenv("CHAT_DELIBERATE", "false") # keep it offline for the structure test
|
monkeypatch.setenv("CHAT_DELIBERATE", "false") # keep it offline for the structure test
|
||||||
|
|||||||
Reference in New Issue
Block a user