perf: tighten the dynamic prompt — persona split + lean deliberation

The per-turn prompt was ~5.5K tokens (persona alone ~40%), sent up to 3x/turn. Tightened by RELEVANCE (the control plane decides what each turn needs), not by deletion — fidelity preserved, focus improved (buried instructions were getting ignored), tokens roughly halved. - persona split: core (identity + voice — always) vs situational sections pulled in only when relevant. mind._persona_block: self-model/origin only on meta turns (generous _META_HINTS), poker guardrails only in poker context (mode/strategic/_POKER_HINTS). persona.core_prompt()/section(); system_prompt() kept as fallback. - lean deliberation: the private 'what do I think' pass now uses a focused context (her interiority + recent turns + the message), not the full persona/profile/narrative/recall dump. It shapes the take, not the voice. Measured: casual Talk turn 21,949 -> 15,974 chars (-27%); deliberation 21,949 -> 6,026 (-72%); meta turns still include the self-model. Suite 98 green, ruff clean. Real retirement of the long prompt is still the fine-tune (mouth); this is the cheap, high-leverage cut that also improves adherence. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 20:48:44 +00:00
parent 8a3c9b2701
commit 51c2d6abb9
3 changed files with 121 additions and 16 deletions
@@ -29,15 +29,16 @@ def test_should_deliberate_skips_trivial(lyra):


 def test_deliberation_note_runs_and_appends(lyra, monkeypatch):
-    _, mind = lyra
+    memory, mind = lyra
    calls = []

    def fake_complete(messages, backend=None, model=None):
        calls.append(messages)
        return "I actually think the first move is the smallest end-to-end slice."

+    memory.ensure_session("s1")
    monkeypatch.setattr(mind.llm, "complete", fake_complete)
-    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None, [])
+    note = mind._deliberation_note("s1", "How would we start on this?", "cloud", None)
    assert note and note["role"] == "system"
    assert "first move is the smallest" in note["content"]      # her thinking carried in
    assert "numbered list" in note["content"].lower()           # voice enforcement attached
@@ -49,10 +50,26 @@ def test_deliberation_skipped_when_disabled(lyra, monkeypatch):
    monkeypatch.setenv("CHAT_DELIBERATE", "false")
    called = []
    monkeypatch.setattr(mind.llm, "complete", lambda *a, **k: called.append(1) or "x")
-    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None, []) is None
+    assert mind._deliberation_note("s1", "a real substantive question here", "cloud", None) is None
    assert called == []                                          # no LLM call when off


+def test_persona_core_is_tight_situational_is_gated(lyra):
+    memory, mind = lyra
+    from lyra import persona
+    core, full = persona.core_prompt(), persona.system_prompt()
+    assert "How you talk" in core and "How you actually work" not in core  # voice core, self-model not
+    assert len(core) < len(full) and persona.section("How you actually work")
+
+    memory.ensure_session("s1")
+    casual = " ".join(m["content"] for m in mind.build_messages("s1", "any dinner ideas tonight?")
+                       if m["role"] == "system")
+    meta = " ".join(m["content"] for m in mind.build_messages("s1", "how does your memory actually work?")
+                    if m["role"] == "system")
+    assert "How you actually work" not in casual      # situational section omitted on a casual turn
+    assert "How you actually work" in meta            # pulled in for a meta question
+
+
 def test_assemble_runs_the_pipeline(lyra, monkeypatch):
    memory, mind = lyra
    monkeypatch.setenv("CHAT_DELIBERATE", "false")  # keep it offline for the structure test