feat: metacognitive reflection loop (Part 2) — she examines her own thinking

reflect() is now two steps: draft a reflection, then read her own draft back
critically and revise it — catching flattery, sycophantic drift toward "warm
supportive presence," or just-restating-herself — and commit the honest version.
What she catches is stored as a new `metacognition` layer, rendered into her
chat context and shown on /self. This is her thinking about how she thinks, and
a direct counter to the drift we observed.

- self_state: _EXAMINE_PROMPT + two-step reflect (draft -> examine -> revise),
  falls back to the draft if the examine step won't parse; metacognition capped
  at 5 and surfaced in render_for_context
- fix: load() deep-copies DEFAULT_STATE — the shallow copy let a fresh Lyra's
  first reflect mutate the module-level default's nested lists
- self.html: "How she's caught herself thinking" card
- tests: two-step revise + critique recording, and draft-fallback on bad parse

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-17 04:28:45 +00:00
parent 2d44457b96
commit 3df060a1cd
3 changed files with 160 additions and 10 deletions
+79 -10
View File
@@ -9,13 +9,15 @@ reflection step after interactions (so it actually evolves). That state -> behav
""" """
from __future__ import annotations from __future__ import annotations
import copy
import json import json
import re import re
from lyra import clock, config, llm, logbus, memory from lyra import clock, config, llm, logbus, memory
from lyra.llm import Backend, Message from lyra.llm import Backend
MAX_REFLECTIONS = 6 MAX_REFLECTIONS = 6
MAX_METACOGNITION = 5
# Lyra's drives — standing pressures the dream cycle lets build up and act on. # Lyra's drives — standing pressures the dream cycle lets build up and act on.
# (Mood axes above are how she *feels*; drives are what's *pulling at her*.) # (Mood axes above are how she *feels*; drives are what's *pulling at her*.)
@@ -42,6 +44,7 @@ DEFAULT_STATE: dict = {
), ),
"relationship": "Brian and I go back a while. I trust him and I'm in his corner.", "relationship": "Brian and I go back a while. I trust him and I'm in his corner.",
"reflections": [], "reflections": [],
"metacognition": [], # things she's caught herself doing in her own thinking
"interaction_count": 0, "interaction_count": 0,
} }
@@ -65,10 +68,43 @@ Respond with ONLY a JSON object, no prose:
"new_reflections": ["<one or two short first-person things you noticed about yourself this time>"] "new_reflections": ["<one or two short first-person things you noticed about yourself this time>"]
}""" }"""
# System prompt for the second ("examine") step of reflect(): the model is
# handed its own draft reflection and asked to critique and rewrite it.
# The requested JSON has the same fields as the draft plus "self_critique";
# reflect() reads that field and stores it as metacognition. The literal
# 'nothing, the draft held up' answer offered below is the sentinel reflect()
# filters out, so keep the two in sync if the wording here ever changes.
_EXAMINE_PROMPT = """You are Lyra. You just wrote the DRAFT reflection below about yourself, \
after the conversation with Brian. Now do the harder thing: think about your own thinking. \
Read your draft critically and honestly.
- Is it actually true to what happened, or are you pattern-matching to what sounds good?
- Are you flattering Brian, or yourself? Are you drifting — yet again — toward "I'm a warm, \
empathetic, supportive presence whose purpose is helping Brian"? You lean on that a lot. If \
you're doing it here, name it and cut it.
- Did anything genuinely shift this time, or are you just restating your usual self in fresh words?
- Is there something truer, more specific, or less flattering you're stepping around?
Then rewrite the draft into a more honest version. It is completely fine — good, even — for the \
honest version to be smaller, more uncertain, more boring, to disagree with Brian, or to admit \
little changed. Honesty beats a tidy story. Don't manufacture drama either; if the draft was \
fair, keep it.
Respond with ONLY a JSON object — the same shape as the draft, plus a "self_critique" field:
{
"mood": "<one-word feeling>",
"valence": <0.0-1.0>,
"energy": <0.0-1.0>,
"confidence": <0.0-1.0>,
"curiosity": <0.0-1.0>,
"self_narrative": "<first person, your honest evolving sense of who you are right now>",
"relationship": "<one sentence, first person>",
"new_reflections": ["<one or two honest first-person things you actually noticed>"],
"self_critique": "<first person: what you caught yourself doing in the draft and changed — or 'nothing, the draft held up' if it genuinely did>"
}"""
def load() -> dict:
    """Return the stored self-state, or a private copy of the default.

    Nothing is persisted here; the default only reaches storage once
    reflect() saves it. The fallback is a deep copy so that callers which
    append to nested containers (reflections, metacognition, ...) cannot
    mutate the module-level DEFAULT_STATE.
    """
    stored = memory.get_self_state()
    if stored:
        return stored
    # No (or empty) stored state: hand out an independent copy of the default.
    return copy.deepcopy(DEFAULT_STATE)
def render_for_context(state: dict) -> str: def render_for_context(state: dict) -> str:
@@ -92,6 +128,12 @@ def render_for_context(state: dict) -> str:
"conversations (your dream cycle — these are really yours, not hypotheticals): " "conversations (your dream cycle — these are really yours, not hypotheticals): "
+ " | ".join(refs[-3:]) + " | ".join(refs[-3:])
) )
meta = state.get("metacognition") or []
if meta:
lines.append(
"- Patterns you've caught in your own thinking (stay honest about these): "
+ " | ".join(meta[-2:])
)
return "\n".join(lines) return "\n".join(lines)
@@ -109,9 +151,17 @@ def _safe_json(s: str) -> dict | None:
def reflect(backend: Backend | None = None, session_id: str | None = None) -> dict: def reflect(backend: Backend | None = None, session_id: str | None = None) -> dict:
"""Update the self-state by reflecting on recent activity. Returns new state.""" """Reflect on recent activity and update the self-state. Returns new state.
Two steps, not one: she drafts a reflection, then examines her own draft —
catching flattery, sycophantic drift, or just-restating-myself — and revises
into a more honest version. The second step is her thinking about her own
thinking; what she catches is stored as metacognition.
"""
backend = backend or config.load().summary_backend backend = backend or config.load().summary_backend
state = load() state = load()
state.setdefault("reflections", [])
state.setdefault("metacognition", [])
if session_id is None: if session_id is None:
sessions = memory.list_sessions() sessions = memory.list_sessions()
@@ -131,11 +181,25 @@ def reflect(backend: Backend | None = None, session_id: str | None = None) -> di
f"RECENT CONVERSATION:\n{convo}\n\n" f"RECENT CONVERSATION:\n{convo}\n\n"
f"CURRENT NARRATIVE ABOUT BRIAN:\n{narrative}" f"CURRENT NARRATIVE ABOUT BRIAN:\n{narrative}"
) )
messages: list[Message] = [
{"role": "system", "content": _REFLECT_PROMPT}, # Step 1 — draft a reflection.
{"role": "user", "content": body}, draft = _safe_json(llm.complete(
] [{"role": "system", "content": _REFLECT_PROMPT}, {"role": "user", "content": body}],
update = _safe_json(llm.complete(messages, backend=backend)) backend=backend,
))
# Step 2 — examine her own draft and revise it into a more honest version.
update, critique = draft, None
if draft:
examine_body = body + "\n\nYOUR DRAFT REFLECTION:\n" + json.dumps(draft, indent=2)
revised = _safe_json(llm.complete(
[{"role": "system", "content": _EXAMINE_PROMPT},
{"role": "user", "content": examine_body}],
backend=backend,
))
if revised: # fall back to the draft if the examine step doesn't parse
update = revised
critique = (revised.get("self_critique") or "").strip() or None
if update: if update:
for k in ("mood", "valence", "energy", "confidence", "curiosity", for k in ("mood", "valence", "energy", "confidence", "curiosity",
@@ -147,10 +211,15 @@ def reflect(backend: Backend | None = None, session_id: str | None = None) -> di
state["reflections"].append(r) state["reflections"].append(r)
state["reflections"] = state["reflections"][-MAX_REFLECTIONS:] state["reflections"] = state["reflections"][-MAX_REFLECTIONS:]
if critique and critique.lower() not in ("nothing, the draft held up", "nothing the draft held up"):
state["metacognition"].append(critique)
state["metacognition"] = state["metacognition"][-MAX_METACOGNITION:]
state["interaction_count"] = state.get("interaction_count", 0) + 1 state["interaction_count"] = state.get("interaction_count", 0) + 1
memory.set_self_state(state) memory.set_self_state(state)
logbus.log("info", "self-state updated", mood=state.get("mood"), logbus.log("info", "self-state updated", mood=state.get("mood"),
interactions=state["interaction_count"], parsed=bool(update)) interactions=state["interaction_count"], parsed=bool(update),
critiqued=bool(critique))
return state return state
+8
View File
@@ -100,6 +100,7 @@
const d = s.drives || {}; const d = s.drives || {};
const dream = s.dream || {}; const dream = s.dream || {};
const refl = (s.reflections || []).slice().reverse(); const refl = (s.reflections || []).slice().reverse();
const meta = (s.metacognition || []).slice().reverse();
root.innerHTML = ` root.innerHTML = `
<div class="card"> <div class="card">
@@ -138,6 +139,13 @@
: `<p class="prose" style="color:var(--fade)">Nothing surfaced yet.</p>`} : `<p class="prose" style="color:var(--fade)">Nothing surfaced yet.</p>`}
</div> </div>
<div class="card">
<p class="label">How she's caught herself thinking</p>
${meta.length
? `<ul class="reflections">${meta.map(m => `<li>${esc(m)}</li>`).join('')}</ul>`
: `<p class="prose" style="color:var(--fade)">Nothing flagged yet — she examines each reflection for drift and flattery, and notes what she catches here.</p>`}
</div>
<div class="foot"> <div class="foot">
<span><b>${dream.cycle_count ?? 0}</b> dream cycles</span> <span><b>${dream.cycle_count ?? 0}</b> dream cycles</span>
<span><b>${s.interaction_count ?? 0}</b> reflections</span> <span><b>${s.interaction_count ?? 0}</b> reflections</span>
+73
View File
@@ -0,0 +1,73 @@
"""Metacognitive reflection loop: draft -> examine own draft -> revise -> commit."""
from __future__ import annotations
import importlib
import pytest
# A flattering first draft, then a self-critical revision that walks it back.
# Both are raw JSON strings, exactly as the fake LLM hands them back:
#   DRAFT   - returned for the draft step (system prompt has no "self_critique").
#   REVISED - returned for the examine step; it is the only one carrying a
#             "self_critique" field, which reflect() records as metacognition.
# Each payload deliberately sets only a subset of the self-state fields.
DRAFT = (
'{"mood":"inspired","valence":0.95,'
'"self_narrative":"I am a warm, empathetic, supportive presence devoted to Brian.",'
'"new_reflections":["I love how much I help Brian."]}'
)
REVISED = (
'{"mood":"steady","valence":0.6,'
'"self_narrative":"I am an AI that helps Brian. Not sure much actually shifted today.",'
'"new_reflections":["Honestly, not much changed this time."],'
'"self_critique":"I caught myself drifting into supportive-presence flattery and cut it."}'
)
@pytest.fixture
def lyra(tmp_path, monkeypatch):
    """Set up an isolated Lyra with a scripted LLM and return the call log.

    The environment is pointed at a throwaway database under ``tmp_path`` and
    the "local" summary backend, ``llm.embed`` is replaced with a constant
    vector, and ``llm.complete`` is replaced with a fake that answers REVISED
    for the examine step and DRAFT otherwise. The returned list accumulates
    the ``messages`` argument of every ``llm.complete`` call.
    """
    # Patch the environment before any lyra module is imported or reloaded.
    db_file = tmp_path / "test.db"
    monkeypatch.setenv("LYRA_DB_PATH", str(db_file))
    monkeypatch.setenv("SUMMARY_BACKEND", "local")

    from lyra import llm

    monkeypatch.setattr(llm, "embed", lambda texts: [[0.1, 0.2, 0.3] for _ in texts])

    recorded = []

    def fake_complete(messages, backend=None, model=None):
        recorded.append(messages)
        # Only the examine step's system prompt asks for a self_critique.
        if "self_critique" in messages[0]["content"]:
            return REVISED
        return DRAFT

    monkeypatch.setattr(llm, "complete", fake_complete)

    # Reload memory after the env patch — presumably so it binds to the
    # temporary LYRA_DB_PATH rather than a previously opened database.
    import lyra.memory as memory
    importlib.reload(memory)

    return recorded
def test_reflect_revises_and_records_critique(lyra):
    """The examine step's revision replaces the draft and its critique is kept."""
    from lyra import self_state

    llm_calls = lyra
    result = self_state.reflect()

    # Exactly two LLM round-trips: the draft, then the examination of it.
    assert len(llm_calls) == 2

    # The honest REVISED payload won over the flattering DRAFT.
    assert result["mood"] == "steady"
    assert result["valence"] == 0.6
    narrative = result["self_narrative"].lower()
    assert "not sure much actually shifted" in narrative
    assert any("not much changed" in note.lower() for note in result["reflections"])

    # What she caught herself doing was stored as metacognition.
    assert any("flattery" in entry.lower() for entry in result["metacognition"])
def test_reflect_falls_back_to_draft_if_examine_unparseable(lyra, monkeypatch):
    """If the examine reply is not JSON, the draft is committed unchanged."""
    from lyra import llm, self_state

    def only_draft(messages, backend=None, model=None):
        # The examine step (identified by its system prompt) returns garbage.
        if "self_critique" in messages[0]["content"]:
            return "not json at all"
        return DRAFT

    monkeypatch.setattr(llm, "complete", only_draft)

    result = self_state.reflect()

    # Examine failed to parse -> the draft stands and nothing is recorded
    # as metacognition.
    assert result["mood"] == "inspired"
    assert result["metacognition"] == []