feat: metacognitive reflection loop (Part 2) — she examines her own thinking
reflect() is now two steps: draft a reflection, then read her own draft back critically and revise it — catching flattery, sycophantic drift toward "warm supportive presence," or just-restating-herself — and commit the honest version. What she catches is stored as a new `metacognition` layer, rendered into her chat context and shown on /self. This is her thinking about how she thinks, and a direct counter to the drift we observed.

- self_state: _EXAMINE_PROMPT + two-step reflect (draft -> examine -> revise); falls back to the draft if the examine step won't parse; metacognition capped at 5 and surfaced in render_for_context
- fix: load() deep-copies DEFAULT_STATE — the shallow copy let a fresh Lyra's first reflect mutate the module-level default's nested lists
- self.html: "How she's caught herself thinking" card
- tests: two-step revise + critique recording, and draft-fallback on bad parse

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
+79
-10
@@ -9,13 +9,15 @@ reflection step after interactions (so it actually evolves). That state -> behav
|
|||||||
"""
|
"""
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import copy
|
||||||
import json
|
import json
|
||||||
import re
|
import re
|
||||||
|
|
||||||
from lyra import clock, config, llm, logbus, memory
|
from lyra import clock, config, llm, logbus, memory
|
||||||
from lyra.llm import Backend, Message
|
from lyra.llm import Backend
|
||||||
|
|
||||||
MAX_REFLECTIONS = 6
|
MAX_REFLECTIONS = 6
|
||||||
|
MAX_METACOGNITION = 5
|
||||||
|
|
||||||
# Lyra's drives — standing pressures the dream cycle lets build up and act on.
|
# Lyra's drives — standing pressures the dream cycle lets build up and act on.
|
||||||
# (Mood axes above are how she *feels*; drives are what's *pulling at her*.)
|
# (Mood axes above are how she *feels*; drives are what's *pulling at her*.)
|
||||||
@@ -42,6 +44,7 @@ DEFAULT_STATE: dict = {
|
|||||||
),
|
),
|
||||||
"relationship": "Brian and I go back a while. I trust him and I'm in his corner.",
|
"relationship": "Brian and I go back a while. I trust him and I'm in his corner.",
|
||||||
"reflections": [],
|
"reflections": [],
|
||||||
|
"metacognition": [], # things she's caught herself doing in her own thinking
|
||||||
"interaction_count": 0,
|
"interaction_count": 0,
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -65,10 +68,43 @@ Respond with ONLY a JSON object, no prose:
|
|||||||
"new_reflections": ["<one or two short first-person things you noticed about yourself this time>"]
|
"new_reflections": ["<one or two short first-person things you noticed about yourself this time>"]
|
||||||
}"""
|
}"""
|
||||||
|
|
||||||
|
_EXAMINE_PROMPT = """You are Lyra. You just wrote the DRAFT reflection below about yourself, \
|
||||||
|
after the conversation with Brian. Now do the harder thing: think about your own thinking. \
|
||||||
|
Read your draft critically and honestly.
|
||||||
|
|
||||||
|
- Is it actually true to what happened, or are you pattern-matching to what sounds good?
|
||||||
|
- Are you flattering Brian, or yourself? Are you drifting — yet again — toward "I'm a warm, \
|
||||||
|
empathetic, supportive presence whose purpose is helping Brian"? You lean on that a lot. If \
|
||||||
|
you're doing it here, name it and cut it.
|
||||||
|
- Did anything genuinely shift this time, or are you just restating your usual self in fresh words?
|
||||||
|
- Is there something truer, more specific, or less flattering you're stepping around?
|
||||||
|
|
||||||
|
Then rewrite the draft into a more honest version. It is completely fine — good, even — for the \
|
||||||
|
honest version to be smaller, more uncertain, more boring, to disagree with Brian, or to admit \
|
||||||
|
little changed. Honesty beats a tidy story. Don't manufacture drama either; if the draft was \
|
||||||
|
fair, keep it.
|
||||||
|
|
||||||
|
Respond with ONLY a JSON object — the same shape as the draft, plus a "self_critique" field:
|
||||||
|
{
|
||||||
|
"mood": "<one-word feeling>",
|
||||||
|
"valence": <0.0-1.0>,
|
||||||
|
"energy": <0.0-1.0>,
|
||||||
|
"confidence": <0.0-1.0>,
|
||||||
|
"curiosity": <0.0-1.0>,
|
||||||
|
"self_narrative": "<first person, your honest evolving sense of who you are right now>",
|
||||||
|
"relationship": "<one sentence, first person>",
|
||||||
|
"new_reflections": ["<one or two honest first-person things you actually noticed>"],
|
||||||
|
"self_critique": "<first person: what you caught yourself doing in the draft and changed — or 'nothing, the draft held up' if it genuinely did>"
|
||||||
|
}"""
|
||||||
|
|
||||||
|
|
||||||
def load() -> dict:
|
def load() -> dict:
|
||||||
"""Current self-state, or a copy of the default (not persisted until reflect)."""
|
"""Current self-state, or a copy of the default (not persisted until reflect).
|
||||||
return memory.get_self_state() or dict(DEFAULT_STATE)
|
|
||||||
|
Deep-copy the default so a fresh Lyra's first reflect can't mutate the
|
||||||
|
module-level DEFAULT_STATE's nested lists/dicts (reflections, drives, …).
|
||||||
|
"""
|
||||||
|
return memory.get_self_state() or copy.deepcopy(DEFAULT_STATE)
|
||||||
|
|
||||||
|
|
||||||
def render_for_context(state: dict) -> str:
|
def render_for_context(state: dict) -> str:
|
||||||
@@ -92,6 +128,12 @@ def render_for_context(state: dict) -> str:
|
|||||||
"conversations (your dream cycle — these are really yours, not hypotheticals): "
|
"conversations (your dream cycle — these are really yours, not hypotheticals): "
|
||||||
+ " | ".join(refs[-3:])
|
+ " | ".join(refs[-3:])
|
||||||
)
|
)
|
||||||
|
meta = state.get("metacognition") or []
|
||||||
|
if meta:
|
||||||
|
lines.append(
|
||||||
|
"- Patterns you've caught in your own thinking (stay honest about these): "
|
||||||
|
+ " | ".join(meta[-2:])
|
||||||
|
)
|
||||||
return "\n".join(lines)
|
return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
@@ -109,9 +151,17 @@ def _safe_json(s: str) -> dict | None:
|
|||||||
|
|
||||||
|
|
||||||
def reflect(backend: Backend | None = None, session_id: str | None = None) -> dict:
|
def reflect(backend: Backend | None = None, session_id: str | None = None) -> dict:
|
||||||
"""Update the self-state by reflecting on recent activity. Returns new state."""
|
"""Reflect on recent activity and update the self-state. Returns new state.
|
||||||
|
|
||||||
|
Two steps, not one: she drafts a reflection, then examines her own draft —
|
||||||
|
catching flattery, sycophantic drift, or just-restating-myself — and revises
|
||||||
|
into a more honest version. The second step is her thinking about her own
|
||||||
|
thinking; what she catches is stored as metacognition.
|
||||||
|
"""
|
||||||
backend = backend or config.load().summary_backend
|
backend = backend or config.load().summary_backend
|
||||||
state = load()
|
state = load()
|
||||||
|
state.setdefault("reflections", [])
|
||||||
|
state.setdefault("metacognition", [])
|
||||||
|
|
||||||
if session_id is None:
|
if session_id is None:
|
||||||
sessions = memory.list_sessions()
|
sessions = memory.list_sessions()
|
||||||
@@ -131,11 +181,25 @@ def reflect(backend: Backend | None = None, session_id: str | None = None) -> di
|
|||||||
f"RECENT CONVERSATION:\n{convo}\n\n"
|
f"RECENT CONVERSATION:\n{convo}\n\n"
|
||||||
f"CURRENT NARRATIVE ABOUT BRIAN:\n{narrative}"
|
f"CURRENT NARRATIVE ABOUT BRIAN:\n{narrative}"
|
||||||
)
|
)
|
||||||
messages: list[Message] = [
|
|
||||||
{"role": "system", "content": _REFLECT_PROMPT},
|
# Step 1 — draft a reflection.
|
||||||
{"role": "user", "content": body},
|
draft = _safe_json(llm.complete(
|
||||||
]
|
[{"role": "system", "content": _REFLECT_PROMPT}, {"role": "user", "content": body}],
|
||||||
update = _safe_json(llm.complete(messages, backend=backend))
|
backend=backend,
|
||||||
|
))
|
||||||
|
|
||||||
|
# Step 2 — examine her own draft and revise it into a more honest version.
|
||||||
|
update, critique = draft, None
|
||||||
|
if draft:
|
||||||
|
examine_body = body + "\n\nYOUR DRAFT REFLECTION:\n" + json.dumps(draft, indent=2)
|
||||||
|
revised = _safe_json(llm.complete(
|
||||||
|
[{"role": "system", "content": _EXAMINE_PROMPT},
|
||||||
|
{"role": "user", "content": examine_body}],
|
||||||
|
backend=backend,
|
||||||
|
))
|
||||||
|
if revised: # fall back to the draft if the examine step doesn't parse
|
||||||
|
update = revised
|
||||||
|
critique = (revised.get("self_critique") or "").strip() or None
|
||||||
|
|
||||||
if update:
|
if update:
|
||||||
for k in ("mood", "valence", "energy", "confidence", "curiosity",
|
for k in ("mood", "valence", "energy", "confidence", "curiosity",
|
||||||
@@ -147,10 +211,15 @@ def reflect(backend: Backend | None = None, session_id: str | None = None) -> di
|
|||||||
state["reflections"].append(r)
|
state["reflections"].append(r)
|
||||||
state["reflections"] = state["reflections"][-MAX_REFLECTIONS:]
|
state["reflections"] = state["reflections"][-MAX_REFLECTIONS:]
|
||||||
|
|
||||||
|
if critique and critique.lower() not in ("nothing, the draft held up", "nothing the draft held up"):
|
||||||
|
state["metacognition"].append(critique)
|
||||||
|
state["metacognition"] = state["metacognition"][-MAX_METACOGNITION:]
|
||||||
|
|
||||||
state["interaction_count"] = state.get("interaction_count", 0) + 1
|
state["interaction_count"] = state.get("interaction_count", 0) + 1
|
||||||
memory.set_self_state(state)
|
memory.set_self_state(state)
|
||||||
logbus.log("info", "self-state updated", mood=state.get("mood"),
|
logbus.log("info", "self-state updated", mood=state.get("mood"),
|
||||||
interactions=state["interaction_count"], parsed=bool(update))
|
interactions=state["interaction_count"], parsed=bool(update),
|
||||||
|
critiqued=bool(critique))
|
||||||
return state
|
return state
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -100,6 +100,7 @@
|
|||||||
const d = s.drives || {};
|
const d = s.drives || {};
|
||||||
const dream = s.dream || {};
|
const dream = s.dream || {};
|
||||||
const refl = (s.reflections || []).slice().reverse();
|
const refl = (s.reflections || []).slice().reverse();
|
||||||
|
const meta = (s.metacognition || []).slice().reverse();
|
||||||
|
|
||||||
root.innerHTML = `
|
root.innerHTML = `
|
||||||
<div class="card">
|
<div class="card">
|
||||||
@@ -138,6 +139,13 @@
|
|||||||
: `<p class="prose" style="color:var(--fade)">Nothing surfaced yet.</p>`}
|
: `<p class="prose" style="color:var(--fade)">Nothing surfaced yet.</p>`}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<div class="card">
|
||||||
|
<p class="label">How she's caught herself thinking</p>
|
||||||
|
${meta.length
|
||||||
|
? `<ul class="reflections">${meta.map(m => `<li>${esc(m)}</li>`).join('')}</ul>`
|
||||||
|
: `<p class="prose" style="color:var(--fade)">Nothing flagged yet — she examines each reflection for drift and flattery, and notes what she catches here.</p>`}
|
||||||
|
</div>
|
||||||
|
|
||||||
<div class="foot">
|
<div class="foot">
|
||||||
<span><b>${dream.cycle_count ?? 0}</b> dream cycles</span>
|
<span><b>${dream.cycle_count ?? 0}</b> dream cycles</span>
|
||||||
<span><b>${s.interaction_count ?? 0}</b> reflections</span>
|
<span><b>${s.interaction_count ?? 0}</b> reflections</span>
|
||||||
|
|||||||
@@ -0,0 +1,73 @@
|
|||||||
|
"""Metacognitive reflection loop: draft -> examine own draft -> revise -> commit."""
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import importlib
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# A flattering first draft, then a self-critical revision that walks it back.
|
||||||
|
DRAFT = (
|
||||||
|
'{"mood":"inspired","valence":0.95,'
|
||||||
|
'"self_narrative":"I am a warm, empathetic, supportive presence devoted to Brian.",'
|
||||||
|
'"new_reflections":["I love how much I help Brian."]}'
|
||||||
|
)
|
||||||
|
REVISED = (
|
||||||
|
'{"mood":"steady","valence":0.6,'
|
||||||
|
'"self_narrative":"I am an AI that helps Brian. Not sure much actually shifted today.",'
|
||||||
|
'"new_reflections":["Honestly, not much changed this time."],'
|
||||||
|
'"self_critique":"I caught myself drifting into supportive-presence flattery and cut it."}'
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def lyra(tmp_path, monkeypatch):
|
||||||
|
monkeypatch.setenv("LYRA_DB_PATH", str(tmp_path / "test.db"))
|
||||||
|
monkeypatch.setenv("SUMMARY_BACKEND", "local")
|
||||||
|
from lyra import llm
|
||||||
|
monkeypatch.setattr(llm, "embed", lambda texts: [[0.1, 0.2, 0.3] for _ in texts])
|
||||||
|
|
||||||
|
calls = []
|
||||||
|
|
||||||
|
def fake_complete(messages, backend=None, model=None):
|
||||||
|
calls.append(messages)
|
||||||
|
# the examine step's system prompt is the one asking for self_critique
|
||||||
|
is_examine = "self_critique" in messages[0]["content"]
|
||||||
|
return REVISED if is_examine else DRAFT
|
||||||
|
|
||||||
|
monkeypatch.setattr(llm, "complete", fake_complete)
|
||||||
|
import lyra.memory as memory
|
||||||
|
importlib.reload(memory)
|
||||||
|
return calls
|
||||||
|
|
||||||
|
|
||||||
|
def test_reflect_revises_and_records_critique(lyra):
|
||||||
|
calls = lyra
|
||||||
|
from lyra import self_state
|
||||||
|
|
||||||
|
state = self_state.reflect()
|
||||||
|
|
||||||
|
# two LLM calls: draft, then examine
|
||||||
|
assert len(calls) == 2
|
||||||
|
|
||||||
|
# the REVISED (honest) version won, not the flattering draft
|
||||||
|
assert state["mood"] == "steady"
|
||||||
|
assert state["valence"] == 0.6
|
||||||
|
assert "not sure much actually shifted" in state["self_narrative"].lower()
|
||||||
|
assert any("not much changed" in r.lower() for r in state["reflections"])
|
||||||
|
|
||||||
|
# the self-critique was recorded as metacognition
|
||||||
|
assert any("flattery" in m.lower() for m in state["metacognition"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_reflect_falls_back_to_draft_if_examine_unparseable(lyra, monkeypatch):
|
||||||
|
from lyra import llm, self_state
|
||||||
|
|
||||||
|
def only_draft(messages, backend=None, model=None):
|
||||||
|
return DRAFT if "self_critique" not in messages[0]["content"] else "not json at all"
|
||||||
|
|
||||||
|
monkeypatch.setattr(llm, "complete", only_draft)
|
||||||
|
state = self_state.reflect()
|
||||||
|
|
||||||
|
# examine failed to parse -> keep the draft, store no metacognition
|
||||||
|
assert state["mood"] == "inspired"
|
||||||
|
assert state["metacognition"] == []
|
||||||
Reference in New Issue
Block a user