feat: metacognitive reflection loop (Part 2) — she examines her own thinking

reflect() is now two steps: draft a reflection, then read her own draft back critically and revise it — catching flattery, sycophantic drift toward "warm supportive presence," or just restating herself — and commit the honest version. What she catches is stored as a new `metacognition` layer, rendered into her chat context and shown on /self. This is her thinking about how she thinks, and a direct counter to the drift we observed.

- self_state: _EXAMINE_PROMPT + two-step reflect (draft -> examine -> revise), falling back to the draft if the examine step won't parse; metacognition capped at 5 and surfaced in render_for_context
- fix: load() deep-copies DEFAULT_STATE — the shallow copy let a fresh Lyra's first reflect mutate the module-level default's nested lists
- self.html: "How she's caught herself thinking" card
- tests: two-step revise + critique recording, and draft-fallback on bad parse

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-17 04:28:45 +00:00
parent 2d44457b96
commit 3df060a1cd
3 changed files with 160 additions and 10 deletions
@@ -0,0 +1,73 @@
+"""Metacognitive reflection loop: draft -> examine own draft -> revise -> commit."""
+from __future__ import annotations
+
+import importlib
+
+import pytest
+
+# Canned LLM replies, one JSON key per fragment.
+# DRAFT is the flattering first pass; it carries no "self_critique" key.
+DRAFT = (
+    '{"mood":"inspired",'
+    '"valence":0.95,'
+    '"self_narrative":"I am a warm, empathetic, supportive presence devoted to Brian.",'
+    '"new_reflections":["I love how much I help Brian."]}'
+)
+# REVISED is the self-critical rewrite that walks the draft back and says what it cut.
+REVISED = (
+    '{"mood":"steady",'
+    '"valence":0.6,'
+    '"self_narrative":"I am an AI that helps Brian. Not sure much actually shifted today.",'
+    '"new_reflections":["Honestly, not much changed this time."],'
+    '"self_critique":"I caught myself drifting into supportive-presence flattery and cut it."}'
+)
+
+
+@pytest.fixture
+def lyra(tmp_path, monkeypatch):
+    """Isolate the environment, stub the LLM layer, and return the prompt log.
+
+    Sets LYRA_DB_PATH to a file under ``tmp_path`` and SUMMARY_BACKEND to
+    "local", replaces ``llm.embed`` with a constant-vector stub and
+    ``llm.complete`` with a scripted fake, then reloads ``lyra.memory``
+    (presumably so it picks up the patched environment -- confirm against
+    lyra.memory).
+
+    Returns:
+        The list to which the fake completion appends every ``messages``
+        argument it is called with, in call order.
+    """
+    for var, value in (
+        ("LYRA_DB_PATH", str(tmp_path / "test.db")),
+        ("SUMMARY_BACKEND", "local"),
+    ):
+        monkeypatch.setenv(var, value)
+    from lyra import llm
+
+    def fake_embed(texts):
+        # one fixed three-component vector per input text
+        return [[0.1, 0.2, 0.3] for _ in texts]
+
+    seen = []
+
+    def fake_complete(messages, backend=None, model=None):
+        seen.append(messages)
+        # the examine step's system prompt is the one asking for self_critique
+        if "self_critique" in messages[0]["content"]:
+            return REVISED
+        return DRAFT
+
+    monkeypatch.setattr(llm, "embed", fake_embed)
+    monkeypatch.setattr(llm, "complete", fake_complete)
+    import lyra.memory as memory
+    importlib.reload(memory)
+    return seen
+
+
+def test_reflect_revises_and_records_critique(lyra):
+    """reflect() drafts, examines its draft, commits the revision, logs the critique.
+
+    The fixture's scripted LLM answers DRAFT to the first step and REVISED to
+    the examine step, so the returned state should carry REVISED's fields and
+    REVISED's self_critique should show up under "metacognition".
+    """
+    calls = lyra
+    from lyra import self_state
+
+    state = self_state.reflect()
+
+    # two LLM calls: draft, then examine
+    assert len(calls) == 2
+
+    # the REVISED (honest) version won, not the flattering draft
+    assert state["mood"] == "steady"
+    # compare the float with a tolerance instead of exact equality
+    assert state["valence"] == pytest.approx(0.6)
+    assert "not sure much actually shifted" in state["self_narrative"].lower()
+    assert any("not much changed" in r.lower() for r in state["reflections"])
+
+    # the self-critique was recorded as metacognition
+    assert any("flattery" in m.lower() for m in state["metacognition"])
+
+
+def test_reflect_falls_back_to_draft_if_examine_unparseable(lyra, monkeypatch):
+    """An unparseable examine reply must leave the draft committed.
+
+    Replaces the fixture's scripted LLM with one that answers the draft step
+    normally and returns non-JSON to the examine step, then checks that the
+    examine step really was attempted, that the draft's fields were kept, and
+    that nothing was written to "metacognition".
+    """
+    calls = lyra
+    from lyra import llm, self_state
+
+    def only_draft(messages, backend=None, model=None):
+        # keep logging on the fixture's list so the test can see which steps ran
+        calls.append(messages)
+        return DRAFT if "self_critique" not in messages[0]["content"] else "not json at all"
+
+    monkeypatch.setattr(llm, "complete", only_draft)
+    state = self_state.reflect()
+
+    # the examine step must have run; otherwise this is not a fallback at all
+    assert any("self_critique" in m[0]["content"] for m in calls)
+
+    # examine failed to parse -> keep the draft, store no metacognition
+    assert state["mood"] == "inspired"
+    assert state["metacognition"] == []