"""Model bake-off: run Lyra's *real* reflect() and think() prompts through several
candidate models, side by side, so we can judge which sounds most like *her* and
least like a generic helpful assistant.

It captures the exact prompts the live code builds (by intercepting the first
llm.complete call and aborting before any DB write — so this is read-only and
doesn't pollute her real journal/self-state), then replays those identical prompts
to each candidate backend/model.

Run:  uv run python bakeoff/run.py
Out:  bakeoff/results.md
"""

from __future__ import annotations

import os
import time
import traceback
from collections.abc import Callable
from pathlib import Path

# Make think()'s "new thread" the pure-interior (wander) prompt, not a feed reaction.
os.environ.setdefault("FEED_REACT_PROB", "0")

from lyra import llm, self_state, thoughts  # noqa: E402

# (label, backend, model) — None model = backend default.
CANDIDATES: list[tuple[str, str, str | None]] = [
    ("Qwen2.5-32B (MI50 — her CURRENT dream voice)", "mi50", None),
    ("Qwen2.5-14B-instruct (3090)", "local", "qwen2.5:14b-instruct"),
    ("Hermes-3-8B (3090 — steerable)", "local", "hermes3:8b"),
    ("Dolphin-3-8B (3090 — de-aligned)", "local", "dolphin3:8b"),
    ("gpt-4o-mini (cloud — generic-helper baseline)", "cloud", "gpt-4o-mini"),
]


class _Stop(BaseException):
    """Sentinel raised by the capture stub to abort the intercepted call chain.

    Derives from BaseException (not Exception) on purpose: an ``except Exception``
    handler anywhere between the stub and ``_capture`` would otherwise swallow the
    abort and let the caller carry on past the point we meant to stop at.
    """


def _capture(run: Callable[[], object]) -> list[dict]:
    """Run ``run`` with llm.complete stubbed out and return the first call's messages.

    ``llm.complete`` is temporarily replaced by a stub that records the
    ``messages`` argument of the FIRST call and raises ``_Stop`` to abort before
    anything after that call can execute. The original ``llm.complete`` is always
    restored, even if ``run`` raises.

    Args:
        run: Zero-argument callable expected to call ``llm.complete``.

    Returns:
        The captured messages, or ``[]`` if ``llm.complete`` was never called.
    """
    grabbed: dict = {}
    orig = llm.complete

    def cap(messages, *args, **kwargs):
        # Accept (and ignore) whatever else the caller passes, so an argument
        # other than backend/model cannot turn the capture into a TypeError.
        # setdefault keeps the FIRST call's messages if we are ever re-entered.
        grabbed.setdefault("messages", messages)
        raise _Stop()

    llm.complete = cap
    try:
        run()
    except _Stop:
        pass
    else:
        if "messages" in grabbed:
            # run() returned normally although the stub raised: something caught
            # the abort, so code after the llm.complete call may have executed.
            print(" WARNING: capture abort was swallowed; side effects may have occurred")
    finally:
        llm.complete = orig
    return grabbed.get("messages", [])


def _ask(messages, backend, model) -> tuple[str, float]:
    """Send ``messages`` to one backend/model and time the call.

    Returns:
        ``(output, seconds)`` where ``output`` is whatever ``llm.complete``
        returned and ``seconds`` is the elapsed time of that call.
    """
    # perf_counter is monotonic, so the duration is immune to system clock changes.
    t0 = time.perf_counter()
    out = llm.complete(messages, backend=backend, model=model)
    return out, time.perf_counter() - t0


def main() -> int:
    """Capture the prompts, replay them to every candidate, and write results.md.

    A candidate that raises is recorded as failed in the report and does not stop
    the run. A prompt whose capture came back empty is noted in the report and
    skipped rather than sent to the models.

    Returns:
        0, used as the process exit status.
    """
    print("Capturing her real prompts (read-only)...")
    prompts = {
        "THINK — a new thought of her own (wander)": _capture(
            lambda: thoughts.think(backend="mi50", force_mode="new")
        ),
        "REFLECT — her idle self-reflection (draft pass)": _capture(
            lambda: self_state.reflect(backend="mi50")
        ),
    }
    for name, msgs in prompts.items():
        print(f" {name}: {len(msgs)} messages, {sum(len(m['content']) for m in msgs)} chars")

    lines = [
        "# Lyra model bake-off",
        "",
        f"_Generated {time.strftime('%Y-%m-%d %H:%M %Z')}._ Same prompt, different models.",
        "Read for: does it sound like **her** (continuous, has her own interiority) vs. a "
        "**generic assistant** (\"as an AI, I'm here to support Brian…\")?",
        "",
    ]

    for prompt_name, messages in prompts.items():
        lines.append(f"\n## {prompt_name}\n")
        if not messages:
            # Nothing was captured, so there is no prompt to compare models on.
            lines.append("⚠️ **no prompt captured** — skipped.")
            lines.append("")
            print(f" [{prompt_name[:12]}] no prompt captured, skipping", flush=True)
            continue
        for label, backend, model in CANDIDATES:
            print(f" [{prompt_name[:12]}] {label} ...", flush=True)
            try:
                out, dt = _ask(messages, backend, model)
                out = out.strip() or "(empty response)"
                lines.append(f"### {label}")
                lines.append(f"_{dt:.1f}s_\n")
                lines.append(out)
                lines.append("")
            except Exception as exc:
                # Top-level boundary: one broken backend must not abort the bake-off.
                lines.append(f"### {label}")
                lines.append(f"⚠️ **failed:** {exc}")
                lines.append("")
                print(f" failed: {exc}")
                traceback.print_exc()

    out_path = Path(__file__).parent / "results.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nWrote {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())