"""Model bake-off: run Lyra's *real* reflect() and think() prompts through several
candidate models, side by side, so we can judge which sounds most like *her* and
least like a generic helpful assistant.

It captures the exact prompts the live code builds (by intercepting the first
llm.complete call and aborting before any DB write — so this is read-only and
doesn't pollute her real journal/self-state), then replays those identical prompts
to each candidate backend/model.

Run:  uv run python bakeoff/run.py
Out:  bakeoff/results.md
"""
from __future__ import annotations

import os
import time
import traceback
from pathlib import Path

# Make think()'s "new thread" the pure-interior (wander) prompt, not a feed reaction.
# setdefault only applies when FEED_REACT_PROB is not already set, so a value exported
# in the shell still wins. Placed ahead of the lyra import on purpose — presumably
# lyra reads this variable at import time (TODO confirm).
os.environ.setdefault("FEED_REACT_PROB", "0")

from lyra import llm, self_state, thoughts  # noqa: E402

# Models under comparison. Each entry is a (label, backend, model) triple; a model
# of None means "use the backend's default".
CANDIDATES = [
    (
        "Qwen2.5-32B  (MI50 — her CURRENT dream voice)",
        "mi50",
        None,
    ),
    (
        "Qwen2.5-14B-instruct  (3090)",
        "local",
        "qwen2.5:14b-instruct",
    ),
    (
        "Hermes-3-8B  (3090 — steerable)",
        "local",
        "hermes3:8b",
    ),
    (
        "Dolphin-3-8B  (3090 — de-aligned)",
        "local",
        "dolphin3:8b",
    ),
    (
        "gpt-4o-mini  (cloud — generic-helper baseline)",
        "cloud",
        "gpt-4o-mini",
    ),
]


class _Stop(Exception):
    pass


def _capture(run) -> list[dict]:
    """Run ``run`` with llm.complete stubbed out and return the first prompt it sends.

    ``llm.complete`` is temporarily rebound to a stub that records the ``messages``
    argument and raises ``_Stop`` so that ``run`` is aborted at that point. The
    original function is always restored, whatever ``run`` raises.

    Args:
        run: Zero-argument callable expected to call ``llm.complete``.

    Returns:
        The ``messages`` object passed to the first intercepted call, or ``[]`` if
        ``run`` returned without ever calling ``llm.complete`` (in which case
        ``run`` executed to completion, including anything else it does).

    Raises:
        Any exception other than ``_Stop`` raised by ``run`` propagates unchanged.
    """
    grabbed: dict = {}
    orig = llm.complete

    def cap(messages, *args, **kwargs):
        # Accept whatever extra arguments the caller passes: a stricter signature
        # would raise TypeError instead of the abort signal.
        # setdefault keeps the FIRST call's messages even if the abort signal gets
        # swallowed upstream and the stub ends up being invoked again.
        grabbed.setdefault("messages", messages)
        raise _Stop()

    # Rebinding the module attribute only intercepts callers that look the
    # function up as llm.complete at call time.
    llm.complete = cap
    try:
        run()
    except _Stop:
        # Expected path: the stub fired and aborted the run.
        pass
    finally:
        llm.complete = orig
    return grabbed.get("messages", [])


def _ask(messages, backend, model) -> tuple[str, float]:
    """Send ``messages`` to one backend/model and time the call.

    Args:
        messages: Prompt messages, passed through to ``llm.complete`` unchanged.
        backend: Backend name forwarded as the ``backend`` keyword.
        model: Model name forwarded as the ``model`` keyword (may be None).

    Returns:
        ``(output, seconds)`` — whatever ``llm.complete`` returned and the elapsed
        time of the call in seconds.

    Raises:
        Whatever ``llm.complete`` raises; nothing is caught here.
    """
    # perf_counter is monotonic, so the measured duration cannot be skewed by a
    # wall-clock adjustment happening mid-call.
    t0 = time.perf_counter()
    out = llm.complete(messages, backend=backend, model=model)
    return out, time.perf_counter() - t0


def main() -> int:
    """Capture the live think/reflect prompts, replay them to every candidate, and
    write a side-by-side Markdown report.

    Steps:
      1. Capture the messages built by ``thoughts.think`` and ``self_state.reflect``
         via ``_capture``.
      2. For each captured prompt, call every entry of ``CANDIDATES`` through
         ``_ask`` and collect the output (or the failure) under its own heading.
      3. Write the report to ``results.md`` next to this file.

    A failure of one candidate is recorded in the report and printed with its
    traceback; it does not stop the remaining candidates.

    Returns:
        Always 0.
    """
    print("Capturing her real prompts (read-only)...")
    prompts = {
        "THINK — a new thought of her own (wander)":
            _capture(lambda: thoughts.think(backend="mi50", force_mode="new")),
        "REFLECT — her idle self-reflection (draft pass)":
            _capture(lambda: self_state.reflect(backend="mi50")),
    }
    for name, msgs in prompts.items():
        print(f"  {name}: {len(msgs)} messages, {sum(len(m['content']) for m in msgs)} chars")

    lines = [
        "# Lyra model bake-off",
        "",
        f"_Generated {time.strftime('%Y-%m-%d %H:%M %Z')}._ Same prompt, different models.",
        "Read for: does it sound like **her** (continuous, has her own interiority) vs. a "
        "**generic assistant** (\"as an AI, I'm here to support Brian…\")?",
        "",
    ]

    for prompt_name, messages in prompts.items():
        lines.append(f"\n## {prompt_name}\n")

        if not messages:
            # Nothing was captured for this prompt. Replaying an empty message list
            # to every model would only burn calls and fill the report with noise,
            # so say so once and move on.
            lines.append("⚠️ **no prompt captured** — nothing to replay for this section.")
            lines.append("")
            print(f"  [{prompt_name[:12]}] no prompt captured; skipping candidates")
            continue

        for label, backend, model in CANDIDATES:
            print(f"  [{prompt_name[:12]}] {label} ...", flush=True)
            # The heading is the same whether the call succeeds or fails.
            lines.append(f"### {label}")
            try:
                out, dt = _ask(messages, backend, model)
                # A None completion is treated like an empty one instead of being
                # reported as an AttributeError failure.
                text = (out or "").strip() or "(empty response)"
            except Exception as exc:
                # Best-effort by design: one broken backend must not cost the
                # results of the others.
                lines.append(f"⚠️ **failed:** {exc}")
                print(f"      failed: {exc}")
                traceback.print_exc()
            else:
                lines.append(f"_{dt:.1f}s_\n")
                lines.append(text)
            lines.append("")

    out_path = Path(__file__).parent / "results.md"
    out_path.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nWrote {out_path}")
    return 0


# Script entry point: run the bake-off and hand main()'s return value to the
# interpreter as the process exit status.
if __name__ == "__main__":
    status = main()
    raise SystemExit(status)