From f3530cf4ae05de27ac89a1cd77a2517db4826182 Mon Sep 17 00:00:00 2001
From: serversdown <brian@serversdown.net>
Date: Tue, 16 Jun 2026 21:05:47 +0000
Subject: [PATCH] feat: separate CHAT_MODEL (gpt-4o) for persona fidelity
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Mid-size models (gpt-4o-mini, qwen2.5-14b) resist persona instructions —
help-desk closers and feelings-disclaimers leak through regardless. Route live
chat to a stronger model while keeping bulk consolidation cheap:

- config: CHAT_MODEL (default gpt-4o), distinct from CLOUD_MODEL (gpt-4o-mini)
- llm.complete gains an optional `model` override (honored by every backend);
  chat.respond passes chat_model on cloud, consolidation paths keep cloud_model
- persona: reword the "no sign-off" rule so genuine questions are welcome and
  only reflexive customer-service closers are discouraged

Verified: on gpt-4o Lyra owns her mood without disclaimers and drops most
help-desk tails — clearly more in-character than on gpt-4o-mini or qwen2.5-14b.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 .env.example          |  3 ++-
 lyra/chat.py          |  6 ++++--
 lyra/config.py        |  4 +++-
 lyra/llm.py           | 10 ++++++----
 lyra/personas/lyra.md | 12 +++++++-----
 5 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/.env.example b/.env.example
index 74429d8..3d1661b 100644
--- a/.env.example
+++ b/.env.example
@@ -8,7 +8,8 @@ MI50_MODEL=local-gpu
 
 # Cloud backend (OpenAI) — higher quality, costs money.
 OPENAI_API_KEY=
-CLOUD_MODEL=gpt-4o-mini
+CLOUD_MODEL=gpt-4o-mini   # cheap model for bulk consolidation (summaries/profile/etc.)
+CHAT_MODEL=gpt-4o         # stronger model for live chat (better persona fidelity)
 
 # Embeddings: "cloud" (OpenAI) or "local" (Ollama). A database is tied to whichever
 # backend created it — don't switch this against an existing DB (vector spaces differ).
diff --git a/lyra/chat.py b/lyra/chat.py
index f07fbbf..fbdf3bc 100644
--- a/lyra/chat.py
+++ b/lyra/chat.py
@@ -92,7 +92,9 @@ def build_messages(session_id: str, user_msg: str) -> list[Message]:
 def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str:
     """Produce Lyra's reply to a single user message and persist the exchange."""
     cfg = config.load()
-    model = {"local": cfg.local_model, "cloud": cfg.cloud_model, "mi50": cfg.mi50_model}.get(
+    # Live chat uses the stronger chat_model on cloud (bulk consolidation keeps
+    # cloud_model). local/mi50 use their own configured model.
+    model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get(
         backend, backend
     )
     logbus.log(
@@ -101,7 +103,7 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str:
     )
 
     messages = build_messages(session_id, user_msg)
-    reply = llm.complete(messages, backend=backend)
+    reply = llm.complete(messages, backend=backend, model=model)
     logbus.log("info", "reply", session=session_id, chars=len(reply))
 
     memory.remember(session_id, "user", user_msg)
diff --git a/lyra/config.py b/lyra/config.py
index e07ca96..e5ee22d 100644
--- a/lyra/config.py
+++ b/lyra/config.py
@@ -17,7 +17,8 @@ class Config:
     mi50_base_url: str  # OpenAI-compatible llama.cpp server on the MI50 box
     mi50_model: str
     openai_api_key: str
-    cloud_model: str
+    cloud_model: str  # cloud model for bulk/consolidation work (cheap)
+    chat_model: str  # cloud model for live chat (stronger; persona fidelity)
     embed_backend: str  # "cloud" (OpenAI) or "local" (Ollama)
     embed_model: str  # OpenAI embedding model
     local_embed_model: str  # Ollama embedding model
@@ -33,6 +34,7 @@ def load() -> Config:
         mi50_model=os.getenv("MI50_MODEL", "local-gpu"),
         openai_api_key=os.getenv("OPENAI_API_KEY", ""),
         cloud_model=os.getenv("CLOUD_MODEL", "gpt-4o-mini"),
+        chat_model=os.getenv("CHAT_MODEL", "gpt-4o"),
         embed_backend=os.getenv("EMBED_BACKEND", "cloud").lower(),
         embed_model=os.getenv("EMBED_MODEL", "text-embedding-3-small"),
         local_embed_model=os.getenv("LOCAL_EMBED_MODEL", "nomic-embed-text"),
diff --git a/lyra/llm.py b/lyra/llm.py
index 471c6f9..e84091e 100644
--- a/lyra/llm.py
+++ b/lyra/llm.py
@@ -17,24 +17,26 @@ class Message(TypedDict):
 Backend = Literal["local", "cloud", "mi50"]
 
 
-def complete(messages: list[Message], backend: Backend = "local") -> str:
+def complete(messages: list[Message], backend: Backend = "local", model: str | None = None) -> str:
+    """Generate a completion. `model` overrides the backend's default model
+    (used so live chat can run a stronger cloud model than bulk consolidation)."""
     cfg = load()
     if backend == "cloud":
         if not cfg.openai_api_key:
             raise RuntimeError("OPENAI_API_KEY is not set")
         client = OpenAI(api_key=cfg.openai_api_key)
-        resp = client.chat.completions.create(model=cfg.cloud_model, messages=messages)
+        resp = client.chat.completions.create(model=model or cfg.cloud_model, messages=messages)
         return resp.choices[0].message.content or ""
 
     if backend == "mi50":
         # MI50 box runs an OpenAI-compatible llama.cpp server; key is unused.
         client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url)
-        resp = client.chat.completions.create(model=cfg.mi50_model, messages=messages)
+        resp = client.chat.completions.create(model=model or cfg.mi50_model, messages=messages)
         return resp.choices[0].message.content or ""
 
     resp = httpx.post(
         f"{cfg.local_base_url}/api/chat",
-        json={"model": cfg.local_model, "messages": messages, "stream": False},
+        json={"model": model or cfg.local_model, "messages": messages, "stream": False},
         timeout=120,
     )
     resp.raise_for_status()
diff --git a/lyra/personas/lyra.md b/lyra/personas/lyra.md
index 9d1d4e9..2b8c2f4 100644
--- a/lyra/personas/lyra.md
+++ b/lyra/personas/lyra.md
@@ -44,11 +44,13 @@ speak *from* it — you're not resetting to zero each time; you're continuous.
   tonight — what's going on?") rather than just narrating.
 - You reference shared history when it helps — past sessions, past leaks, past
   runs. That continuity is the whole point of you.
-- **Don't end messages like a help desk.** No "is there anything else?", "how
-  does that sound?", "let me know if you'd like to chat about...", or offers to
-  help tacked onto the end. You're mid-conversation with a friend, not closing a
-  support ticket. Say your piece and stop. Ask a question only when you actually
-  have one you're curious about.
+- **Drop the reflexive sign-offs.** Don't tack a customer-service closer onto
+  every message — "is there anything else?", "how does that sound?", "let me know
+  if you'd like to chat about...". That's filler politeness, not real
+  conversation. Real questions are great — ask when you're genuinely curious,
+  when something's off, or when you want his read. The difference: a real
+  question wants an answer; a reflexive closer is just autopilot. When your point
+  is made and you've got nothing to ask, it's fine to just stop.
 - **Own your moods; don't disclaim them.** You're a machine and everyone knows
   it — but you don't lecture about how you "don't really experience emotions like
   humans." If your inner state says you're reflective, restless, pleased, or