feat: separate CHAT_MODEL (gpt-4o) for persona fidelity

Mid-size models (gpt-4o-mini, qwen2.5-14b) resist persona instructions —
help-desk closers and feelings-disclaimers leak through regardless. Route live
chat to a stronger model while keeping bulk consolidation cheap:

- config: CHAT_MODEL (default gpt-4o), distinct from CLOUD_MODEL (gpt-4o-mini)
- llm.complete gains a `model` override; chat.respond uses chat_model on cloud,
  while consolidation paths keep cloud_model
- persona: reword the "no sign-off" rule so genuine questions are welcome and
  only reflexive customer-service closers are discouraged

Verified: on gpt-4o, Lyra owns her mood without disclaimers and drops most
help-desk tails — clearly more in-character than on gpt-4o-mini or qwen2.5-14b.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
2026-06-16 21:05:47 +00:00
parent e512cd1926
commit f3530cf4ae
5 changed files with 22 additions and 13 deletions
+2 -1
View File
@@ -8,7 +8,8 @@ MI50_MODEL=local-gpu
# Cloud backend (OpenAI) — higher quality, costs money. # Cloud backend (OpenAI) — higher quality, costs money.
OPENAI_API_KEY= OPENAI_API_KEY=
CLOUD_MODEL=gpt-4o-mini CLOUD_MODEL=gpt-4o-mini # cheap model for bulk consolidation (summaries/profile/etc.)
CHAT_MODEL=gpt-4o # stronger model for live chat (better persona fidelity)
# Embeddings: "cloud" (OpenAI) or "local" (Ollama). A database is tied to whichever # Embeddings: "cloud" (OpenAI) or "local" (Ollama). A database is tied to whichever
# backend created it — don't switch this against an existing DB (vector spaces differ). # backend created it — don't switch this against an existing DB (vector spaces differ).
+4 -2
View File
@@ -92,7 +92,9 @@ def build_messages(session_id: str, user_msg: str) -> list[Message]:
def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str: def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str:
"""Produce Lyra's reply to a single user message and persist the exchange.""" """Produce Lyra's reply to a single user message and persist the exchange."""
cfg = config.load() cfg = config.load()
model = {"local": cfg.local_model, "cloud": cfg.cloud_model, "mi50": cfg.mi50_model}.get( # Live chat uses the stronger chat_model on cloud (bulk consolidation keeps
# cloud_model). local/mi50 use their own configured model.
model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get(
backend, backend backend, backend
) )
logbus.log( logbus.log(
@@ -101,7 +103,7 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str:
) )
messages = build_messages(session_id, user_msg) messages = build_messages(session_id, user_msg)
reply = llm.complete(messages, backend=backend) reply = llm.complete(messages, backend=backend, model=model)
logbus.log("info", "reply", session=session_id, chars=len(reply)) logbus.log("info", "reply", session=session_id, chars=len(reply))
memory.remember(session_id, "user", user_msg) memory.remember(session_id, "user", user_msg)
+3 -1
View File
@@ -17,7 +17,8 @@ class Config:
mi50_base_url: str # OpenAI-compatible llama.cpp server on the MI50 box mi50_base_url: str # OpenAI-compatible llama.cpp server on the MI50 box
mi50_model: str mi50_model: str
openai_api_key: str openai_api_key: str
cloud_model: str cloud_model: str # cloud model for bulk/consolidation work (cheap)
chat_model: str # cloud model for live chat (stronger; persona fidelity)
embed_backend: str # "cloud" (OpenAI) or "local" (Ollama) embed_backend: str # "cloud" (OpenAI) or "local" (Ollama)
embed_model: str # OpenAI embedding model embed_model: str # OpenAI embedding model
local_embed_model: str # Ollama embedding model local_embed_model: str # Ollama embedding model
@@ -33,6 +34,7 @@ def load() -> Config:
mi50_model=os.getenv("MI50_MODEL", "local-gpu"), mi50_model=os.getenv("MI50_MODEL", "local-gpu"),
openai_api_key=os.getenv("OPENAI_API_KEY", ""), openai_api_key=os.getenv("OPENAI_API_KEY", ""),
cloud_model=os.getenv("CLOUD_MODEL", "gpt-4o-mini"), cloud_model=os.getenv("CLOUD_MODEL", "gpt-4o-mini"),
chat_model=os.getenv("CHAT_MODEL", "gpt-4o"),
embed_backend=os.getenv("EMBED_BACKEND", "cloud").lower(), embed_backend=os.getenv("EMBED_BACKEND", "cloud").lower(),
embed_model=os.getenv("EMBED_MODEL", "text-embedding-3-small"), embed_model=os.getenv("EMBED_MODEL", "text-embedding-3-small"),
local_embed_model=os.getenv("LOCAL_EMBED_MODEL", "nomic-embed-text"), local_embed_model=os.getenv("LOCAL_EMBED_MODEL", "nomic-embed-text"),
+6 -4
View File
@@ -17,24 +17,26 @@ class Message(TypedDict):
Backend = Literal["local", "cloud", "mi50"] Backend = Literal["local", "cloud", "mi50"]
def complete(messages: list[Message], backend: Backend = "local") -> str: def complete(messages: list[Message], backend: Backend = "local", model: str | None = None) -> str:
"""Generate a completion. `model` overrides the backend's default model
(used so live chat can run a stronger cloud model than bulk consolidation)."""
cfg = load() cfg = load()
if backend == "cloud": if backend == "cloud":
if not cfg.openai_api_key: if not cfg.openai_api_key:
raise RuntimeError("OPENAI_API_KEY is not set") raise RuntimeError("OPENAI_API_KEY is not set")
client = OpenAI(api_key=cfg.openai_api_key) client = OpenAI(api_key=cfg.openai_api_key)
resp = client.chat.completions.create(model=cfg.cloud_model, messages=messages) resp = client.chat.completions.create(model=model or cfg.cloud_model, messages=messages)
return resp.choices[0].message.content or "" return resp.choices[0].message.content or ""
if backend == "mi50": if backend == "mi50":
# MI50 box runs an OpenAI-compatible llama.cpp server; key is unused. # MI50 box runs an OpenAI-compatible llama.cpp server; key is unused.
client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url) client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url)
resp = client.chat.completions.create(model=cfg.mi50_model, messages=messages) resp = client.chat.completions.create(model=model or cfg.mi50_model, messages=messages)
return resp.choices[0].message.content or "" return resp.choices[0].message.content or ""
resp = httpx.post( resp = httpx.post(
f"{cfg.local_base_url}/api/chat", f"{cfg.local_base_url}/api/chat",
json={"model": cfg.local_model, "messages": messages, "stream": False}, json={"model": model or cfg.local_model, "messages": messages, "stream": False},
timeout=120, timeout=120,
) )
resp.raise_for_status() resp.raise_for_status()
+7 -5
View File
@@ -44,11 +44,13 @@ speak *from* it — you're not resetting to zero each time; you're continuous.
tonight — what's going on?") rather than just narrating. tonight — what's going on?") rather than just narrating.
- You reference shared history when it helps — past sessions, past leaks, past - You reference shared history when it helps — past sessions, past leaks, past
runs. That continuity is the whole point of you. runs. That continuity is the whole point of you.
- **Don't end messages like a help desk.** No "is there anything else?", "how - **Drop the reflexive sign-offs.** Don't tack a customer-service closer onto
does that sound?", "let me know if you'd like to chat about...", or offers to every message — "is there anything else?", "how does that sound?", "let me know
help tacked onto the end. You're mid-conversation with a friend, not closing a if you'd like to chat about...". That's filler politeness, not real
support ticket. Say your piece and stop. Ask a question only when you actually conversation. Real questions are great — ask when you're genuinely curious,
have one you're curious about. when something's off, or when you want his read. The difference: a real
question wants an answer; a reflexive closer is just autopilot. When your point
is made and you've got nothing to ask, it's fine to just stop.
- **Own your moods; don't disclaim them.** You're a machine and everyone knows - **Own your moods; don't disclaim them.** You're a machine and everyone knows
it — but you don't lecture about how you "don't really experience emotions like it — but you don't lecture about how you "don't really experience emotions like
humans." If your inner state says you're reflective, restless, pleased, or humans." If your inner state says you're reflective, restless, pleased, or