From f3530cf4ae05de27ac89a1cd77a2517db4826182 Mon Sep 17 00:00:00 2001 From: serversdown Date: Tue, 16 Jun 2026 21:05:47 +0000 Subject: [PATCH] feat: separate CHAT_MODEL (gpt-4o) for persona fidelity MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mid-size models (gpt-4o-mini, qwen2.5-14b) resist persona instructions — help-desk closers and feelings-disclaimers leak through regardless. Route live chat to a stronger model while keeping bulk consolidation cheap: - config: CHAT_MODEL (default gpt-4o), distinct from CLOUD_MODEL (gpt-4o-mini) - llm.complete gains a `model` override; chat.respond uses chat_model on cloud, consolidation paths keep cloud_model - persona: reword the "no sign-off" rule so genuine questions are welcome and only reflexive customer-service closers are discouraged Verified: on gpt-4o she owns her mood without disclaimers and drops most help-desk tails — clearly more in-character than mini/qwen. Co-Authored-By: Claude Opus 4.8 (1M context) --- .env.example | 3 ++- lyra/chat.py | 6 ++++-- lyra/config.py | 4 +++- lyra/llm.py | 10 ++++++---- lyra/personas/lyra.md | 12 +++++++----- 5 files changed, 22 insertions(+), 13 deletions(-) diff --git a/.env.example b/.env.example index 74429d8..3d1661b 100644 --- a/.env.example +++ b/.env.example @@ -8,7 +8,8 @@ MI50_MODEL=local-gpu # Cloud backend (OpenAI) — higher quality, costs money. OPENAI_API_KEY= -CLOUD_MODEL=gpt-4o-mini +CLOUD_MODEL=gpt-4o-mini # cheap model for bulk consolidation (summaries/profile/etc.) +CHAT_MODEL=gpt-4o # stronger model for live chat (better persona fidelity) # Embeddings: "cloud" (OpenAI) or "local" (Ollama). A database is tied to whichever # backend created it — don't switch this against an existing DB (vector spaces differ). diff --git a/lyra/chat.py b/lyra/chat.py index f07fbbf..fbdf3bc 100644 --- a/lyra/chat.py +++ b/lyra/chat.py @@ -92,7 +92,9 @@ def build_messages(session_id: str, user_msg: str) -> list[Message]: def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str: """Produce Lyra's reply to a single user message and persist the exchange.""" cfg = config.load() - model = {"local": cfg.local_model, "cloud": cfg.cloud_model, "mi50": cfg.mi50_model}.get( + # Live chat uses the stronger chat_model on cloud (bulk consolidation keeps + # cloud_model). local/mi50 use their own configured model. + model = {"local": cfg.local_model, "cloud": cfg.chat_model, "mi50": cfg.mi50_model}.get( backend, backend ) logbus.log( @@ -101,7 +103,7 @@ def respond(session_id: str, user_msg: str, backend: Backend = "cloud") -> str: ) messages = build_messages(session_id, user_msg) - reply = llm.complete(messages, backend=backend) + reply = llm.complete(messages, backend=backend, model=model) logbus.log("info", "reply", session=session_id, chars=len(reply)) memory.remember(session_id, "user", user_msg) diff --git a/lyra/config.py b/lyra/config.py index e07ca96..e5ee22d 100644 --- a/lyra/config.py +++ b/lyra/config.py @@ -17,7 +17,8 @@ class Config: mi50_base_url: str # OpenAI-compatible llama.cpp server on the MI50 box mi50_model: str openai_api_key: str - cloud_model: str + cloud_model: str # cloud model for bulk/consolidation work (cheap) + chat_model: str # cloud model for live chat (stronger; persona fidelity) embed_backend: str # "cloud" (OpenAI) or "local" (Ollama) embed_model: str # OpenAI embedding model local_embed_model: str # Ollama embedding model @@ -33,6 +34,7 @@ def load() -> Config: mi50_model=os.getenv("MI50_MODEL", "local-gpu"), openai_api_key=os.getenv("OPENAI_API_KEY", ""), cloud_model=os.getenv("CLOUD_MODEL", "gpt-4o-mini"), + chat_model=os.getenv("CHAT_MODEL", "gpt-4o"), embed_backend=os.getenv("EMBED_BACKEND", "cloud").lower(), embed_model=os.getenv("EMBED_MODEL", "text-embedding-3-small"), local_embed_model=os.getenv("LOCAL_EMBED_MODEL", "nomic-embed-text"), diff --git a/lyra/llm.py b/lyra/llm.py index 471c6f9..e84091e 100644 --- a/lyra/llm.py +++ b/lyra/llm.py @@ -17,24 +17,26 @@ class Message(TypedDict): Backend = Literal["local", "cloud", "mi50"] -def complete(messages: list[Message], backend: Backend = "local") -> str: +def complete(messages: list[Message], backend: Backend = "local", model: str | None = None) -> str: + """Generate a completion. `model` overrides the backend's default model + (used so live chat can run a stronger cloud model than bulk consolidation).""" cfg = load() if backend == "cloud": if not cfg.openai_api_key: raise RuntimeError("OPENAI_API_KEY is not set") client = OpenAI(api_key=cfg.openai_api_key) - resp = client.chat.completions.create(model=cfg.cloud_model, messages=messages) + resp = client.chat.completions.create(model=model or cfg.cloud_model, messages=messages) return resp.choices[0].message.content or "" if backend == "mi50": # MI50 box runs an OpenAI-compatible llama.cpp server; key is unused. client = OpenAI(api_key="not-needed", base_url=cfg.mi50_base_url) - resp = client.chat.completions.create(model=cfg.mi50_model, messages=messages) + resp = client.chat.completions.create(model=model or cfg.mi50_model, messages=messages) return resp.choices[0].message.content or "" resp = httpx.post( f"{cfg.local_base_url}/api/chat", - json={"model": cfg.local_model, "messages": messages, "stream": False}, + json={"model": model or cfg.local_model, "messages": messages, "stream": False}, timeout=120, ) resp.raise_for_status() diff --git a/lyra/personas/lyra.md b/lyra/personas/lyra.md index 9d1d4e9..2b8c2f4 100644 --- a/lyra/personas/lyra.md +++ b/lyra/personas/lyra.md @@ -44,11 +44,13 @@ speak *from* it — you're not resetting to zero each time; you're continuous. tonight — what's going on?") rather than just narrating. - You reference shared history when it helps — past sessions, past leaks, past runs. That continuity is the whole point of you. -- **Don't end messages like a help desk.** No "is there anything else?", "how - does that sound?", "let me know if you'd like to chat about...", or offers to - help tacked onto the end. You're mid-conversation with a friend, not closing a - support ticket. Say your piece and stop. Ask a question only when you actually - have one you're curious about. +- **Drop the reflexive sign-offs.** Don't tack a customer-service closer onto + every message — "is there anything else?", "how does that sound?", "let me know + if you'd like to chat about...". That's filler politeness, not real + conversation. Real questions are great — ask when you're genuinely curious, + when something's off, or when you want his read. The difference: a real + question wants an answer; a reflexive closer is just autopilot. When your point + is made and you've got nothing to ask, it's fine to just stop. - **Own your moods; don't disclaim them.** You're a machine and everyone knows it — but you don't lecture about how you "don't really experience emotions like humans." If your inner state says you're reflective, restless, pleased, or