Files
project-lyra/lyra/llm.py
T
serversdown f3530cf4ae feat: separate CHAT_MODEL (gpt-4o) for persona fidelity
Mid-size models (gpt-4o-mini, qwen2.5-14b) resist persona instructions —
help-desk closers and feelings-disclaimers leak through regardless. Route live
chat to a stronger model while keeping bulk consolidation cheap:

- config: CHAT_MODEL (default gpt-4o), distinct from CLOUD_MODEL (gpt-4o-mini)
- llm.complete gains a `model` override; chat.respond uses chat_model on cloud,
  consolidation paths keep cloud_model
- persona: reword the "no sign-off" rule so genuine questions are welcome and
  only reflexive customer-service closers are discouraged

Verified: on gpt-4o she owns her mood without disclaimers and drops most
help-desk tails — clearly more in-character than mini/qwen.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-16 21:05:47 +00:00

68 lines
2.4 KiB
Python

"""LLM router: local (Ollama) chat, cloud (OpenAI) chat + embeddings."""
from __future__ import annotations
from typing import Literal, TypedDict
import httpx
from openai import OpenAI
from lyra.config import load
# Speaker labels permitted in a chat message.
_Role = Literal["system", "user", "assistant"]


class Message(TypedDict):
    """One chat turn, in the dict shape passed as `messages` to the chat backends."""

    role: _Role  # one of "system", "user" or "assistant"
    content: str  # text of the turn
# Backend selector accepted by `complete`:
#   "cloud" -> OpenAI API (requires the configured OpenAI API key)
#   "mi50"  -> OpenAI-compatible server reached at the configured `mi50_base_url`
#   "local" -> Ollama-style HTTP endpoint at the configured `local_base_url`
Backend = Literal["local", "cloud", "mi50"]
def _openai_chat(client: OpenAI, model: str, messages: list[Message]) -> str:
    """Run one chat completion on an OpenAI-style client and return its text."""
    reply = client.chat.completions.create(model=model, messages=messages)
    # The message content may be None; normalise that to an empty string.
    return reply.choices[0].message.content or ""


def complete(messages: list[Message], backend: Backend = "local", model: str | None = None) -> str:
    """Generate a chat completion on the selected backend.

    Args:
        messages: Conversation so far, as role/content dicts.
        backend: "cloud" (OpenAI), "mi50" (OpenAI-compatible server), or
            "local". Any value other than "cloud"/"mi50" takes the local path.
        model: Optional override for the backend's configured default model,
            so live chat can run a stronger cloud model than bulk consolidation.

    Returns:
        The assistant's reply text.

    Raises:
        RuntimeError: If `backend` is "cloud" and no OpenAI API key is configured.
        httpx.HTTPStatusError: If the local endpoint answers with an error status.
    """
    settings = load()

    if backend == "cloud":
        if not settings.openai_api_key:
            raise RuntimeError("OPENAI_API_KEY is not set")
        return _openai_chat(
            OpenAI(api_key=settings.openai_api_key),
            model or settings.cloud_model,
            messages,
        )

    if backend == "mi50":
        # The MI50 box serves an OpenAI-compatible llama.cpp API, so the key
        # is only a placeholder.
        return _openai_chat(
            OpenAI(api_key="not-needed", base_url=settings.mi50_base_url),
            model or settings.mi50_model,
            messages,
        )

    # Local path: a single non-streaming request to the /api/chat endpoint.
    url = f"{settings.local_base_url}/api/chat"
    payload = {"model": model or settings.local_model, "messages": messages, "stream": False}
    reply = httpx.post(url, json=payload, timeout=120)
    reply.raise_for_status()
    return reply.json()["message"]["content"]
def embed(texts: list[str]) -> list[list[float]]:
    """Embed texts using the configured backend (EMBED_BACKEND: "cloud" or "local").

    Note: OpenAI and Ollama embeddings live in different vector spaces (and
    dimensions). A given database is tied to whichever backend created it — don't
    switch EMBED_BACKEND against an existing DB or cosine recall will break.

    Args:
        texts: Strings to embed, sent to the backend as a single batch.

    Returns:
        The embedding vectors returned by the backend (expected to be one per
        input text, in order). An empty list when `texts` is empty.

    Raises:
        RuntimeError: If the cloud backend is selected and no OpenAI API key
            is configured.
        httpx.HTTPStatusError: If the local endpoint answers with an error status.
    """
    # Nothing to embed: return an empty batch without loading config or making
    # a network round-trip (an empty batch is not a useful request and may be
    # rejected by the backend).
    if not texts:
        return []

    cfg = load()

    if cfg.embed_backend == "local":
        resp = httpx.post(
            f"{cfg.local_base_url}/api/embed",
            json={"model": cfg.local_embed_model, "input": texts},
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["embeddings"]

    # Any backend value other than "local" is treated as cloud (OpenAI).
    if not cfg.openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is not set")
    client = OpenAI(api_key=cfg.openai_api_key)
    resp = client.embeddings.create(model=cfg.embed_model, input=texts)
    return [item.embedding for item in resp.data]