30185f3fd8
The MI50 box (CT202) runs an OpenAI-compatible llama.cpp server on 10.0.0.44:8080. Wire it in as a third backend: - llm.complete gains backend="mi50" (OpenAI client pointed at MI50_BASE_URL) - config: MI50_BASE_URL (default http://10.0.0.44:8080/v1) + MI50_MODEL - chat.respond labels the model per backend; web _backend_for maps "mi50" - UI backend selector adds "MI50 — local GPU" Verified end-to-end: llm.complete(backend="mi50") returns from the live server. See homelab-inference memory for the box topology. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
66 lines
2.2 KiB
Python
66 lines
2.2 KiB
Python
"""LLM router: local (Ollama) chat, cloud (OpenAI) chat + embeddings."""
|
|
from __future__ import annotations
|
|
|
|
from typing import Literal, TypedDict
|
|
|
|
import httpx
|
|
from openai import OpenAI
|
|
|
|
from lyra.config import load
|
|
|
|
|
|
# One chat turn in the shape passed to `complete`: a speaker role plus its text.
Message = TypedDict(
    "Message",
    {
        # Who produced this turn.
        "role": Literal["system", "user", "assistant"],
        # The text of the turn.
        "content": str,
    },
)
|
|
|
|
|
|
# Names of the chat backends that `complete` can route a request to.
Backend = Literal[
    "local",
    "cloud",
    "mi50",
]
|
|
|
|
|
|
def complete(messages: list[Message], backend: Backend = "local") -> str:
    """Send a chat conversation to the chosen backend and return the reply text.

    Args:
        messages: Chat messages, forwarded unchanged to the backend.
        backend: ``"cloud"`` uses the OpenAI client with the configured API
            key and cloud model; ``"mi50"`` uses the OpenAI client pointed at
            the configured MI50 base URL; any other value posts to the local
            server's ``/api/chat`` endpoint.

    Returns:
        The assistant reply. For the OpenAI-compatible backends a missing or
        empty reply is returned as ``""``.

    Raises:
        RuntimeError: If ``backend`` is ``"cloud"`` and no OpenAI API key is
            configured.
    """
    settings = load()

    if backend in ("cloud", "mi50"):
        if backend == "cloud":
            if not settings.openai_api_key:
                raise RuntimeError("OPENAI_API_KEY is not set")
            api = OpenAI(api_key=settings.openai_api_key)
            model_name = settings.cloud_model
        else:
            # MI50 box runs an OpenAI-compatible llama.cpp server; key is unused.
            api = OpenAI(api_key="not-needed", base_url=settings.mi50_base_url)
            model_name = settings.mi50_model

        completion = api.chat.completions.create(model=model_name, messages=messages)
        reply_text = completion.choices[0].message.content
        return reply_text or ""

    # Everything else goes to the local server with a non-streaming request.
    payload = {"model": settings.local_model, "messages": messages, "stream": False}
    reply = httpx.post(
        f"{settings.local_base_url}/api/chat",
        json=payload,
        timeout=120,
    )
    reply.raise_for_status()
    body = reply.json()
    return body["message"]["content"]
|
|
|
|
|
|
def embed(texts: list[str]) -> list[list[float]]:
    """Embed texts using the configured backend (EMBED_BACKEND: "cloud" or "local").

    Note: OpenAI and Ollama embeddings live in different vector spaces (and
    dimensions). A given database is tied to whichever backend created it — don't
    switch EMBED_BACKEND against an existing DB or cosine recall will break.

    Args:
        texts: Strings to embed. An empty list returns ``[]`` immediately,
            without loading config or contacting any backend.

    Returns:
        One embedding vector per input text, as returned by the backend.

    Raises:
        RuntimeError: If the cloud backend is selected and no OpenAI API key
            is configured.
        httpx.HTTPStatusError: If the local server answers with an error status.
    """
    if not texts:
        # Nothing to embed: skip the round trip (the OpenAI embeddings
        # endpoint rejects an empty input array).
        return []

    cfg = load()

    if cfg.embed_backend == "local":
        resp = httpx.post(
            f"{cfg.local_base_url}/api/embed",
            json={"model": cfg.local_embed_model, "input": texts},
            timeout=120,
        )
        resp.raise_for_status()
        return resp.json()["embeddings"]

    # Any value other than "local" is served by OpenAI.
    if not cfg.openai_api_key:
        raise RuntimeError("OPENAI_API_KEY is not set")
    client = OpenAI(api_key=cfg.openai_api_key)
    resp = client.embeddings.create(model=cfg.embed_model, input=texts)
    return [d.embedding for d in resp.data]
|