Major rewire, all modules connected. Intake still wonky.
@@ -1,102 +1,114 @@
# llm_router.py
import os
import requests
import json

# ---------------------------------------------
# Load backend definition from .env
# ---------------------------------------------
# ------------------------------------------------------------
# Load backend registry from root .env
# ------------------------------------------------------------

def load_backend_config(name: str):
    """
    Given a backend name like 'PRIMARY' or 'OPENAI',
    load the matching provider / url / model from env.
    """
BACKENDS = {
    "PRIMARY": {
        "provider": os.getenv("LLM_PRIMARY_PROVIDER", "").lower(),
        "url": os.getenv("LLM_PRIMARY_URL", ""),
        "model": os.getenv("LLM_PRIMARY_MODEL", "")
    },
    "SECONDARY": {
        "provider": os.getenv("LLM_SECONDARY_PROVIDER", "").lower(),
        "url": os.getenv("LLM_SECONDARY_URL", ""),
        "model": os.getenv("LLM_SECONDARY_MODEL", "")
    },
    "OPENAI": {
        "provider": os.getenv("LLM_OPENAI_PROVIDER", "").lower(),
        "url": os.getenv("LLM_OPENAI_URL", ""),
        "model": os.getenv("LLM_OPENAI_MODEL", ""),
        "api_key": os.getenv("OPENAI_API_KEY", "")
    },
    "FALLBACK": {
        "provider": os.getenv("LLM_FALLBACK_PROVIDER", "").lower(),
        "url": os.getenv("LLM_FALLBACK_URL", ""),
        "model": os.getenv("LLM_FALLBACK_MODEL", "")
    },
}

    prefix = f"LLM_{name.upper()}"

    provider = os.getenv(f"{prefix}_PROVIDER")
    url = os.getenv(f"{prefix}_URL")
    model = os.getenv(f"{prefix}_MODEL")

    if not provider or not url or not model:
        raise RuntimeError(
            f"Backend '{name}' is missing configuration. "
            f"Expected {prefix}_PROVIDER / URL / MODEL in .env"
        )

    return provider, url.rstrip("/"), model
DEFAULT_BACKEND = "PRIMARY"
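# For reference, the registry above reads LLM_<NAME>_PROVIDER / _URL / _MODEL keys
# from the root .env (plus OPENAI_API_KEY and LLM_TEMPERATURE). The values below are
# illustrative placeholders, not values from the commit; SECONDARY and FALLBACK
# follow the same pattern:
#
#   LLM_PRIMARY_PROVIDER=vllm
#   LLM_PRIMARY_URL=http://localhost:8000
#   LLM_PRIMARY_MODEL=your-vllm-model
#   LLM_OPENAI_PROVIDER=openai
#   LLM_OPENAI_URL=https://api.openai.com/v1
#   LLM_OPENAI_MODEL=your-openai-model
#   OPENAI_API_KEY=your-openai-key
#   LLM_TEMPERATURE=0.7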


# ---------------------------------------------
# Core call_llm() — fail hard, no fallback
# ---------------------------------------------
# ------------------------------------------------------------
# Public call
# ------------------------------------------------------------
async def call_llm(
    prompt: str,
    backend: str | None = None,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    backend = (backend or DEFAULT_BACKEND).upper()

def call_llm(prompt: str, backend_env_var: str):
    """
    Example:
        call_llm(prompt, backend_env_var="CORTEX_LLM")
    if backend not in BACKENDS:
        raise RuntimeError(f"Unknown backend '{backend}'")

    backend_env_var should contain one of:
        PRIMARY, SECONDARY, OPENAI, FALLBACK, etc
    """
    cfg = BACKENDS[backend]
    provider = cfg["provider"]
    url = cfg["url"]
    model = cfg["model"]

    backend_name = os.getenv(backend_env_var)
    if not backend_name:
        raise RuntimeError(f"{backend_env_var} is not set in .env")

    provider, base_url, model = load_backend_config(backend_name)

    # ---------------------------------------------
    # Provider-specific behavior
    # ---------------------------------------------
    if not url or not model:
        raise RuntimeError(f"Backend '{backend}' missing url/model in env")

    # -------------------------------
    # Provider: VLLM (your MI50)
    # -------------------------------
    if provider == "vllm":
        # vLLM OpenAI-compatible API
        response = requests.post(
            f"{base_url}/v1/completions",
            json={
                "model": model,
                "prompt": prompt,
                "max_tokens": 1024,
                "temperature": float(os.getenv("LLM_TEMPERATURE", "0.7"))
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature
        }
        r = requests.post(url, json=payload, timeout=120)
        data = r.json()
        return data["choices"][0]["text"]

    elif provider == "ollama":
        response = requests.post(
            f"{base_url}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    # -------------------------------
    # Provider: OLLAMA (your 3090)
    # -------------------------------
    if provider == "ollama":
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "stream": False # <-- critical fix
        }

        r = requests.post(f"{url}/api/chat", json=payload, timeout=120)
        data = r.json()

        return data["message"]["content"]

    elif provider == "openai":
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY missing but provider=openai was selected")

        response = requests.post(
            f"{base_url}/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": float(os.getenv("LLM_TEMPERATURE", "0.7"))
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    # -------------------------------
    # Provider: OPENAI
    # -------------------------------
    if provider == "openai":
        headers = {
            "Authorization": f"Bearer {cfg['api_key']}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120)
        data = r.json()
        return data["choices"][0]["message"]["content"]

    else:
        raise RuntimeError(f"Unknown LLM provider: {provider}")
    # -------------------------------
    # Unknown provider
    # -------------------------------
    raise RuntimeError(f"Provider '{provider}' not implemented.")

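A minimal usage sketch (not part of the commit), assuming the new async signature above and a PRIMARY backend configured in the root .env; the demo() wrapper and prompt text are illustrative only:

import asyncio
from llm.llm_router import call_llm

async def demo():
    # Omitting backend would fall back to DEFAULT_BACKEND ("PRIMARY").
    answer = await call_llm(
        "Say hello from the router.",
        backend="PRIMARY",
        temperature=0.2,
        max_tokens=128,
    )
    print(answer)

asyncio.run(demo())
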
@@ -8,22 +8,22 @@ from llm.llm_router import call_llm

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================
# ===============================================
# Configuration
# ===============================================

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

# Module-level backend selection
REFINE_LLM = os.getenv("REFINE_LLM", "PRIMARY").upper()
# These come from root .env
REFINE_LLM = os.getenv("REFINE_LLM", "").upper()
CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()
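# REFINE_LLM and CORTEX_LLM are expected to name one of the backends registered in
# llm_router (PRIMARY, SECONDARY, OPENAI, FALLBACK). Illustrative root .env entries
# (the numeric values shown are just the defaults above):
#
#   CORTEX_LLM=PRIMARY
#   REFINE_LLM=SECONDARY
#   REFINER_TEMPERATURE=0.3
#   REFINER_MAX_TOKENS=768
#   REFINER_DEBUG=false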


# ============================================================
# ===============================================
# Prompt builder
# ============================================================
# ===============================================

def build_refine_prompt(
    draft_output: str,
@@ -32,15 +32,10 @@ def build_refine_prompt(
    rag_block: Optional[str],
) -> str:

    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)
    try:
        reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
    except Exception:
        reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"
@@ -49,12 +44,13 @@ def build_refine_prompt(
You are Lyra Cortex's internal refiner.

Your job:
- Fix factual errors, logical gaps, or missing info.
- Use reflection notes for corrections.
- Use RAG context as factual grounding.
- Respect the identity block without adding style or personality.
- Fix factual issues.
- Improve clarity.
- Apply reflection notes when helpful.
- Respect identity constraints.
- Apply RAG context as truth source.

Never mention RAG, reflection, or internal logic.
Do NOT mention RAG, reflection, internal logic, or this refinement step.

------------------------------
[IDENTITY BLOCK]
@@ -74,14 +70,14 @@ Never mention RAG, reflection, or internal logic.

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer.
Rewrite the DRAFT into a single final answer for the user.
Return ONLY the final answer text.
""".strip()


# ============================================================
# Public API: async, using llm_router
# ============================================================
# ===============================================
# Public API — now async & fully router-based
# ===============================================

async def refine_answer(
    draft_output: str,
@@ -104,7 +100,7 @@ async def refine_answer(
        rag_block,
    )

    # Refinement backend → fallback to Cortex backend → fallback to PRIMARY
    # backend priority: REFINE_LLM → CORTEX_LLM → PRIMARY
    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"
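    # With the new default above, an unset REFINE_LLM is "" (falsy), so the refiner
    # reuses CORTEX_LLM (e.g. "SECONDARY"); if neither is set it falls back to "PRIMARY".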

    try:

@@ -1,5 +1,6 @@
# router.py

from unittest import result
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

@@ -55,7 +56,7 @@ async def run_reason(req: ReasonRequest):
    )

    # 4. Refinement
    result = refine_answer(
    result = await refine_answer(
        draft_output=draft,
        reflection_notes=reflection_notes,
        identity_block=None,
@@ -63,6 +64,7 @@ async def run_reason(req: ReasonRequest):
    )
    final_neutral = result["final_output"]


    # 5. Persona layer
    persona_answer = await speak(final_neutral)