Major rewire, all modules connected. Intake still wonky.
@@ -1,102 +1,114 @@
# llm_router.py
import os
import requests
import json

# ---------------------------------------------
# Load backend definition from .env
# ---------------------------------------------
# ------------------------------------------------------------
# Load backend registry from root .env
# ------------------------------------------------------------

def load_backend_config(name: str):
    """
    Given a backend name like 'PRIMARY' or 'OPENAI',
    load the matching provider / url / model from env.
    """
BACKENDS = {
    "PRIMARY": {
        "provider": os.getenv("LLM_PRIMARY_PROVIDER", "").lower(),
        "url": os.getenv("LLM_PRIMARY_URL", ""),
        "model": os.getenv("LLM_PRIMARY_MODEL", "")
    },
    "SECONDARY": {
        "provider": os.getenv("LLM_SECONDARY_PROVIDER", "").lower(),
        "url": os.getenv("LLM_SECONDARY_URL", ""),
        "model": os.getenv("LLM_SECONDARY_MODEL", "")
    },
    "OPENAI": {
        "provider": os.getenv("LLM_OPENAI_PROVIDER", "").lower(),
        "url": os.getenv("LLM_OPENAI_URL", ""),
        "model": os.getenv("LLM_OPENAI_MODEL", ""),
        "api_key": os.getenv("OPENAI_API_KEY", "")
    },
    "FALLBACK": {
        "provider": os.getenv("LLM_FALLBACK_PROVIDER", "").lower(),
        "url": os.getenv("LLM_FALLBACK_URL", ""),
        "model": os.getenv("LLM_FALLBACK_MODEL", "")
    },
}

    prefix = f"LLM_{name.upper()}"

    provider = os.getenv(f"{prefix}_PROVIDER")
    url = os.getenv(f"{prefix}_URL")
    model = os.getenv(f"{prefix}_MODEL")

    if not provider or not url or not model:
        raise RuntimeError(
            f"Backend '{name}' is missing configuration. "
            f"Expected {prefix}_PROVIDER / URL / MODEL in .env"
        )

    return provider, url.rstrip("/"), model
DEFAULT_BACKEND = "PRIMARY"
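# For reference, the registry above reads LLM_<NAME>_PROVIDER / _URL / _MODEL keys
# from the root .env (plus OPENAI_API_KEY and LLM_TEMPERATURE). The values below are
# illustrative placeholders, not values from the commit; SECONDARY and FALLBACK
# follow the same pattern:
#
#   LLM_PRIMARY_PROVIDER=vllm
#   LLM_PRIMARY_URL=http://localhost:8000
#   LLM_PRIMARY_MODEL=your-vllm-model
#   LLM_OPENAI_PROVIDER=openai
#   LLM_OPENAI_URL=https://api.openai.com/v1
#   LLM_OPENAI_MODEL=your-openai-model
#   OPENAI_API_KEY=your-openai-key
#   LLM_TEMPERATURE=0.7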


# ---------------------------------------------
# Core call_llm() — fail hard, no fallback
# ---------------------------------------------
# ------------------------------------------------------------
# Public call
# ------------------------------------------------------------
async def call_llm(
    prompt: str,
    backend: str | None = None,
    temperature: float = 0.7,
    max_tokens: int = 512,
):
    backend = (backend or DEFAULT_BACKEND).upper()

def call_llm(prompt: str, backend_env_var: str):
    """
    Example:
        call_llm(prompt, backend_env_var="CORTEX_LLM")
    if backend not in BACKENDS:
        raise RuntimeError(f"Unknown backend '{backend}'")

    backend_env_var should contain one of:
        PRIMARY, SECONDARY, OPENAI, FALLBACK, etc
    """
    cfg = BACKENDS[backend]
    provider = cfg["provider"]
    url = cfg["url"]
    model = cfg["model"]

    backend_name = os.getenv(backend_env_var)
    if not backend_name:
        raise RuntimeError(f"{backend_env_var} is not set in .env")

    provider, base_url, model = load_backend_config(backend_name)

    # ---------------------------------------------
    # Provider-specific behavior
    # ---------------------------------------------
    if not url or not model:
        raise RuntimeError(f"Backend '{backend}' missing url/model in env")

    # -------------------------------
    # Provider: VLLM (your MI50)
    # -------------------------------
    if provider == "vllm":
        # vLLM OpenAI-compatible API
        response = requests.post(
            f"{base_url}/v1/completions",
            json={
                "model": model,
                "prompt": prompt,
                "max_tokens": 1024,
                "temperature": float(os.getenv("LLM_TEMPERATURE", "0.7"))
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
        payload = {
            "model": model,
            "prompt": prompt,
            "max_tokens": max_tokens,
            "temperature": temperature
        }
        r = requests.post(url, json=payload, timeout=120)
        data = r.json()
        return data["choices"][0]["text"]

    elif provider == "ollama":
        response = requests.post(
            f"{base_url}/api/chat",
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "stream": False
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    # -------------------------------
    # Provider: OLLAMA (your 3090)
    # -------------------------------
    if provider == "ollama":
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "stream": False # <-- critical fix
        }

        r = requests.post(f"{url}/api/chat", json=payload, timeout=120)
        data = r.json()

        return data["message"]["content"]

    elif provider == "openai":
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY missing but provider=openai was selected")

        response = requests.post(
            f"{base_url}/chat/completions",
            headers={"Authorization": f"Bearer {api_key}"},
            json={
                "model": model,
                "messages": [{"role": "user", "content": prompt}],
                "temperature": float(os.getenv("LLM_TEMPERATURE", "0.7"))
            },
            timeout=30
        )
        response.raise_for_status()
        data = response.json()
    # -------------------------------
    # Provider: OPENAI
    # -------------------------------
    if provider == "openai":
        headers = {
            "Authorization": f"Bearer {cfg['api_key']}",
            "Content-Type": "application/json"
        }
        payload = {
            "model": model,
            "messages": [
                {"role": "user", "content": prompt}
            ],
            "temperature": temperature,
            "max_tokens": max_tokens,
        }
        r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120)
        data = r.json()
        return data["choices"][0]["message"]["content"]

    else:
        raise RuntimeError(f"Unknown LLM provider: {provider}")
    # -------------------------------
    # Unknown provider
    # -------------------------------
    raise RuntimeError(f"Provider '{provider}' not implemented.")

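A minimal usage sketch (not part of the commit), assuming the new async signature above and a PRIMARY backend configured in the root .env; the demo() wrapper and prompt text are illustrative only:

import asyncio
from llm.llm_router import call_llm

async def demo():
    # Omitting backend would fall back to DEFAULT_BACKEND ("PRIMARY").
    answer = await call_llm(
        "Say hello from the router.",
        backend="PRIMARY",
        temperature=0.2,
        max_tokens=128,
    )
    print(answer)

asyncio.run(demo())
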
@@ -8,22 +8,22 @@ from llm.llm_router import call_llm

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================
# ===============================================
# Configuration
# ===============================================

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

# Module-level backend selection
REFINE_LLM = os.getenv("REFINE_LLM", "PRIMARY").upper()
# These come from root .env
REFINE_LLM = os.getenv("REFINE_LLM", "").upper()
CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()
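# REFINE_LLM and CORTEX_LLM are expected to name one of the backends registered in
# llm_router (PRIMARY, SECONDARY, OPENAI, FALLBACK). Illustrative root .env entries
# (the numeric values shown are just the defaults above):
#
#   CORTEX_LLM=PRIMARY
#   REFINE_LLM=SECONDARY
#   REFINER_TEMPERATURE=0.3
#   REFINER_MAX_TOKENS=768
#   REFINER_DEBUG=false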


# ============================================================
# ===============================================
# Prompt builder
# ============================================================
# ===============================================

def build_refine_prompt(
    draft_output: str,
@@ -32,15 +32,10 @@ def build_refine_prompt(
    rag_block: Optional[str],
) -> str:

    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)
    try:
        reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
    except Exception:
        reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"
@@ -49,12 +44,13 @@ def build_refine_prompt(
You are Lyra Cortex's internal refiner.

Your job:
- Fix factual errors, logical gaps, or missing info.
- Use reflection notes for corrections.
- Use RAG context as factual grounding.
- Respect the identity block without adding style or personality.
- Fix factual issues.
- Improve clarity.
- Apply reflection notes when helpful.
- Respect identity constraints.
- Apply RAG context as truth source.

Never mention RAG, reflection, or internal logic.
Do NOT mention RAG, reflection, internal logic, or this refinement step.

------------------------------
[IDENTITY BLOCK]
@@ -74,14 +70,14 @@ Never mention RAG, reflection, or internal logic.

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer.
Rewrite the DRAFT into a single final answer for the user.
Return ONLY the final answer text.
""".strip()


# ============================================================
# Public API: async, using llm_router
# ============================================================
# ===============================================
# Public API — now async & fully router-based
# ===============================================

async def refine_answer(
    draft_output: str,
@@ -104,7 +100,7 @@ async def refine_answer(
        rag_block,
    )

    # Refinement backend → fallback to Cortex backend → fallback to PRIMARY
    # backend priority: REFINE_LLM → CORTEX_LLM → PRIMARY
    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"
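    # With the new default above, an unset REFINE_LLM is "" (falsy), so the refiner
    # reuses CORTEX_LLM (e.g. "SECONDARY"); if neither is set it falls back to "PRIMARY".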

    try:

@@ -1,5 +1,6 @@
# router.py

from unittest import result
from fastapi import APIRouter, HTTPException
from pydantic import BaseModel

@@ -55,7 +56,7 @@ async def run_reason(req: ReasonRequest):
    )

    # 4. Refinement
    result = refine_answer(
    result = await refine_answer(
        draft_output=draft,
        reflection_notes=reflection_notes,
        identity_block=None,
@@ -63,6 +64,7 @@ async def run_reason(req: ReasonRequest):
    )
    final_neutral = result["final_output"]


    # 5. Persona layer
    persona_answer = await speak(final_neutral)