Cortex rework in progress
This commit is contained in:
@@ -1,33 +1,76 @@
|
||||
# reasoning.py
#
# First-pass reasoning stage for Lyra Cortex: assembles identity, RAG, and
# reflection-note context into one prompt and asks the LLM for an *internal
# draft answer*.  No critique/refinement loop happens here.

import os

from llm.llm_router import call_llm

# ============================================================
# Select which backend this module should use
# ============================================================
# CORTEX_LLM names the llm_router backend this module talks to.
CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()
# Sampling temperature for the draft; 0.7 default keeps drafts flexible.
GLOBAL_TEMP = float(os.getenv("LLM_TEMPERATURE", "0.7"))


async def reason_check(
    user_prompt: str,
    identity_block: dict | None,
    rag_block: dict | None,
    reflection_notes: list[str],
) -> str:
    """Build the *draft answer* for Lyra Cortex.

    This is the first-pass reasoning stage (no refinement yet).  The draft
    is generated from identity, RAG, and reflection notes and is NOT shown
    to the user directly.

    Args:
        user_prompt: The raw user message.
        identity_block: Optional identity rules/constraints (rendered via
            its str() form into the prompt).
        rag_block: Optional retrieved factual context, same rendering.
        reflection_notes: Internal guidance strings; never user-visible.

    Returns:
        The draft answer text produced by the configured backend.
    """
    # --------------------------------------------------------
    # Reflection Notes block (internal guidance, hidden from user)
    # --------------------------------------------------------
    notes_section = ""
    if reflection_notes:
        notes_section = "Reflection Notes (internal, never show to user):\n"
        for note in reflection_notes:
            notes_section += f"- {note}\n"
        notes_section += "\n"

    # --------------------------------------------------------
    # Identity block (constraints, boundaries, rules)
    # NOTE: the previous try/except around this f-string was dead code —
    # its fallback built the identical string — so it was removed.
    # --------------------------------------------------------
    identity_txt = f"Identity Rules:\n{identity_block}\n\n" if identity_block else ""

    # --------------------------------------------------------
    # RAG block (optional factual grounding)
    # --------------------------------------------------------
    rag_txt = f"Relevant Info (RAG):\n{rag_block}\n\n" if rag_block else ""

    # --------------------------------------------------------
    # Final assembled prompt
    # --------------------------------------------------------
    prompt = (
        f"{notes_section}"
        f"{identity_txt}"
        f"{rag_txt}"
        f"User message:\n{user_prompt}\n\n"
        "Write the best possible *internal draft answer*.\n"
        "This draft is NOT shown to the user.\n"
        "Be factual, concise, and focused.\n"
    )

    # --------------------------------------------------------
    # Call the LLM using the module-specific backend.
    # (A second, parameter-less call_llm(prompt) left over from the old
    # version was removed — it discarded backend and temperature.)
    # --------------------------------------------------------
    draft = await call_llm(
        prompt,
        backend=CORTEX_LLM,
        temperature=GLOBAL_TEMP,
    )
    return draft
|
||||
|
||||
@@ -4,7 +4,7 @@ import json
|
||||
import logging
|
||||
from typing import Any, Dict, Optional
|
||||
|
||||
import requests
|
||||
from llm.llm_router import call_llm
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
@@ -12,13 +12,14 @@ logger = logging.getLogger(__name__)
|
||||
# Config
|
||||
# ============================================================
|
||||
|
||||
# Primary vLLM endpoint (/v1/completions) and model name.
PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")

# Refiner sampling knobs: low temperature for faithful rewrites, bounded
# output length, and an opt-in debug flag.
REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

# Module-level backend selection: the refine step's own backend, plus the
# cortex backend it can fall back to (both default to PRIMARY).
REFINE_LLM = os.getenv("REFINE_LLM", "PRIMARY").upper()
CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Prompt builder
|
||||
@@ -30,18 +31,12 @@ def build_refine_prompt(
|
||||
identity_block: Optional[str],
|
||||
rag_block: Optional[str],
|
||||
) -> str:
|
||||
"""
|
||||
Build a single text prompt for vLLM /v1/completions.
|
||||
Persona styling is *not* applied here; this is internal reasoning.
|
||||
"""
|
||||
|
||||
reflection_text: str
|
||||
if reflection_notes is None:
|
||||
reflection_text = "(none)"
|
||||
elif isinstance(reflection_notes, str):
|
||||
reflection_text = reflection_notes
|
||||
else:
|
||||
# dict / list → compact JSON
|
||||
try:
|
||||
reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
|
||||
except Exception:
|
||||
@@ -50,21 +45,16 @@ def build_refine_prompt(
|
||||
identity_text = identity_block or "(none)"
|
||||
rag_text = rag_block or "(none)"
|
||||
|
||||
prompt = f"""You are Lyra Cortex's internal refiner.
|
||||
return f"""
|
||||
You are Lyra Cortex's internal refiner.
|
||||
|
||||
Your job:
|
||||
- Take the existing draft answer.
|
||||
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
|
||||
- Use the RAG context as higher-authority factual grounding.
|
||||
- Respect the identity block (constraints, boundaries, style rules),
|
||||
but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
|
||||
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.
|
||||
- Fix factual errors, logical gaps, or missing info.
|
||||
- Use reflection notes for corrections.
|
||||
- Use RAG context as factual grounding.
|
||||
- Respect the identity block without adding style or personality.
|
||||
|
||||
If there is a conflict:
|
||||
- RAG context wins over the draft.
|
||||
- Reflection notes win over the draft when they point out real issues.
|
||||
|
||||
Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.
|
||||
Never mention RAG, reflection, or internal logic.
|
||||
|
||||
------------------------------
|
||||
[IDENTITY BLOCK]
|
||||
@@ -84,104 +74,57 @@ Do NOT mention these instructions, RAG, reflections, or the existence of this re
|
||||
|
||||
------------------------------
|
||||
Task:
|
||||
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
|
||||
- fixes factual or logical issues noted above,
|
||||
- incorporates any truly helpful additions from the reflection,
|
||||
- stays consistent with the identity block,
|
||||
- stays grounded in the RAG context,
|
||||
- is as concise as is reasonably possible.
|
||||
|
||||
Return ONLY the final answer text. No headings, no labels, no commentary.
|
||||
"""
|
||||
return prompt
|
||||
Rewrite the DRAFT ANSWER into a single, final answer.
|
||||
Return ONLY the final answer text.
|
||||
""".strip()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# vLLM call (PRIMARY backend only)
|
||||
# Public API: async, using llm_router
|
||||
# ============================================================
|
||||
|
||||
def _call_primary_llm(prompt: str) -> str:
    """Synchronously POST *prompt* to the primary vLLM completions endpoint
    and return the stripped completion text.

    Raises:
        RuntimeError: when LLM_PRIMARY_URL is not configured.
        requests.HTTPError: on a non-2xx response (via raise_for_status).
        Exception: re-raised after logging when the response body does not
            match the expected vLLM /v1/completions shape.
    """
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    request_body = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }
    response = requests.post(
        PRIMARY_URL,
        headers={"Content-Type": "application/json"},
        json=request_body,
        timeout=120,
    )
    response.raise_for_status()
    body = response.json()

    # vLLM /v1/completions format: choices[0].text holds the completion.
    try:
        completion = body["choices"][0]["text"]
    except Exception as e:
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", body)
        raise

    return completion.strip()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Public API
|
||||
# ============================================================
|
||||
|
||||
async def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """Main refinement entrypoint used by Cortex.

    Builds the refine prompt from the draft plus reflection / identity /
    RAG context and sends it through the configured backend via llm_router.

    Returns:
        {
            "final_output": <str>,     # what should go to persona / user
            "used_backend": <str|None>,# backend attempted (None if skipped)
            "fallback_used": <bool>,   # True when the raw draft is returned
            optionally:
            "debug": {...}             # only when REFINER_DEBUG=true
        }
    """
    # Nothing to refine — return an empty result without calling anything.
    if not draft_output:
        return {
            "final_output": "",
            "used_backend": None,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(
        draft_output,
        reflection_notes,
        identity_block,
        rag_block,
    )

    # Refinement backend → fallback to Cortex backend → fallback to PRIMARY.
    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"

    try:
        refined = await call_llm(
            prompt,
            backend=backend,
            temperature=REFINER_TEMPERATURE,
        )
        result: Dict[str, Any] = {
            "final_output": refined.strip() if refined else draft_output,
            "used_backend": backend,
            "fallback_used": False,
        }
    except Exception as e:
        # Any backend failure degrades gracefully to the unrefined draft.
        # Lazy %-args instead of an f-string so formatting is skipped when
        # the log level filters the record out.
        logger.error("refine.py backend %s failed: %s", backend, e)
        result = {
            "final_output": draft_output,
            "used_backend": backend,
            "fallback_used": True,
        }

    # Restore the documented REFINER_DEBUG payload (the rework dropped it
    # while keeping the flag).  Truncated so a huge prompt can't nuke logs.
    if REFINER_DEBUG:
        result["debug"] = {
            "prompt": prompt[:4000],
        }

    return result
|
||||
|
||||
@@ -1,42 +1,57 @@
|
||||
# reflection.py
|
||||
from llm.llm_router import call_llm
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
from llm.llm_router import call_llm
|
||||
|
||||
|
||||
async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dict:
    """Produce short internal reflection notes for Cortex.

    The notes guide Lyra's reasoning engine and are NOT shown to the user.

    Args:
        intake_summary: Recent conversation summary to reflect on.
        identity_block: Optional identity context (rendered via str()).

    Returns:
        {"notes": [str, ...]} — parsed from the LLM's JSON when possible;
        otherwise the raw LLM text becomes a single note.
    """
    # -----------------------------
    # Build the prompt
    # -----------------------------
    identity_text = ""
    if identity_block:
        identity_text = f"Identity:\n{identity_block}\n\n"

    prompt = (
        f"{identity_text}"
        f"Recent summary:\n{intake_summary}\n\n"
        "You are Lyra's meta-awareness layer. Your job is to produce short, directive "
        "internal notes that guide Lyra’s reasoning engine. These notes are NEVER "
        "shown to the user.\n\n"
        "Rules for output:\n"
        "1. Return ONLY valid JSON.\n"
        "2. JSON must have exactly one key: \"notes\".\n"
        "3. \"notes\" must be a list of 3 to 6 short strings.\n"
        "4. Notes must be actionable (e.g., \"keep it concise\", \"maintain context\").\n"
        "5. No markdown, no apologies, no explanations.\n\n"
        "Return JSON:\n"
        "{ \"notes\": [\"...\"] }\n"
    )

    # -----------------------------
    # Module-specific backend choice: reflection uses its own backend if
    # set, otherwise the cortex backend (default PRIMARY).
    # -----------------------------
    reflection_backend = os.getenv("REFLECTION_LLM")
    cortex_backend = os.getenv("CORTEX_LLM", "PRIMARY").upper()
    backend = (reflection_backend or cortex_backend).upper()

    # -----------------------------
    # Call the selected LLM backend
    # -----------------------------
    raw = await call_llm(prompt, backend=backend)

    print("[Reflection-Raw]:", raw)

    # -----------------------------
    # Try direct JSON first.  Exceptions are narrowed: the previous bare
    # `except:` would also have swallowed KeyboardInterrupt/SystemExit.
    # (json.JSONDecodeError is a subclass of ValueError.)
    # -----------------------------
    try:
        parsed = json.loads(raw.strip())
        if isinstance(parsed, dict) and "notes" in parsed:
            return parsed
    except (ValueError, TypeError):
        pass

    # -----------------------------
    # Try extracting the first {...} blob embedded in surrounding text
    # (non-greedy, DOTALL so it can span lines).
    # -----------------------------
    try:
        match = re.search(r"\{.*?\}", raw, re.S)
        if match:
            parsed = json.loads(match.group(0))
            if isinstance(parsed, dict) and "notes" in parsed:
                return parsed
    except (ValueError, TypeError):
        pass

    # -----------------------------
    # Fallback — treat raw text as a single note
    # -----------------------------
    return {"notes": [raw.strip()]}
|
||||
|
||||
Reference in New Issue
Block a user