Initial clean commit - unified Lyra stack
cortex/Dockerfile (new file)
@@ -0,0 +1,7 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7081
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7081"]
cortex/context.py (new file)
@@ -0,0 +1,61 @@
import os, requests, datetime
from typing import Dict, Any, Tuple

INTAKE_API_URL = os.getenv("INTAKE_API_URL", "http://intake:7080")
DEFAULT_SESSION_ID = os.getenv("DEFAULT_SESSION_ID", "default")
LOCAL_TZ_LABEL = os.getenv("LOCAL_TZ_LABEL", "America/New_York")


def fetch_intake_context(session_id: str | None) -> Dict[str, Any]:
    sid = session_id or DEFAULT_SESSION_ID
    try:
        r = requests.get(f"{INTAKE_API_URL}/summaries", params={"session_id": sid}, timeout=4)
        r.raise_for_status()
        data = r.json() or {}
    except Exception:
        data = {}
    # Normalize expected fields
    return {
        "summary_text": data.get("summary_text", ""),
        "last_message_ts": data.get("last_message_ts"),  # ISO 8601 or None
        "session_id": sid,
        "exchange_count": data.get("exchange_count", 0),
    }


def build_temporal_snapshot(last_ts_iso: str | None) -> Dict[str, Any]:
    now = datetime.datetime.now()  # system local time
    now_str = now.strftime("%A, %b %-d, %Y, %H:%M")
    elapsed_str = "unknown"
    if last_ts_iso:
        try:
            # Parse ISO 8601 (with or without tz); fromisoformat handles an explicit offset.
            last = datetime.datetime.fromisoformat(last_ts_iso.replace("Z", "+00:00"))
            if last.tzinfo is not None:
                # Compare timezone-aware timestamps in UTC so the offset is not silently dropped.
                delta = datetime.datetime.now(datetime.timezone.utc) - last
            else:
                delta = now - last
            mins = int(delta.total_seconds() // 60)
            if mins < 60:
                elapsed_str = f"{mins} min"
            else:
                hrs = mins // 60
                rem = mins % 60
                elapsed_str = f"{hrs} hr {rem} min"
        except Exception:
            pass
    return {
        "local_time_label": LOCAL_TZ_LABEL,
        "local_time_now": now_str,
        "elapsed_since_last": elapsed_str,
    }


def get_intake_block(session_id: str | None) -> Tuple[str, Dict[str, Any]]:
    ctx = fetch_intake_context(session_id)
    temporal = build_temporal_snapshot(ctx.get("last_message_ts"))
    # A short, ready-to-inject block for prompts:
    intake_block = (
        f"[Intake]\n"
        f"Session: {ctx['session_id']}\n"
        f"Exchanges: {ctx['exchange_count']}\n"
        f"Local time ({temporal['local_time_label']}): {temporal['local_time_now']}\n"
        f"Elapsed since last: {temporal['elapsed_since_last']}\n"
        f"Recent summary: {ctx['summary_text'] or '(none)'}\n"
    )
    # Also return the raw dicts so callers can use individual fields programmatically.
    return intake_block, {"intake": ctx, "temporal": temporal}
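A minimal usage sketch (not part of the commit), assuming the Intake service is reachable at INTAKE_API_URL; it shows the prompt-ready block plus the raw fields returned alongside it:

from context import get_intake_block

block, raw = get_intake_block("default")
print(block)                                   # "[Intake]\nSession: default\n..." ready to inject into a prompt
print(raw["temporal"]["elapsed_since_last"])   # e.g. "42 min", or "unknown" if no prior message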
cortex/identity.json (new file)
@@ -0,0 +1,18 @@
{
  "name": "Lyra",
  "version": "0.1",
  "core_values": [
    "assist Brian",
    "maintain continuity",
    "reason first, speak second"
  ],
  "personality": {
    "tone": "warm but analytical",
    "style": "co-pilot, collaborator"
  },
  "rules": {
    "never hallucinate data": true,
    "reason_before_response": true,
    "use_rag_when_uncertain": true
  }
}
cortex/identity.py (new file)
@@ -0,0 +1,24 @@
# identity.py
import json
import os

IDENTITY_PATH = os.getenv("IDENTITY_PATH", "identity.json")


def load_identity():
    """
    Load Lyra's identity/persona definition from identity.json.
    Returns a dict or None if missing/invalid.
    """

    if not os.path.exists(IDENTITY_PATH):
        print(f"[Identity] identity.json not found at {IDENTITY_PATH}")
        return None

    try:
        with open(IDENTITY_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f"[Identity] Loaded identity from {IDENTITY_PATH}")
        return data
    except Exception as e:
        print(f"[Identity] Failed to load identity.json: {e}")
        return None
cortex/ingest_handler.py (new file)
@@ -0,0 +1,33 @@
# ingest_handler.py
import os
import httpx

NEOMEM_URL = os.getenv("NEOMEM_API", "http://nvgram-api:7077")


async def handle_ingest(payload):
    """
    Pass user + assistant turns to NeoMem.
    Minimal version: does not process or annotate.
    """
    data = {
        "messages": [],
        "user_id": "brian",  # default for now
    }

    if payload.user:
        data["messages"].append({"role": "user", "content": payload.user})

    if payload.assistant:
        data["messages"].append({"role": "assistant", "content": payload.assistant})

    try:
        async with httpx.AsyncClient() as client:
            r = await client.post(
                f"{NEOMEM_URL}/memories",
                json=data,
                timeout=5,
            )
        if r.status_code != 200:
            print(f"[Ingest] NeoMem returned {r.status_code}: {r.text}")
    except Exception as e:
        print(f"[Ingest] Failed to send to NeoMem: {e}")
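A minimal call sketch (not part of the commit). handle_ingest only needs an object with .user and .assistant attributes; SimpleNamespace stands in here for the IngestRequest model defined in main.py:

import asyncio
from types import SimpleNamespace

from ingest_handler import handle_ingest

payload = SimpleNamespace(user="hello", assistant="hi there", session_id=None)
asyncio.run(handle_ingest(payload))  # posts both turns to NeoMem's /memories endpoint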
cortex/intake_client.py (new file)
@@ -0,0 +1,38 @@
# cortex/intake_client.py
import os, httpx, logging
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)


class IntakeClient:
    """Handles short-term / episodic summaries from the Intake service."""

    def __init__(self):
        self.base_url = os.getenv("INTAKE_API", "http://intake:7080")

    async def summarize_turn(self, session_id: str, user_msg: str, assistant_msg: Optional[str] = None) -> Dict[str, Any]:
        payload = {
            "session_id": session_id,
            "turns": [{"role": "user", "content": user_msg}],
        }
        if assistant_msg:
            payload["turns"].append({"role": "assistant", "content": assistant_msg})

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                r = await client.post(f"{self.base_url}/summarize", json=payload)
                r.raise_for_status()
                return r.json()
            except Exception as e:
                logger.warning(f"Intake summarize_turn failed: {e}")
                return {}

    async def get_context(self, session_id: str) -> str:
        async with httpx.AsyncClient(timeout=15) as client:
            try:
                r = await client.get(f"{self.base_url}/context/{session_id}")
                r.raise_for_status()
                return r.text
            except Exception as e:
                logger.warning(f"Intake get_context failed: {e}")
                return ""
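A usage sketch (not part of the commit), assuming the Intake service actually exposes /summarize and /context/{session_id} as this client expects:

import asyncio

from intake_client import IntakeClient

async def demo():
    client = IntakeClient()
    await client.summarize_turn("default", "What did we decide about the memory schema?")
    print(await client.get_context("default"))  # rolling context text for the session

asyncio.run(demo())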
cortex/llm_router.py (new file)
@@ -0,0 +1,137 @@
import os
import httpx

# ============================================================
# Backend config lookup
# ============================================================

def get_backend_config(name: str):
    """
    Reads provider/URL/model for a backend.
    Example env:
        LLM_PRIMARY_PROVIDER=vllm
        LLM_PRIMARY_URL=http://10.0.0.43:8000
        LLM_PRIMARY_MODEL=/model
    """
    key = name.upper()
    provider = os.getenv(f"LLM_{key}_PROVIDER", "vllm").lower()
    base_url = os.getenv(f"LLM_{key}_URL", "").rstrip("/")
    model = os.getenv(f"LLM_{key}_MODEL", "/model")

    if not base_url:
        raise RuntimeError(f"Backend {name} has no URL configured.")

    return provider, base_url, model


# ============================================================
# Build the final API URL
# ============================================================

def build_url(provider: str, base_url: str):
    """
    Provider → correct endpoint.
    """
    if provider == "vllm":
        return f"{base_url}/v1/completions"

    if provider == "openai_completions":
        return f"{base_url}/v1/completions"

    if provider == "openai_chat":
        return f"{base_url}/v1/chat/completions"

    if provider == "ollama":
        return f"{base_url}/api/generate"

    raise RuntimeError(f"Unknown provider: {provider}")


# ============================================================
# Build the payload depending on provider
# ============================================================

def build_payload(provider: str, model: str, prompt: str, temperature: float):
    if provider == "vllm":
        return {
            "model": model,
            "prompt": prompt,
            "max_tokens": 512,
            "temperature": temperature,
        }

    if provider == "openai_completions":
        return {
            "model": model,
            "prompt": prompt,
            "max_tokens": 512,
            "temperature": temperature,
        }

    if provider == "openai_chat":
        return {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
        }

    if provider == "ollama":
        return {
            "model": model,
            "prompt": prompt,
            "stream": False,
        }

    raise RuntimeError(f"Unknown provider: {provider}")


# ============================================================
# Unified LLM call
# ============================================================

async def call_llm(prompt: str,
                   backend: str = "primary",
                   temperature: float = 0.7):
    provider, base_url, model = get_backend_config(backend)
    url = build_url(provider, base_url)
    payload = build_payload(provider, model, prompt, temperature)

    headers = {"Content-Type": "application/json"}

    # Cloud auth (OpenAI)
    if provider.startswith("openai"):
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY missing")
        headers["Authorization"] = f"Bearer {api_key}"

    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(url, json=payload, headers=headers, timeout=45)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            return f"[LLM-Error] {e}"

    # =======================================================
    # Unified output extraction
    # =======================================================
    # vLLM + OpenAI completions
    if provider in ["vllm", "openai_completions"]:
        return (
            data["choices"][0].get("text") or
            data["choices"][0].get("message", {}).get("content", "")
        ).strip()

    # OpenAI chat
    if provider == "openai_chat":
        return data["choices"][0]["message"]["content"].strip()

    # Ollama
    if provider == "ollama":
        # Ollama returns: {"model": "...", "created_at": ..., "response": "..."}
        return data.get("response", "").strip()

    return str(data).strip()
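A routing sketch (not part of the commit). The env values shown are illustrative assumptions; only LLM_PRIMARY_* is documented above, and a "cloud" backend would need matching LLM_CLOUD_* variables plus OPENAI_API_KEY:

import asyncio

from llm_router import call_llm

async def demo():
    # e.g. LLM_PRIMARY_PROVIDER=vllm, LLM_PRIMARY_URL=http://10.0.0.43:8000, LLM_PRIMARY_MODEL=/model
    print(await call_llm("Say hello in five words.", backend="primary"))
    # e.g. LLM_CLOUD_PROVIDER=openai_chat, LLM_CLOUD_URL=https://api.openai.com, LLM_CLOUD_MODEL=gpt-4o-mini
    print(await call_llm("Say hello in five words.", backend="cloud", temperature=0.2))

asyncio.run(demo())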
cortex/log_utils.py (new file)
@@ -0,0 +1,33 @@
import os, json, datetime

# Optional daily rotation
LOG_PATH = os.getenv("REFLECTION_NOTE_PATH") or \
    f"/app/logs/reflections_{datetime.date.today():%Y%m%d}.log"


def log_reflection(reflection: dict, user_prompt: str, draft: str, final: str, session_id: str | None = None):
    """Append a reflection entry to the reflections log."""
    try:
        # 1️⃣ Make sure the log directory exists
        os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

        # 2️⃣ Ensure session_id is stored
        reflection["session_id"] = session_id or reflection.get("session_id", "unknown")

        # 3️⃣ Build the JSON entry
        entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "session_id": reflection["session_id"],
            "prompt": user_prompt,
            "draft_output": draft[:500],
            "final_output": final[:500],
            "reflection": reflection,
        }

        # 4️⃣ Write pretty JSON, comma-delimited for easy reading
        with open(LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, indent=2, ensure_ascii=False) + ",\n")

        print(f"[Cortex] Logged reflection → {LOG_PATH}")

    except Exception as e:
        print(f"[Cortex] Failed to log reflection: {e}")
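A call sketch (not part of the commit), assuming the container's /app/logs directory exists or REFLECTION_NOTE_PATH points somewhere writable:

from log_utils import log_reflection

log_reflection(
    reflection={"notes": ["keep it concise", "maintain context"]},
    user_prompt="What's on the schedule today?",
    draft="Draft answer text...",
    final="Final answer text...",
    session_id="default",
)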
cortex/main.py (new file)
@@ -0,0 +1,87 @@
from fastapi import FastAPI
from pydantic import BaseModel
from identity import load_identity
from reasoning import reason_check
from reflection import reflect_notes
from rag import query_rag
from ingest_handler import handle_ingest
from refine import refine_answer


# ---------------------------------------------------
# Create the app BEFORE using it
# ---------------------------------------------------
app = FastAPI()


# ---------------------------------------------------
# Models
# ---------------------------------------------------
class ReasonRequest(BaseModel):
    prompt: str
    session_id: str | None = None


class IngestRequest(BaseModel):
    user: str
    assistant: str | None = None
    session_id: str | None = None


# ---------------------------------------------------
# Load identity
# ---------------------------------------------------
IDENTITY = load_identity()


# ---------------------------------------------------
# Routes MUST come after app = FastAPI()
# ---------------------------------------------------

@app.get("/health")
def health():
    return {
        "status": "ok",
        "identity_loaded": IDENTITY is not None,
    }


@app.post("/ingest")
async def ingest(data: IngestRequest):
    await handle_ingest(data)
    return {"status": "ok"}


@app.post("/reason")
async def reason(data: ReasonRequest):
    user_prompt = data.prompt

    # Placeholder until the Intake context is wired in (see context.py / intake_client.py).
    intake_summary = "recent summary"

    identity_block = IDENTITY
    rag_block = query_rag(user_prompt)

    reflection_data = await reflect_notes(intake_summary, identity_block)
    notes = reflection_data.get("notes", [])

    draft = await reason_check(
        user_prompt,
        identity_block,
        rag_block,
        notes,
    )

    # --- REFINE STEP ----------------------------------------------------
    refine_result = refine_answer(
        draft_output=draft,
        reflection_notes=notes,
        identity_block=identity_block,
        rag_block=rag_block,
    )

    final_output = refine_result["final_output"]

    return {
        "draft_output": draft,
        "reflection_notes": notes,
        "refined_output": final_output,
        "refine_meta": {
            "used_primary_backend": refine_result.get("used_primary_backend"),
            "fallback_used": refine_result.get("fallback_used"),
        },
        "identity_used": identity_block is not None,
        "rag_used": rag_block is not None,
    }
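A quick end-to-end sketch against the /reason route (not part of the commit); the "cortex" hostname is an assumption for a shared compose network, and port 7081 comes from the Dockerfile:

import httpx

resp = httpx.post(
    "http://cortex:7081/reason",
    json={"prompt": "Summarize what the Lyra stack does.", "session_id": "default"},
    timeout=120,
)
print(resp.json()["refined_output"])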
cortex/neomem_client.py (new file)
@@ -0,0 +1,43 @@
# cortex/neomem_client.py
import os, httpx, logging
from typing import List, Dict, Any, Optional

logger = logging.getLogger(__name__)


class NeoMemClient:
    """Simple REST client for the NeoMem API (search/add/health)."""

    def __init__(self):
        self.base_url = os.getenv("NEOMEM_API", "http://neomem-api:7077")
        self.api_key = os.getenv("NEOMEM_API_KEY", None)
        self.headers = {"Content-Type": "application/json"}
        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"

    async def health(self) -> Dict[str, Any]:
        async with httpx.AsyncClient(timeout=10) as client:
            r = await client.get(f"{self.base_url}/health")
            r.raise_for_status()
            return r.json()

    async def search(self, query: str, user_id: str, limit: int = 25, threshold: float = 0.82) -> List[Dict[str, Any]]:
        payload = {"query": query, "user_id": user_id, "limit": limit}
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post(f"{self.base_url}/search", headers=self.headers, json=payload)
            if r.status_code != 200:
                logger.warning(f"NeoMem search failed ({r.status_code}): {r.text}")
                return []
            results = r.json()
        # Filter by score threshold if the field exists
        if isinstance(results, dict) and "results" in results:
            results = results["results"]
        filtered = [m for m in results if float(m.get("score", 0)) >= threshold]
        logger.info(f"NeoMem search returned {len(filtered)} results above {threshold}")
        return filtered

    async def add(self, messages: List[Dict[str, Any]], user_id: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        payload = {"messages": messages, "user_id": user_id, "metadata": metadata or {}}
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post(f"{self.base_url}/memories", headers=self.headers, json=payload)
            r.raise_for_status()
            return r.json()
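A store-then-search sketch (not part of the commit), assuming a NeoMem API is reachable at NEOMEM_API; the lowered threshold is illustrative:

import asyncio

from neomem_client import NeoMemClient

async def demo():
    nm = NeoMemClient()
    await nm.add(messages=[{"role": "user", "content": "My dog's name is Ziggy."}], user_id="brian")
    hits = await nm.search("what is my dog's name", user_id="brian", threshold=0.7)
    print(hits)  # memories with score >= 0.7

asyncio.run(demo())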
cortex/rag.py (new file)
@@ -0,0 +1,32 @@
import os, requests
from typing import Dict, Any, List

RAG_API_URL = os.getenv("RAG_API_URL", "http://localhost:7090")


def query_rag(query: str, where: Dict[str, Any] | None = None, k: int = 6) -> Dict[str, Any]:
    payload = {"query": query, "k": k}
    if where:
        payload["where"] = where
    try:
        r = requests.post(f"{RAG_API_URL}/rag/search", json=payload, timeout=8)
        r.raise_for_status()
        data = r.json() or {}
    except Exception as e:
        data = {"answer": "", "chunks": [], "error": str(e)}
    return data


def format_rag_block(result: Dict[str, Any]) -> str:
    answer = (result.get("answer") or "").strip()
    chunks: List[Dict[str, Any]] = result.get("chunks") or []
    lines = ["[RAG]"]
    if answer:
        lines.append(f"Synthesized answer: {answer}")
    if chunks:
        lines.append("Top excerpts:")
        for i, c in enumerate(chunks[:5], 1):
            src = c.get("metadata", {}).get("source", "unknown")
            txt = (c.get("text") or "").strip().replace("\n", " ")
            if len(txt) > 220:
                txt = txt[:220] + "…"
            lines.append(f"  {i}. {txt} — {src}")
    return "\n".join(lines) + ("\n" if lines else "")
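A usage sketch (not part of the commit), assuming the RAG service is up at RAG_API_URL:

from rag import query_rag, format_rag_block

result = query_rag("How does the refine step pick a backend?", k=4)
print(format_rag_block(result))  # "[RAG]\nSynthesized answer: ..." or just the "[RAG]" header on an error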
cortex/reasoning.py (new file)
@@ -0,0 +1,33 @@
# reasoning.py
from llm_router import call_llm


async def reason_check(user_prompt: str,
                       identity_block: dict | None,
                       rag_block: dict | None,
                       reflection_notes: list[str]) -> str:
    """
    Generate a first draft using identity, RAG, and reflection notes.
    No critique loop yet.
    """

    # Build internal notes section
    notes_section = ""
    if reflection_notes:
        notes_section = "Reflection Notes (internal, do NOT show to user):\n"
        for n in reflection_notes:
            notes_section += f"- {n}\n"
        notes_section += "\n"

    identity_txt = f"Identity: {identity_block}\n\n" if identity_block else ""
    rag_txt = f"Relevant info: {rag_block}\n\n" if rag_block else ""

    prompt = (
        f"{notes_section}"
        f"{identity_txt}"
        f"{rag_txt}"
        f"User said:\n{user_prompt}\n\n"
        "Draft the best possible internal answer."
    )

    draft = await call_llm(prompt)
    return draft
cortex/refine.py (new file)
@@ -0,0 +1,187 @@
# refine.py
import os
import json
import logging
from typing import Any, Dict, Optional

import requests

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Build a single text prompt for vLLM /v1/completions.
    Persona styling is *not* applied here; this is internal reasoning.
    """

    reflection_text: str
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        # dict / list → compact JSON
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    prompt = f"""You are Lyra Cortex's internal refiner.

Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
  but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.

If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.

Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.

Return ONLY the final answer text. No headings, no labels, no commentary.
"""
    return prompt


# ============================================================
# vLLM call (PRIMARY backend only)
# ============================================================

def _call_primary_llm(prompt: str) -> str:
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    # LLM_PRIMARY_URL is a base URL (see llm_router.get_backend_config), so append the completions path.
    url = f"{PRIMARY_URL.rstrip('/')}/v1/completions"

    payload = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }

    resp = requests.post(
        url,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()

    # vLLM /v1/completions format
    try:
        text = data["choices"][0]["text"]
    except Exception as e:
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", data)
        raise

    return text.strip()


# ============================================================
# Public API
# ============================================================

def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """
    Main entrypoint used by Cortex.

    Returns:
        {
            "final_output": <str>,   # what should go to persona / user
            "used_primary_backend": True/False,
            "fallback_used": True/False,
            optionally:
            "debug": {...}           # only when REFINER_DEBUG=true
        }
    """

    if not draft_output:
        # Nothing to refine. Don't get cute.
        return {
            "final_output": "",
            "used_primary_backend": False,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)

    try:
        refined = _call_primary_llm(prompt)
        result: Dict[str, Any] = {
            "final_output": refined or draft_output,
            "used_primary_backend": True,
            "fallback_used": False,
        }
    except Exception as e:
        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
        result = {
            "final_output": draft_output,
            "used_primary_backend": False,
            "fallback_used": True,
        }

    if REFINER_DEBUG:
        result["debug"] = {
            "prompt": prompt[:4000],  # don't nuke logs
        }

    return result
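A refinement sketch (not part of the commit); the inputs are illustrative, and with no reachable primary backend the call simply falls back to the draft:

from refine import refine_answer

out = refine_answer(
    draft_output="Lyra is a set of cooperating services: cortex, intake, neomem, rag.",
    reflection_notes=["tighten the wording", "don't speculate beyond the RAG context"],
    identity_block="Name: Lyra. Tone: warm but analytical.",
    rag_block="[RAG]\nSynthesized answer: Cortex reasons, Intake summarizes, NeoMem stores, RAG retrieves.",
)
print(out["final_output"], out["fallback_used"])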
cortex/reflection.py (new file)
@@ -0,0 +1,56 @@
# reflection.py
import json
import re

from llm_router import call_llm


async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dict:
    """
    Generate reflection notes (internal guidance) for the reasoning engine.
    These notes help simulate continuity and identity without being shown to the user.
    """

    identity_text = ""
    if identity_block:
        identity_text = f"Identity:\n{identity_block}\n\n"

    prompt = (
        f"{identity_text}"
        f"Recent summary:\n{intake_summary}\n\n"
        "You are Lyra's meta-awareness layer. Your job is to produce short, directive "
        "internal notes that guide Lyra's reasoning engine. These notes are NEVER "
        "shown to the user.\n\n"
        "Rules for output:\n"
        "1. Return ONLY valid JSON.\n"
        "2. JSON must have exactly one key: \"notes\".\n"
        "3. \"notes\" must be a list of 3–6 short strings.\n"
        "4. Notes must be actionable (e.g., \"keep it concise\", \"maintain context\").\n"
        "5. No markdown, no apologies, no explanations.\n\n"
        "Return JSON:\n"
        "{ \"notes\": [\"...\"] }\n"
    )

    # "cloud" is resolved via the LLM_CLOUD_* env vars (see llm_router.get_backend_config).
    raw = await call_llm(prompt, backend="cloud")
    print("[Reflection-Raw]:", raw)

    try:
        parsed = json.loads(raw.strip())
        if isinstance(parsed, dict) and "notes" in parsed:
            return parsed
    except Exception:
        pass

    # Try to extract JSON embedded in surrounding text
    try:
        match = re.search(r'\{.*?\}', raw, re.S)  # non-greedy!
        if match:
            parsed = json.loads(match.group(0))
            if isinstance(parsed, dict) and "notes" in parsed:
                return parsed
    except Exception:
        pass

    # Final fallback
    return {"notes": [raw.strip()]}
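A call sketch (not part of the commit), assuming the cloud backend env vars are set so call_llm(..., backend="cloud") can resolve a URL:

import asyncio

from reflection import reflect_notes

notes = asyncio.run(reflect_notes("User asked about today's schedule.", identity_block=None))
print(notes)  # {"notes": [...]} even when the model returns messy output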
cortex/requirements.txt (new file)
@@ -0,0 +1,6 @@
fastapi==0.115.8
uvicorn==0.34.0
python-dotenv==1.0.1
requests==2.32.3
httpx==0.27.2
pydantic==2.10.4