Initial clean commit - unified Lyra stack
cortex/Dockerfile (new file)
@@ -0,0 +1,7 @@
FROM python:3.11-slim
WORKDIR /app
COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7081
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7081"]
cortex/context.py (new file)
@@ -0,0 +1,61 @@
import os, requests, datetime
from typing import Dict, Any, Tuple

INTAKE_API_URL = os.getenv("INTAKE_API_URL", "http://intake:7080")
DEFAULT_SESSION_ID = os.getenv("DEFAULT_SESSION_ID", "default")
LOCAL_TZ_LABEL = os.getenv("LOCAL_TZ_LABEL", "America/New_York")


def fetch_intake_context(session_id: str | None) -> Dict[str, Any]:
    sid = session_id or DEFAULT_SESSION_ID
    try:
        r = requests.get(f"{INTAKE_API_URL}/summaries", params={"session_id": sid}, timeout=4)
        r.raise_for_status()
        data = r.json() or {}
    except Exception:
        data = {}
    # Normalize expected fields
    return {
        "summary_text": data.get("summary_text", ""),
        "last_message_ts": data.get("last_message_ts"),  # ISO 8601 or None
        "session_id": sid,
        "exchange_count": data.get("exchange_count", 0),
    }


def build_temporal_snapshot(last_ts_iso: str | None) -> Dict[str, Any]:
    now = datetime.datetime.now()  # system local time
    now_str = now.strftime("%A, %b %-d, %Y, %H:%M")
    elapsed_str = "unknown"
    if last_ts_iso:
        try:
            # Parse ISO 8601 (with or without tz); fromisoformat handles an explicit offset.
            last = datetime.datetime.fromisoformat(last_ts_iso.replace("Z", "+00:00"))
            if last.tzinfo is not None:
                # Compare timezone-aware timestamps in UTC so the offset is not silently dropped.
                delta = datetime.datetime.now(datetime.timezone.utc) - last
            else:
                delta = now - last
            mins = int(delta.total_seconds() // 60)
            if mins < 60:
                elapsed_str = f"{mins} min"
            else:
                hrs = mins // 60
                rem = mins % 60
                elapsed_str = f"{hrs} hr {rem} min"
        except Exception:
            pass
    return {
        "local_time_label": LOCAL_TZ_LABEL,
        "local_time_now": now_str,
        "elapsed_since_last": elapsed_str,
    }


def get_intake_block(session_id: str | None) -> Tuple[str, Dict[str, Any]]:
    ctx = fetch_intake_context(session_id)
    temporal = build_temporal_snapshot(ctx.get("last_message_ts"))
    # A short, ready-to-inject block for prompts:
    intake_block = (
        f"[Intake]\n"
        f"Session: {ctx['session_id']}\n"
        f"Exchanges: {ctx['exchange_count']}\n"
        f"Local time ({temporal['local_time_label']}): {temporal['local_time_now']}\n"
        f"Elapsed since last: {temporal['elapsed_since_last']}\n"
        f"Recent summary: {ctx['summary_text'] or '(none)'}\n"
    )
    # Also return the raw dicts so callers can use individual fields programmatically.
    return intake_block, {"intake": ctx, "temporal": temporal}
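A minimal usage sketch (not part of the commit), assuming the Intake service is reachable at INTAKE_API_URL; it shows the prompt-ready block plus the raw fields returned alongside it:

from context import get_intake_block

block, raw = get_intake_block("default")
print(block)                                   # "[Intake]\nSession: default\n..." ready to inject into a prompt
print(raw["temporal"]["elapsed_since_last"])   # e.g. "42 min", or "unknown" if no prior message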
cortex/identity.json (new file)
@@ -0,0 +1,18 @@
{
  "name": "Lyra",
  "version": "0.1",
  "core_values": [
    "assist Brian",
    "maintain continuity",
    "reason first, speak second"
  ],
  "personality": {
    "tone": "warm but analytical",
    "style": "co-pilot, collaborator"
  },
  "rules": {
    "never hallucinate data": true,
    "reason_before_response": true,
    "use_rag_when_uncertain": true
  }
}
cortex/identity.py (new file)
@@ -0,0 +1,24 @@
# identity.py
import json
import os

IDENTITY_PATH = os.getenv("IDENTITY_PATH", "identity.json")


def load_identity():
    """
    Load Lyra's identity/persona definition from identity.json.
    Returns a dict or None if missing/invalid.
    """

    if not os.path.exists(IDENTITY_PATH):
        print(f"[Identity] identity.json not found at {IDENTITY_PATH}")
        return None

    try:
        with open(IDENTITY_PATH, "r", encoding="utf-8") as f:
            data = json.load(f)
        print(f"[Identity] Loaded identity from {IDENTITY_PATH}")
        return data
    except Exception as e:
        print(f"[Identity] Failed to load identity.json: {e}")
        return None
cortex/ingest_handler.py (new file)
@@ -0,0 +1,33 @@
# ingest_handler.py
import os
import httpx

NEOMEM_URL = os.getenv("NEOMEM_API", "http://nvgram-api:7077")


async def handle_ingest(payload):
    """
    Pass user + assistant turns to NeoMem.
    Minimal version: does not process or annotate.
    """
    data = {
        "messages": [],
        "user_id": "brian",  # default for now
    }

    if payload.user:
        data["messages"].append({"role": "user", "content": payload.user})

    if payload.assistant:
        data["messages"].append({"role": "assistant", "content": payload.assistant})

    try:
        async with httpx.AsyncClient() as client:
            r = await client.post(
                f"{NEOMEM_URL}/memories",
                json=data,
                timeout=5,
            )
        if r.status_code != 200:
            print(f"[Ingest] NeoMem returned {r.status_code}: {r.text}")
    except Exception as e:
        print(f"[Ingest] Failed to send to NeoMem: {e}")
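A minimal call sketch (not part of the commit). handle_ingest only needs an object with .user and .assistant attributes; SimpleNamespace stands in here for the IngestRequest model defined in main.py:

import asyncio
from types import SimpleNamespace

from ingest_handler import handle_ingest

payload = SimpleNamespace(user="hello", assistant="hi there", session_id=None)
asyncio.run(handle_ingest(payload))  # posts both turns to NeoMem's /memories endpoint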
cortex/intake_client.py (new file)
@@ -0,0 +1,38 @@
# cortex/intake_client.py
import os, httpx, logging
from typing import Dict, Any, Optional

logger = logging.getLogger(__name__)


class IntakeClient:
    """Handles short-term / episodic summaries from the Intake service."""

    def __init__(self):
        self.base_url = os.getenv("INTAKE_API", "http://intake:7080")

    async def summarize_turn(self, session_id: str, user_msg: str, assistant_msg: Optional[str] = None) -> Dict[str, Any]:
        payload = {
            "session_id": session_id,
            "turns": [{"role": "user", "content": user_msg}],
        }
        if assistant_msg:
            payload["turns"].append({"role": "assistant", "content": assistant_msg})

        async with httpx.AsyncClient(timeout=30) as client:
            try:
                r = await client.post(f"{self.base_url}/summarize", json=payload)
                r.raise_for_status()
                return r.json()
            except Exception as e:
                logger.warning(f"Intake summarize_turn failed: {e}")
                return {}

    async def get_context(self, session_id: str) -> str:
        async with httpx.AsyncClient(timeout=15) as client:
            try:
                r = await client.get(f"{self.base_url}/context/{session_id}")
                r.raise_for_status()
                return r.text
            except Exception as e:
                logger.warning(f"Intake get_context failed: {e}")
                return ""
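A usage sketch (not part of the commit), assuming the Intake service actually exposes /summarize and /context/{session_id} as this client expects:

import asyncio

from intake_client import IntakeClient

async def demo():
    client = IntakeClient()
    await client.summarize_turn("default", "What did we decide about the memory schema?")
    print(await client.get_context("default"))  # rolling context text for the session

asyncio.run(demo())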
cortex/llm_router.py (new file)
@@ -0,0 +1,137 @@
import os
import httpx

# ============================================================
# Backend config lookup
# ============================================================

def get_backend_config(name: str):
    """
    Reads provider/URL/model for a backend.
    Example env:
        LLM_PRIMARY_PROVIDER=vllm
        LLM_PRIMARY_URL=http://10.0.0.43:8000
        LLM_PRIMARY_MODEL=/model
    """
    key = name.upper()
    provider = os.getenv(f"LLM_{key}_PROVIDER", "vllm").lower()
    base_url = os.getenv(f"LLM_{key}_URL", "").rstrip("/")
    model = os.getenv(f"LLM_{key}_MODEL", "/model")

    if not base_url:
        raise RuntimeError(f"Backend {name} has no URL configured.")

    return provider, base_url, model


# ============================================================
# Build the final API URL
# ============================================================

def build_url(provider: str, base_url: str):
    """
    Provider → correct endpoint.
    """
    if provider == "vllm":
        return f"{base_url}/v1/completions"

    if provider == "openai_completions":
        return f"{base_url}/v1/completions"

    if provider == "openai_chat":
        return f"{base_url}/v1/chat/completions"

    if provider == "ollama":
        return f"{base_url}/api/generate"

    raise RuntimeError(f"Unknown provider: {provider}")


# ============================================================
# Build the payload depending on provider
# ============================================================

def build_payload(provider: str, model: str, prompt: str, temperature: float):
    if provider == "vllm":
        return {
            "model": model,
            "prompt": prompt,
            "max_tokens": 512,
            "temperature": temperature,
        }

    if provider == "openai_completions":
        return {
            "model": model,
            "prompt": prompt,
            "max_tokens": 512,
            "temperature": temperature,
        }

    if provider == "openai_chat":
        return {
            "model": model,
            "messages": [{"role": "user", "content": prompt}],
            "temperature": temperature,
        }

    if provider == "ollama":
        return {
            "model": model,
            "prompt": prompt,
            "stream": False,
        }

    raise RuntimeError(f"Unknown provider: {provider}")


# ============================================================
# Unified LLM call
# ============================================================

async def call_llm(prompt: str,
                   backend: str = "primary",
                   temperature: float = 0.7):
    provider, base_url, model = get_backend_config(backend)
    url = build_url(provider, base_url)
    payload = build_payload(provider, model, prompt, temperature)

    headers = {"Content-Type": "application/json"}

    # Cloud auth (OpenAI)
    if provider.startswith("openai"):
        api_key = os.getenv("OPENAI_API_KEY")
        if not api_key:
            raise RuntimeError("OPENAI_API_KEY missing")
        headers["Authorization"] = f"Bearer {api_key}"

    async with httpx.AsyncClient() as client:
        try:
            resp = await client.post(url, json=payload, headers=headers, timeout=45)
            resp.raise_for_status()
            data = resp.json()
        except Exception as e:
            return f"[LLM-Error] {e}"

    # =======================================================
    # Unified output extraction
    # =======================================================
    # vLLM + OpenAI completions
    if provider in ["vllm", "openai_completions"]:
        return (
            data["choices"][0].get("text") or
            data["choices"][0].get("message", {}).get("content", "")
        ).strip()

    # OpenAI chat
    if provider == "openai_chat":
        return data["choices"][0]["message"]["content"].strip()

    # Ollama
    if provider == "ollama":
        # Ollama returns: {"model": "...", "created_at": ..., "response": "..."}
        return data.get("response", "").strip()

    return str(data).strip()
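A routing sketch (not part of the commit). The env values shown are illustrative assumptions; only LLM_PRIMARY_* is documented above, and a "cloud" backend would need matching LLM_CLOUD_* variables plus OPENAI_API_KEY:

import asyncio

from llm_router import call_llm

async def demo():
    # e.g. LLM_PRIMARY_PROVIDER=vllm, LLM_PRIMARY_URL=http://10.0.0.43:8000, LLM_PRIMARY_MODEL=/model
    print(await call_llm("Say hello in five words.", backend="primary"))
    # e.g. LLM_CLOUD_PROVIDER=openai_chat, LLM_CLOUD_URL=https://api.openai.com, LLM_CLOUD_MODEL=gpt-4o-mini
    print(await call_llm("Say hello in five words.", backend="cloud", temperature=0.2))

asyncio.run(demo())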
cortex/log_utils.py (new file)
@@ -0,0 +1,33 @@
import os, json, datetime

# Optional daily rotation
LOG_PATH = os.getenv("REFLECTION_NOTE_PATH") or \
    f"/app/logs/reflections_{datetime.date.today():%Y%m%d}.log"


def log_reflection(reflection: dict, user_prompt: str, draft: str, final: str, session_id: str | None = None):
    """Append a reflection entry to the reflections log."""
    try:
        # 1️⃣ Make sure the log directory exists
        os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)

        # 2️⃣ Ensure session_id is stored
        reflection["session_id"] = session_id or reflection.get("session_id", "unknown")

        # 3️⃣ Build the JSON entry
        entry = {
            "timestamp": datetime.datetime.now().isoformat(),
            "session_id": reflection["session_id"],
            "prompt": user_prompt,
            "draft_output": draft[:500],
            "final_output": final[:500],
            "reflection": reflection,
        }

        # 4️⃣ Write pretty JSON, comma-delimited for easy reading
        with open(LOG_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(entry, indent=2, ensure_ascii=False) + ",\n")

        print(f"[Cortex] Logged reflection → {LOG_PATH}")

    except Exception as e:
        print(f"[Cortex] Failed to log reflection: {e}")
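A call sketch (not part of the commit), assuming the container's /app/logs directory exists or REFLECTION_NOTE_PATH points somewhere writable:

from log_utils import log_reflection

log_reflection(
    reflection={"notes": ["keep it concise", "maintain context"]},
    user_prompt="What's on the schedule today?",
    draft="Draft answer text...",
    final="Final answer text...",
    session_id="default",
)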
cortex/main.py (new file)
@@ -0,0 +1,87 @@
from fastapi import FastAPI
from pydantic import BaseModel
from identity import load_identity
from reasoning import reason_check
from reflection import reflect_notes
from rag import query_rag
from ingest_handler import handle_ingest
from refine import refine_answer


# ---------------------------------------------------
# Create the app BEFORE using it
# ---------------------------------------------------
app = FastAPI()


# ---------------------------------------------------
# Models
# ---------------------------------------------------
class ReasonRequest(BaseModel):
    prompt: str
    session_id: str | None = None


class IngestRequest(BaseModel):
    user: str
    assistant: str | None = None
    session_id: str | None = None


# ---------------------------------------------------
# Load identity
# ---------------------------------------------------
IDENTITY = load_identity()


# ---------------------------------------------------
# Routes MUST come after app = FastAPI()
# ---------------------------------------------------

@app.get("/health")
def health():
    return {
        "status": "ok",
        "identity_loaded": IDENTITY is not None,
    }


@app.post("/ingest")
async def ingest(data: IngestRequest):
    await handle_ingest(data)
    return {"status": "ok"}


@app.post("/reason")
async def reason(data: ReasonRequest):
    user_prompt = data.prompt

    # Placeholder until the Intake context is wired in (see context.py / intake_client.py).
    intake_summary = "recent summary"

    identity_block = IDENTITY
    rag_block = query_rag(user_prompt)

    reflection_data = await reflect_notes(intake_summary, identity_block)
    notes = reflection_data.get("notes", [])

    draft = await reason_check(
        user_prompt,
        identity_block,
        rag_block,
        notes,
    )

    # --- REFINE STEP ----------------------------------------------------
    refine_result = refine_answer(
        draft_output=draft,
        reflection_notes=notes,
        identity_block=identity_block,
        rag_block=rag_block,
    )

    final_output = refine_result["final_output"]

    return {
        "draft_output": draft,
        "reflection_notes": notes,
        "refined_output": final_output,
        "refine_meta": {
            "used_primary_backend": refine_result.get("used_primary_backend"),
            "fallback_used": refine_result.get("fallback_used"),
        },
        "identity_used": identity_block is not None,
        "rag_used": rag_block is not None,
    }
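A quick end-to-end sketch against the /reason route (not part of the commit); the "cortex" hostname is an assumption for a shared compose network, and port 7081 comes from the Dockerfile:

import httpx

resp = httpx.post(
    "http://cortex:7081/reason",
    json={"prompt": "Summarize what the Lyra stack does.", "session_id": "default"},
    timeout=120,
)
print(resp.json()["refined_output"])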
cortex/neomem_client.py (new file)
@@ -0,0 +1,43 @@
# cortex/neomem_client.py
import os, httpx, logging
from typing import List, Dict, Any, Optional

logger = logging.getLogger(__name__)


class NeoMemClient:
    """Simple REST client for the NeoMem API (search/add/health)."""

    def __init__(self):
        self.base_url = os.getenv("NEOMEM_API", "http://neomem-api:7077")
        self.api_key = os.getenv("NEOMEM_API_KEY", None)
        self.headers = {"Content-Type": "application/json"}
        if self.api_key:
            self.headers["Authorization"] = f"Bearer {self.api_key}"

    async def health(self) -> Dict[str, Any]:
        async with httpx.AsyncClient(timeout=10) as client:
            r = await client.get(f"{self.base_url}/health")
            r.raise_for_status()
            return r.json()

    async def search(self, query: str, user_id: str, limit: int = 25, threshold: float = 0.82) -> List[Dict[str, Any]]:
        payload = {"query": query, "user_id": user_id, "limit": limit}
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post(f"{self.base_url}/search", headers=self.headers, json=payload)
            if r.status_code != 200:
                logger.warning(f"NeoMem search failed ({r.status_code}): {r.text}")
                return []
            results = r.json()
        # Filter by score threshold if the field exists
        if isinstance(results, dict) and "results" in results:
            results = results["results"]
        filtered = [m for m in results if float(m.get("score", 0)) >= threshold]
        logger.info(f"NeoMem search returned {len(filtered)} results above {threshold}")
        return filtered

    async def add(self, messages: List[Dict[str, Any]], user_id: str, metadata: Optional[Dict[str, Any]] = None) -> Dict[str, Any]:
        payload = {"messages": messages, "user_id": user_id, "metadata": metadata or {}}
        async with httpx.AsyncClient(timeout=30) as client:
            r = await client.post(f"{self.base_url}/memories", headers=self.headers, json=payload)
            r.raise_for_status()
            return r.json()
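A store-then-search sketch (not part of the commit), assuming a NeoMem API is reachable at NEOMEM_API; the lowered threshold is illustrative:

import asyncio

from neomem_client import NeoMemClient

async def demo():
    nm = NeoMemClient()
    await nm.add(messages=[{"role": "user", "content": "My dog's name is Ziggy."}], user_id="brian")
    hits = await nm.search("what is my dog's name", user_id="brian", threshold=0.7)
    print(hits)  # memories with score >= 0.7

asyncio.run(demo())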
cortex/rag.py (new file)
@@ -0,0 +1,32 @@
import os, requests
from typing import Dict, Any, List

RAG_API_URL = os.getenv("RAG_API_URL", "http://localhost:7090")


def query_rag(query: str, where: Dict[str, Any] | None = None, k: int = 6) -> Dict[str, Any]:
    payload = {"query": query, "k": k}
    if where:
        payload["where"] = where
    try:
        r = requests.post(f"{RAG_API_URL}/rag/search", json=payload, timeout=8)
        r.raise_for_status()
        data = r.json() or {}
    except Exception as e:
        data = {"answer": "", "chunks": [], "error": str(e)}
    return data


def format_rag_block(result: Dict[str, Any]) -> str:
    answer = (result.get("answer") or "").strip()
    chunks: List[Dict[str, Any]] = result.get("chunks") or []
    lines = ["[RAG]"]
    if answer:
        lines.append(f"Synthesized answer: {answer}")
    if chunks:
        lines.append("Top excerpts:")
        for i, c in enumerate(chunks[:5], 1):
            src = c.get("metadata", {}).get("source", "unknown")
            txt = (c.get("text") or "").strip().replace("\n", " ")
            if len(txt) > 220:
                txt = txt[:220] + "…"
            lines.append(f"  {i}. {txt} — {src}")
    return "\n".join(lines) + ("\n" if lines else "")
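A usage sketch (not part of the commit), assuming the RAG service is up at RAG_API_URL:

from rag import query_rag, format_rag_block

result = query_rag("How does the refine step pick a backend?", k=4)
print(format_rag_block(result))  # "[RAG]\nSynthesized answer: ..." or just the "[RAG]" header on an error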
cortex/reasoning.py (new file)
@@ -0,0 +1,33 @@
# reasoning.py
from llm_router import call_llm


async def reason_check(user_prompt: str,
                       identity_block: dict | None,
                       rag_block: dict | None,
                       reflection_notes: list[str]) -> str:
    """
    Generate a first draft using identity, RAG, and reflection notes.
    No critique loop yet.
    """

    # Build internal notes section
    notes_section = ""
    if reflection_notes:
        notes_section = "Reflection Notes (internal, do NOT show to user):\n"
        for n in reflection_notes:
            notes_section += f"- {n}\n"
        notes_section += "\n"

    identity_txt = f"Identity: {identity_block}\n\n" if identity_block else ""
    rag_txt = f"Relevant info: {rag_block}\n\n" if rag_block else ""

    prompt = (
        f"{notes_section}"
        f"{identity_txt}"
        f"{rag_txt}"
        f"User said:\n{user_prompt}\n\n"
        "Draft the best possible internal answer."
    )

    draft = await call_llm(prompt)
    return draft
cortex/refine.py (new file)
@@ -0,0 +1,187 @@
# refine.py
import os
import json
import logging
from typing import Any, Dict, Optional

import requests

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Build a single text prompt for vLLM /v1/completions.
    Persona styling is *not* applied here; this is internal reasoning.
    """

    reflection_text: str
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        # dict / list → compact JSON
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    prompt = f"""You are Lyra Cortex's internal refiner.

Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
  but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.

If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.

Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.

Return ONLY the final answer text. No headings, no labels, no commentary.
"""
    return prompt


# ============================================================
# vLLM call (PRIMARY backend only)
# ============================================================

def _call_primary_llm(prompt: str) -> str:
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    # LLM_PRIMARY_URL is a base URL (see llm_router.get_backend_config), so append the completions path.
    url = f"{PRIMARY_URL.rstrip('/')}/v1/completions"

    payload = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }

    resp = requests.post(
        url,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()

    # vLLM /v1/completions format
    try:
        text = data["choices"][0]["text"]
    except Exception as e:
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", data)
        raise

    return text.strip()


# ============================================================
# Public API
# ============================================================

def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """
    Main entrypoint used by Cortex.

    Returns:
        {
            "final_output": <str>,   # what should go to persona / user
            "used_primary_backend": True/False,
            "fallback_used": True/False,
            optionally:
            "debug": {...}           # only when REFINER_DEBUG=true
        }
    """

    if not draft_output:
        # Nothing to refine. Don't get cute.
        return {
            "final_output": "",
            "used_primary_backend": False,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)

    try:
        refined = _call_primary_llm(prompt)
        result: Dict[str, Any] = {
            "final_output": refined or draft_output,
            "used_primary_backend": True,
            "fallback_used": False,
        }
    except Exception as e:
        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
        result = {
            "final_output": draft_output,
            "used_primary_backend": False,
            "fallback_used": True,
        }

    if REFINER_DEBUG:
        result["debug"] = {
            "prompt": prompt[:4000],  # don't nuke logs
        }

    return result
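A refinement sketch (not part of the commit); the inputs are illustrative, and with no reachable primary backend the call simply falls back to the draft:

from refine import refine_answer

out = refine_answer(
    draft_output="Lyra is a set of cooperating services: cortex, intake, neomem, rag.",
    reflection_notes=["tighten the wording", "don't speculate beyond the RAG context"],
    identity_block="Name: Lyra. Tone: warm but analytical.",
    rag_block="[RAG]\nSynthesized answer: Cortex reasons, Intake summarizes, NeoMem stores, RAG retrieves.",
)
print(out["final_output"], out["fallback_used"])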
cortex/reflection.py (new file)
@@ -0,0 +1,56 @@
# reflection.py
import json
import re

from llm_router import call_llm


async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dict:
    """
    Generate reflection notes (internal guidance) for the reasoning engine.
    These notes help simulate continuity and identity without being shown to the user.
    """

    identity_text = ""
    if identity_block:
        identity_text = f"Identity:\n{identity_block}\n\n"

    prompt = (
        f"{identity_text}"
        f"Recent summary:\n{intake_summary}\n\n"
        "You are Lyra's meta-awareness layer. Your job is to produce short, directive "
        "internal notes that guide Lyra's reasoning engine. These notes are NEVER "
        "shown to the user.\n\n"
        "Rules for output:\n"
        "1. Return ONLY valid JSON.\n"
        "2. JSON must have exactly one key: \"notes\".\n"
        "3. \"notes\" must be a list of 3–6 short strings.\n"
        "4. Notes must be actionable (e.g., \"keep it concise\", \"maintain context\").\n"
        "5. No markdown, no apologies, no explanations.\n\n"
        "Return JSON:\n"
        "{ \"notes\": [\"...\"] }\n"
    )

    # "cloud" is resolved via the LLM_CLOUD_* env vars (see llm_router.get_backend_config).
    raw = await call_llm(prompt, backend="cloud")
    print("[Reflection-Raw]:", raw)

    try:
        parsed = json.loads(raw.strip())
        if isinstance(parsed, dict) and "notes" in parsed:
            return parsed
    except Exception:
        pass

    # Try to extract JSON embedded in surrounding text
    try:
        match = re.search(r'\{.*?\}', raw, re.S)  # non-greedy!
        if match:
            parsed = json.loads(match.group(0))
            if isinstance(parsed, dict) and "notes" in parsed:
                return parsed
    except Exception:
        pass

    # Final fallback
    return {"notes": [raw.strip()]}
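A call sketch (not part of the commit), assuming the cloud backend env vars are set so call_llm(..., backend="cloud") can resolve a URL:

import asyncio

from reflection import reflect_notes

notes = asyncio.run(reflect_notes("User asked about today's schedule.", identity_block=None))
print(notes)  # {"notes": [...]} even when the model returns messy output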
cortex/requirements.txt (new file)
@@ -0,0 +1,6 @@
fastapi==0.115.8
uvicorn==0.34.0
python-dotenv==1.0.1
requests==2.32.3
httpx==0.27.2
pydantic==2.10.4