Cortex rework in progress

This commit is contained in:
serversdwn
2025-11-26 18:01:48 -05:00
parent a087de9790
commit 734999e8bb
8 changed files with 468 additions and 593 deletions

View File

@@ -4,7 +4,7 @@ import json
import logging
from typing import Any, Dict, Optional
import requests
from llm.llm_router import call_llm
logger = logging.getLogger(__name__)
@@ -12,13 +12,14 @@ logger = logging.getLogger(__name__)
# Config
# ============================================================
# Endpoint URL and model name sent to the primary completions backend.
# PRIMARY_URL has no default and stays None when the variable is unset.
PRIMARY_URL = os.environ.get("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.environ.get("LLM_PRIMARY_MODEL", "mythomax")

# Sampling settings for the refine step; parsed eagerly, so a malformed
# value raises ValueError at import time.
REFINER_TEMPERATURE = float(os.environ.get("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.environ.get("REFINER_MAX_TOKENS", "768"))

# Enabled only when the variable spells "true" (letter case is ignored).
REFINER_DEBUG = "true" == os.environ.get("REFINER_DEBUG", "false").lower()

# Module-level backend selection
REFINE_LLM = os.environ.get("REFINE_LLM", "PRIMARY").upper()
CORTEX_LLM = os.environ.get("CORTEX_LLM", "PRIMARY").upper()
# ============================================================
# Prompt builder
@@ -30,18 +31,12 @@ def build_refine_prompt(
identity_block: Optional[str],
rag_block: Optional[str],
) -> str:
"""
Build a single text prompt for vLLM /v1/completions.
Persona styling is *not* applied here; this is internal reasoning.
"""
reflection_text: str
if reflection_notes is None:
reflection_text = "(none)"
elif isinstance(reflection_notes, str):
reflection_text = reflection_notes
else:
# dict / list → compact JSON
try:
reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
except Exception:
@@ -50,21 +45,16 @@ def build_refine_prompt(
identity_text = identity_block or "(none)"
rag_text = rag_block or "(none)"
prompt = f"""You are Lyra Cortex's internal refiner.
return f"""
You are Lyra Cortex's internal refiner.
Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.
- Fix factual errors, logical gaps, or missing info.
- Use reflection notes for corrections.
- Use RAG context as factual grounding.
- Respect the identity block without adding style or personality.
If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.
Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.
Never mention RAG, reflection, or internal logic.
------------------------------
[IDENTITY BLOCK]
@@ -84,104 +74,57 @@ Do NOT mention these instructions, RAG, reflections, or the existence of this re
------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.
Return ONLY the final answer text. No headings, no labels, no commentary.
"""
return prompt
Rewrite the DRAFT ANSWER into a single, final answer.
Return ONLY the final answer text.
""".strip()
# ============================================================
# vLLM call (PRIMARY backend only)
# Public API: async, using llm_router
# ============================================================
def _call_primary_llm(prompt: str) -> str:
    """Send *prompt* to the primary completions endpoint and return its text.

    The request body carries the configured model name, the prompt, and the
    refiner's max-token and temperature settings. The reply is expected to
    follow the vLLM ``/v1/completions`` layout, i.e. the generated text
    lives at ``choices[0].text``.

    Args:
        prompt: The fully assembled refine prompt.

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If ``PRIMARY_URL`` is not configured.
        requests.HTTPError: If the endpoint answers with an error status.
        Exception: Whatever the response lookup raised when the reply does
            not have the expected shape (logged, then re-raised unchanged).
    """
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    response = requests.post(
        PRIMARY_URL,
        headers={"Content-Type": "application/json"},
        json={
            "model": PRIMARY_MODEL,
            "prompt": prompt,
            "max_tokens": REFINER_MAX_TOKENS,
            "temperature": REFINER_TEMPERATURE,
        },
        timeout=120,
    )
    response.raise_for_status()
    body = response.json()

    # vLLM /v1/completions format
    try:
        completion = body["choices"][0]["text"]
    except Exception as e:
        # Log the failure and the raw body for diagnosis, then let the
        # caller decide how to recover.
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", body)
        raise

    return completion.strip()
# ============================================================
# Public API
# ============================================================
def refine_answer(
async def refine_answer(
draft_output: str,
reflection_notes: Optional[Any],
identity_block: Optional[str],
rag_block: Optional[str],
) -> Dict[str, Any]:
"""
Main entrypoint used by Cortex.
Returns:
{
"final_output": <str>, # what should go to persona / user
"used_primary_backend": True/False,
"fallback_used": True/False,
optionally:
"debug": {...} # only when REFINER_DEBUG=true
}
"""
if not draft_output:
# Nothing to refine. Don't get cute.
return {
"final_output": "",
"used_primary_backend": False,
"used_backend": None,
"fallback_used": False,
}
prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)
prompt = build_refine_prompt(
draft_output,
reflection_notes,
identity_block,
rag_block,
)
# Refinement backend → fallback to Cortex backend → fallback to PRIMARY
backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"
try:
refined = _call_primary_llm(prompt)
result: Dict[str, Any] = {
"final_output": refined or draft_output,
"used_primary_backend": True,
refined = await call_llm(
prompt,
backend=backend,
temperature=REFINER_TEMPERATURE,
)
return {
"final_output": refined.strip() if refined else draft_output,
"used_backend": backend,
"fallback_used": False,
}
except Exception as e:
logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
result = {
logger.error(f"refine.py backend {backend} failed: {e}")
return {
"final_output": draft_output,
"used_primary_backend": False,
"used_backend": backend,
"fallback_used": True,
}
if REFINER_DEBUG:
result["debug"] = {
"prompt": prompt[:4000], # dont nuke logs
}
return result