Files
project-lyra/cortex/reasoning/refine.py
2025-11-25 20:50:05 -05:00

188 lines
5.4 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# refine.py
import os
import json
import logging
from typing import Any, Dict, Optional
import requests
logger = logging.getLogger(__name__)
# ============================================================
# Config
# ============================================================
# Endpoint of the primary LLM backend; None when LLM_PRIMARY_URL is unset.
PRIMARY_URL = os.environ.get("LLM_PRIMARY_URL")
# Model name placed in the request payload.
PRIMARY_MODEL = os.environ.get("LLM_PRIMARY_MODEL", "mythomax")
# Sampling settings for the refine call. They are parsed eagerly, so a
# malformed value raises ValueError when this module is imported.
REFINER_TEMPERATURE = float(os.environ.get("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.environ.get("REFINER_MAX_TOKENS", "768"))
# Only the literal value "true" (any letter case) switches debug output on.
REFINER_DEBUG = os.environ.get("REFINER_DEBUG", "false").lower() == "true"
# ============================================================
# Prompt builder
# ============================================================
def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Assemble the plain-text refiner prompt (intended for vLLM /v1/completions).

    The prompt embeds four sections: identity block, RAG context, the draft
    answer, and the reflection notes. No persona styling is applied here.

    Args:
        draft_output: The draft answer to be rewritten; inserted verbatim.
        reflection_notes: None, a string (used as-is), or any other object,
            which is JSON-encoded when possible and otherwise passed
            through str().
        identity_block: Identity/constraints text; falsy values render
            as "(none)".
        rag_block: Retrieved context text; falsy values render as "(none)".

    Returns:
        The complete prompt string.
    """
    placeholder = "(none)"

    if isinstance(reflection_notes, str):
        # Strings are used untouched, including the empty string.
        notes = reflection_notes
    elif reflection_notes is None:
        notes = placeholder
    else:
        # Structured notes (dict / list / ...) are JSON-encoded, keeping
        # non-ASCII characters readable; anything json cannot handle falls
        # back to its str() form.
        try:
            notes = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            notes = str(reflection_notes)

    identity = identity_block if identity_block else placeholder
    grounding = rag_block if rag_block else placeholder

    return f"""You are Lyra Cortex's internal refiner.
Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.
If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.
Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.
------------------------------
[IDENTITY BLOCK]
{identity}
------------------------------
[RAG CONTEXT]
{grounding}
------------------------------
[DRAFT ANSWER]
{draft_output}
------------------------------
[REFLECTION NOTES]
{notes}
------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.
Return ONLY the final answer text. No headings, no labels, no commentary.
"""
# ============================================================
# vLLM call (PRIMARY backend only)
# ============================================================
def _call_primary_llm(prompt: str) -> str:
    """
    Send `prompt` to the primary LLM backend and return the completion text.

    POSTs a JSON payload (model, prompt, max_tokens, temperature) to
    PRIMARY_URL and reads the result from `choices[0]["text"]`, i.e. the
    vLLM /v1/completions response shape.

    Args:
        prompt: The full text prompt to complete.

    Returns:
        The generated text with surrounding whitespace stripped.

    Raises:
        RuntimeError: If PRIMARY_URL (env LLM_PRIMARY_URL) is not set.
        requests.RequestException: On transport failure, timeout, or a
            non-success HTTP status (via raise_for_status).
        ValueError: If the response body cannot be decoded as JSON.
        KeyError, IndexError, TypeError: If the decoded response does not
            have the expected shape, or its text field is not a string.
    """
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    payload = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }
    resp = requests.post(
        PRIMARY_URL,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()

    # vLLM /v1/completions format
    try:
        text = data["choices"][0]["text"]
        # Validate inside the guarded section so a null / non-string text is
        # reported together with the raw response, instead of surfacing
        # later as a bare AttributeError from .strip().
        if not isinstance(text, str):
            raise TypeError(
                f"expected completion text to be str, got {type(text).__name__}"
            )
    except (KeyError, IndexError, TypeError) as e:
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", data)
        raise
    return text.strip()
# ============================================================
# Public API
# ============================================================
def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """
    Refine a draft answer through the primary LLM backend (Cortex entrypoint).

    Builds the refine prompt, asks the primary backend for a rewritten
    answer, and falls back to the unmodified draft if that call raises.

    Returns:
        A dict with:
            "final_output": <str>          text to hand on to persona / user
            "used_primary_backend": bool   True when the backend call succeeded
            "fallback_used": bool          True when the backend call raised
            "debug": {...}                 present only when REFINER_DEBUG is on
    """
    # An empty draft leaves nothing to refine: skip the backend entirely.
    if not draft_output:
        return {
            "final_output": "",
            "used_primary_backend": False,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)

    final_text = draft_output
    backend_ok = True
    try:
        candidate = _call_primary_llm(prompt)
    except Exception as e:
        # Best effort by design: any backend failure degrades to the draft.
        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
        backend_ok = False
    else:
        # An empty completion is not an error; the draft is kept instead.
        if candidate:
            final_text = candidate

    result: Dict[str, Any] = {
        "final_output": final_text,
        "used_primary_backend": backend_ok,
        "fallback_used": not backend_ok,
    }

    if REFINER_DEBUG:
        # Truncate the prompt so debug output stays a manageable size.
        result["debug"] = {
            "prompt": prompt[:4000],
        }
    return result