# refine.py

import os
import json
import logging
from typing import Any, Dict, Optional

import requests

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"
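
# Example environment configuration (illustrative values only; the endpoint
# path shown for LLM_PRIMARY_URL is an assumption based on the /v1/completions
# call below, the rest are this module's defaults):
#
#   export LLM_PRIMARY_URL="http://localhost:8000/v1/completions"
#   export LLM_PRIMARY_MODEL="mythomax"
#   export REFINER_TEMPERATURE="0.3"
#   export REFINER_MAX_TOKENS="768"
#   export REFINER_DEBUG="false"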


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Build a single text prompt for vLLM /v1/completions.
    Persona styling is *not* applied here; this is internal reasoning.
    """
    reflection_text: str
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        # dict / list → compact JSON
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    prompt = f"""You are Lyra Cortex's internal refiner.

Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
  but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.

If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.

Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.

Return ONLY the final answer text. No headings, no labels, no commentary.
"""
    return prompt


# ============================================================
# vLLM call (PRIMARY backend only)
# ============================================================

def _call_primary_llm(prompt: str) -> str:
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    payload = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }

    resp = requests.post(
        PRIMARY_URL,
        headers={"Content-Type": "application/json"},
        json=payload,
        timeout=120,
    )
    resp.raise_for_status()
    data = resp.json()

    # vLLM /v1/completions format
    try:
        text = data["choices"][0]["text"]
    except Exception as e:
        logger.error("refine.py: unable to parse primary LLM response: %s", e)
        logger.debug("refine.py raw response: %s", data)
        raise

    return text.strip()
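
# For reference, the parsing above assumes the OpenAI-compatible response shape
# that vLLM's /v1/completions endpoint returns. A trimmed, illustrative example
# (not captured from a real server):
#
#   {
#     "object": "text_completion",
#     "model": "mythomax",
#     "choices": [{"index": 0, "text": "...refined answer...", "finish_reason": "stop"}],
#     "usage": {"prompt_tokens": 512, "completion_tokens": 180, "total_tokens": 692}
#   }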


# ============================================================
# Public API
# ============================================================

def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """
    Main entrypoint used by Cortex.

    Returns:
        {
            "final_output": <str>,            # what should go to persona / user
            "used_primary_backend": True/False,
            "fallback_used": True/False,

            optionally:
            "debug": {...}                    # only when REFINER_DEBUG=true
        }
    """
    if not draft_output:
        # Nothing to refine. Don't get cute.
        return {
            "final_output": "",
            "used_primary_backend": False,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)

    try:
        refined = _call_primary_llm(prompt)
        result: Dict[str, Any] = {
            "final_output": refined or draft_output,
            "used_primary_backend": True,
            "fallback_used": False,
        }
    except Exception as e:
        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
        result = {
            "final_output": draft_output,
            "used_primary_backend": False,
            "fallback_used": True,
        }

    if REFINER_DEBUG:
        result["debug"] = {
            "prompt": prompt[:4000],  # don't nuke logs
        }

    return result
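

if __name__ == "__main__":
    # Minimal manual smoke test; a sketch, not part of the Cortex pipeline.
    # It assumes LLM_PRIMARY_URL points at a reachable vLLM /v1/completions
    # endpoint. If the call fails, refine_answer falls back to the draft.
    logging.basicConfig(level=logging.INFO)

    demo = refine_answer(
        draft_output="Paris is the capitol of France with about 3 million residents.",
        reflection_notes={"issues": ["spelling: 'capital', not 'capitol'",
                                     "population figure looks too high for the city proper"]},
        identity_block="Answer plainly; no roleplay.",
        rag_block="Paris is the capital of France. City-proper population is roughly 2.1 million.",
    )
    print(json.dumps(demo, indent=2, ensure_ascii=False))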