# refine.py
import os
import json
import logging
from typing import Any, Dict, Optional

import requests

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")
REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Build a single text prompt for vLLM /v1/completions.

    Persona styling is *not* applied here; this is internal reasoning.
    """
    reflection_text: str
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        # dict / list → compact JSON
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    prompt = f"""You are Lyra Cortex's internal refiner.

Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules), but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.

If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.

Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.

Return ONLY the final answer text. No headings, no labels, no commentary.
"""
    return prompt
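# For reference: PRIMARY_URL is expected to point at a vLLM
# /v1/completions-style endpoint. A successful response is shaped
# roughly like the sketch below (the exact field set may vary by vLLM
# version; only choices[0].text is relied on here):
#
#   {
#     "id": "cmpl-...",
#     "object": "text_completion",
#     "model": "mythomax",
#     "choices": [{"index": 0, "text": "...", "finish_reason": "stop"}],
#     "usage": {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...}
#   }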
""" return prompt # ============================================================ # vLLM call (PRIMARY backend only) # ============================================================ def _call_primary_llm(prompt: str) -> str: if not PRIMARY_URL: raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py") payload = { "model": PRIMARY_MODEL, "prompt": prompt, "max_tokens": REFINER_MAX_TOKENS, "temperature": REFINER_TEMPERATURE, } resp = requests.post( PRIMARY_URL, headers={"Content-Type": "application/json"}, json=payload, timeout=120, ) resp.raise_for_status() data = resp.json() # vLLM /v1/completions format try: text = data["choices"][0]["text"] except Exception as e: logger.error("refine.py: unable to parse primary LLM response: %s", e) logger.debug("refine.py raw response: %s", data) raise return text.strip() # ============================================================ # Public API # ============================================================ def refine_answer( draft_output: str, reflection_notes: Optional[Any], identity_block: Optional[str], rag_block: Optional[str], ) -> Dict[str, Any]: """ Main entrypoint used by Cortex. Returns: { "final_output": , # what should go to persona / user "used_primary_backend": True/False, "fallback_used": True/False, optionally: "debug": {...} # only when REFINER_DEBUG=true } """ if not draft_output: # Nothing to refine. Don't get cute. return { "final_output": "", "used_primary_backend": False, "fallback_used": False, } prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block) try: refined = _call_primary_llm(prompt) result: Dict[str, Any] = { "final_output": refined or draft_output, "used_primary_backend": True, "fallback_used": False, } except Exception as e: logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e) result = { "final_output": draft_output, "used_primary_backend": False, "fallback_used": True, } if REFINER_DEBUG: result["debug"] = { "prompt": prompt[:4000], # don’t nuke logs } return result