# refine.py
"""Refinement stage: rewrites a draft answer using reflection notes and RAG context."""

import os
import json
import logging
from typing import Any, Dict, Optional

from llm.llm_router import call_llm

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))  # not yet forwarded to call_llm
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

# Module-level backend selection.
# Defaults are empty so the fallback chain in refine_answer() can actually
# fall through: REFINE_LLM -> CORTEX_LLM -> "PRIMARY".
REFINE_LLM = os.getenv("REFINE_LLM", "").upper()
CORTEX_LLM = os.getenv("CORTEX_LLM", "").upper()

# ============================================================
# Prompt builder
# ============================================================


def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """Assemble the refiner prompt from the draft, reflection notes, identity, and RAG context."""
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    return f"""
You are Lyra Cortex's internal refiner.

Your job:
- Fix factual errors, logical gaps, or missing info.
- Use reflection notes for corrections.
- Use RAG context as factual grounding.
- Respect the identity block without adding style or personality.

Never mention RAG, reflection, or internal logic.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer.
Return ONLY the final answer text.
""".strip()


# ============================================================
# Public API: async, using llm_router
# ============================================================


async def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """Refine a draft answer; on backend failure, return the draft unchanged."""
    if not draft_output:
        return {
            "final_output": "",
            "used_backend": None,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(
        draft_output,
        reflection_notes,
        identity_block,
        rag_block,
    )

    # Refinement backend → fallback to Cortex backend → fallback to PRIMARY
    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"

    try:
        refined = await call_llm(
            prompt,
            backend=backend,
            temperature=REFINER_TEMPERATURE,
        )
        return {
            "final_output": refined.strip() if refined else draft_output,
            "used_backend": backend,
            "fallback_used": False,
        }
    except Exception as e:
        logger.error(f"refine.py backend {backend} failed: {e}")
        return {
            "final_output": draft_output,
            "used_backend": backend,
            "fallback_used": True,
        }
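

# ============================================================
# Usage sketch (illustrative only)
# ============================================================
# A minimal sketch of calling refine_answer() from an async context.
# Assumptions: the llm.llm_router backends are configured via environment
# variables, and the draft/reflection/RAG values below are placeholder
# strings, not real pipeline data.

if __name__ == "__main__":
    import asyncio

    async def _demo() -> None:
        result = await refine_answer(
            draft_output="The Eiffel Tower is located in Berlin.",
            reflection_notes={"correction": "The Eiffel Tower is in Paris, not Berlin."},
            identity_block=None,
            rag_block="The Eiffel Tower is a landmark in Paris, France.",
        )
        # fallback_used is True when the chosen backend raised and the draft was returned.
        print(result["used_backend"], result["fallback_used"])
        print(result["final_output"])

    asyncio.run(_demo())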