# refine.py
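"""Refinement pass for Lyra Cortex.

Builds a refinement prompt from a draft answer plus optional reflection notes,
identity block, and RAG context, then asks the configured LLM backend (via
llm.llm_router.call_llm) for a single final answer. If the backend call fails,
the unmodified draft is returned.
"""
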
import os
import json
import logging
from typing import Any, Dict, Optional

from llm.llm_router import call_llm

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

# Module-level backend selection
REFINE_LLM = os.getenv("REFINE_LLM", "PRIMARY").upper()
CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
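    """Assemble the refinement prompt from the draft and its supporting context.

    Any missing block is rendered as "(none)". Non-string reflection notes are
    serialized to JSON when possible, otherwise stringified.
    """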
    if reflection_notes is None:
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    return f"""
You are Lyra Cortex's internal refiner.

Your job:
- Fix factual errors, logical gaps, or missing info.
- Use reflection notes for corrections.
- Use RAG context as factual grounding.
- Respect the identity block without adding style or personality.

Never mention RAG, reflection, or internal logic.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer.
Return ONLY the final answer text.
""".strip()


# ============================================================
# Public API: async, using llm_router
# ============================================================

async def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
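    """Refine ``draft_output`` into a final answer using the configured backend.

    Returns a dict with ``final_output``, ``used_backend``, and ``fallback_used``.
    On any backend error the original draft is returned unchanged and
    ``fallback_used`` is set to True.
    """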
    if not draft_output:
        return {
            "final_output": "",
            "used_backend": None,
            "fallback_used": False,
        }

    prompt = build_refine_prompt(
        draft_output,
        reflection_notes,
        identity_block,
        rag_block,
    )
    # Backend preference: REFINE_LLM, falling back to CORTEX_LLM, then "PRIMARY".
    # Both env vars default to "PRIMARY", so the fallbacks only apply when a
    # variable is explicitly set to an empty string.
    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"
    try:
        refined = await call_llm(
            prompt,
            backend=backend,
            temperature=REFINER_TEMPERATURE,
        )

        return {
            "final_output": refined.strip() if refined else draft_output,
            "used_backend": backend,
            "fallback_used": False,
        }

    except Exception as e:
        logger.error(f"refine.py backend {backend} failed: {e}")

        return {
            "final_output": draft_output,
            "used_backend": backend,
            "fallback_used": True,
        }
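

# Example (sketch, not part of this module's API): how an async caller might
# invoke refine_answer. The surrounding pipeline objects (draft, notes,
# identity, rag) are assumed to exist in the caller's context.
#
#     result = await refine_answer(
#         draft_output=draft,
#         reflection_notes=notes,
#         identity_block=identity,
#         rag_block=rag,
#     )
#     if result["fallback_used"]:
#         logger.warning("refiner fell back to the draft answer")
#     final_text = result["final_output"]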