# project-lyra/cortex/reasoning/refine.py

# refine.py
import os
import json
import logging
from typing import Any, Dict, Optional

import requests

logger = logging.getLogger(__name__)

# ============================================================
# Config
# ============================================================

# URL the refiner POSTs to; None when LLM_PRIMARY_URL is unset.
PRIMARY_URL = os.environ.get("LLM_PRIMARY_URL")
# Model name sent in the request payload.
PRIMARY_MODEL = os.environ.get("LLM_PRIMARY_MODEL", "mythomax")

# Sampling / length settings, parsed once at import time.
REFINER_TEMPERATURE = float(os.environ.get("REFINER_TEMPERATURE", "0.3"))
REFINER_MAX_TOKENS = int(os.environ.get("REFINER_MAX_TOKENS", "768"))
# Only the value "true" (compared case-insensitively) enables debug output.
REFINER_DEBUG = "true" == os.environ.get("REFINER_DEBUG", "false").lower()


# ============================================================
# Prompt builder
# ============================================================

def build_refine_prompt(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> str:
    """
    Build a single text prompt for vLLM /v1/completions.
    Persona styling is *not* applied here; this is internal reasoning.

    Args:
        draft_output: The draft answer to be rewritten.
        reflection_notes: Notes about the draft. A string is used verbatim;
            any other object is serialised as compact JSON, falling back to
            ``str()`` when it is not JSON-serialisable. ``None`` and empty
            strings / lists / tuples / dicts are rendered as "(none)".
        identity_block: Identity constraints; falsy values render as "(none)".
        rag_block: RAG context; falsy values render as "(none)".

    Returns:
        The fully assembled prompt text.
    """

    reflection_text: str
    is_empty_container = (
        isinstance(reflection_notes, (str, list, tuple, dict))
        and not reflection_notes
    )
    if reflection_notes is None or is_empty_container:
        # Treat "no notes" uniformly with the identity / RAG sections so the
        # model never sees a blank section or a bare "[]" / "{}".
        reflection_text = "(none)"
    elif isinstance(reflection_notes, str):
        reflection_text = reflection_notes
    else:
        # dict / list → compact JSON
        try:
            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
        except Exception:
            # Best effort: prompt building must not fail just because the
            # notes contain something JSON cannot encode.
            reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

    prompt = f"""You are Lyra Cortex's internal refiner.

Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
  but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.

If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.

Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.

------------------------------
[IDENTITY BLOCK]
{identity_text}

------------------------------
[RAG CONTEXT]
{rag_text}

------------------------------
[DRAFT ANSWER]
{draft_output}

------------------------------
[REFLECTION NOTES]
{reflection_text}

------------------------------
Task:
Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.

Return ONLY the final answer text. No headings, no labels, no commentary.
"""
    return prompt


# ============================================================
# vLLM call (PRIMARY backend only)
# ============================================================

def _call_primary_llm(prompt: str) -> str:
    """
    POST the prompt to the primary backend and return the completion text.

    The request carries the configured model, max_tokens and temperature.
    The reply is read as ``choices[0].text`` and returned with surrounding
    whitespace stripped.

    Raises:
        RuntimeError: if LLM_PRIMARY_URL is not configured.
        Exception: HTTP errors from ``raise_for_status()`` propagate, and any
            error while extracting the text is logged and re-raised.
    """
    if not PRIMARY_URL:
        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")

    request_body = {
        "model": PRIMARY_MODEL,
        "prompt": prompt,
        "max_tokens": REFINER_MAX_TOKENS,
        "temperature": REFINER_TEMPERATURE,
    }
    request_headers = {"Content-Type": "application/json"}

    response = requests.post(
        PRIMARY_URL,
        headers=request_headers,
        json=request_body,
        timeout=120,
    )
    response.raise_for_status()
    parsed = response.json()

    # vLLM /v1/completions format: the text lives at choices[0].text
    try:
        first_choice = parsed["choices"][0]
        completion = first_choice["text"]
    except Exception as exc:
        logger.error("refine.py: unable to parse primary LLM response: %s", exc)
        logger.debug("refine.py raw response: %s", parsed)
        raise

    return completion.strip()


# ============================================================
# Public API
# ============================================================

def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
) -> Dict[str, Any]:
    """
    Main entrypoint used by Cortex.

    Builds the refine prompt, asks the primary backend to rewrite the draft,
    and falls back to the unmodified draft if that call raises.

    Returns a dict with:
      "final_output":         <str>  text to hand on to persona / user
      "used_primary_backend": bool   True when the backend call succeeded
      "fallback_used":        bool   True when the backend call raised
      "debug":                dict   only present when REFINER_DEBUG=true
                                     (and the draft was non-empty)
    """

    # Nothing to refine. Don't get cute.
    if not draft_output:
        return dict(final_output="", used_primary_backend=False, fallback_used=False)

    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)

    outcome: Dict[str, Any]
    try:
        refined_text = _call_primary_llm(prompt)
    except Exception as err:
        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", err)
        outcome = {
            "final_output": draft_output,
            "used_primary_backend": False,
            "fallback_used": True,
        }
    else:
        # An empty completion is replaced by the draft.
        outcome = {
            "final_output": refined_text if refined_text else draft_output,
            "used_primary_backend": True,
            "fallback_used": False,
        }

    if REFINER_DEBUG:
        # Truncate so the debug payload doesn't nuke logs.
        outcome["debug"] = {"prompt": prompt[:4000]}

    return outcome