// Send the conversation to a single backend and normalize its reply.
// Throws on missing config, network failures, or non-2xx responses so the
// caller can fall back to the next backend.
async function tryBackend(backend, messages) {
  if (!backend.url || !backend.model) throw new Error("missing url/model");

  const isOllama = backend.type === "ollama";
  const isOpenAI = backend.type === "openai";
  const isVllm = backend.type === "vllm";
  const isLlamaCpp = backend.type === "llamacpp";

  let endpoint = backend.url;
  const headers = { "Content-Type": "application/json" };
  if (isOpenAI) headers["Authorization"] = `Bearer ${OPENAI_API_KEY}`;

  // Choose the correct endpoint automatically
  if (isOllama && !endpoint.endsWith("/api/chat")) endpoint += "/api/chat";
  if ((isVllm || isLlamaCpp) && !endpoint.endsWith("/v1/completions")) endpoint += "/v1/completions";
  if (isOpenAI && !endpoint.endsWith("/v1/chat/completions")) endpoint += "/v1/chat/completions";

  // Build the payload based on backend style: vLLM and llama.cpp use the
  // completions API (flat prompt string); Ollama's /api/chat and OpenAI's
  // /v1/chat/completions both accept the same chat-style body.
  const body = (isVllm || isLlamaCpp)
    ? {
        model: backend.model,
        prompt: messages.map(m => m.content).join("\n"),
        max_tokens: 400,
        temperature: 0.3,
      }
    : { model: backend.model, messages, stream: false };

  const resp = await fetch(endpoint, {
    method: "POST",
    headers,
    body: JSON.stringify(body),
    // Native fetch has no `timeout` option; abort via an AbortSignal instead.
    signal: AbortSignal.timeout(120000),
  });
  if (!resp.ok) throw new Error(`${backend.key} HTTP ${resp.status}`);
  const raw = await resp.text();

  // 🧩 Normalize replies
  let reply = "";
  let parsedData = null;

  try {
    if (isOllama) {
      // Ollama sometimes returns NDJSON lines; merge them
      const merged = raw
        .split("\n")
        .filter(line => line.trim().startsWith("{"))
        .map(line => JSON.parse(line))
        .map(obj => obj.message?.content || obj.response || "")
        .join("");
      reply = merged.trim();
    } else {
      parsedData = JSON.parse(raw);
      reply =
        parsedData?.choices?.[0]?.text?.trim() ||
        parsedData?.choices?.[0]?.message?.content?.trim() ||
        parsedData?.message?.content?.trim() ||
        "";
    }
  } catch (err) {
    reply = `[parse error: ${err.message}]`;
  }

  return { reply, raw, parsedData, backend: backend.key };
}
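
// Shape of a successful result (illustrative, mirroring the return statement above):
//   { reply: "<normalized text>", raw: "<unparsed body>", parsedData: <object or null>, backend: "primary" }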

// ------------------------------------
// Structured logging helper
// ------------------------------------
const LOG_DETAIL = process.env.LOG_DETAIL_LEVEL || "summary"; // minimal | summary | detailed | verbose

function logLLMCall(backend, messages, result, error = null) {
  // Time-of-day portion of the ISO timestamp (HH:MM:SS.mmm, trailing "Z" dropped)
  const timestamp = new Date().toISOString().split('T')[1].slice(0, -1);

  if (error) {
    // Always log errors
    console.warn(`⚠️ [LLM] ${backend.key.toUpperCase()} failed | ${timestamp} | ${error.message}`);
    return;
  }

  // Success: log based on detail level
  if (LOG_DETAIL === "minimal") {
    return; // Don't log successful calls in minimal mode
  }

  if (LOG_DETAIL === "summary") {
    console.log(`✅ [LLM] ${backend.key.toUpperCase()} | ${timestamp} | Reply: ${result.reply.substring(0, 80)}...`);
    return;
  }

  // Detailed or verbose
  console.log(`\n${'─'.repeat(100)}`);
  console.log(`🧠 LLM CALL | Backend: ${backend.key.toUpperCase()} | ${timestamp}`);
  console.log(`${'─'.repeat(100)}`);

  // Show prompt preview
  const lastMsg = messages[messages.length - 1];
  const promptPreview = (lastMsg?.content || '').substring(0, 150);
  console.log(`📝 Prompt: ${promptPreview}...`);

  // Show parsed reply
  console.log(`💬 Reply: ${result.reply.substring(0, 200)}...`);

  // Show raw response only in verbose mode
  if (LOG_DETAIL === "verbose" && result.parsedData) {
    console.log(`\n╭─ RAW RESPONSE ────────────────────────────────────────────────────────────────────────────`);
    const jsonStr = JSON.stringify(result.parsedData, null, 2);
    const lines = jsonStr.split('\n');
    const maxLines = 50;

    lines.slice(0, maxLines).forEach(line => {
      console.log(`│ ${line}`);
    });

    if (lines.length > maxLines) {
      console.log(`│ ... (${lines.length - maxLines} more lines - check raw field for full response)`);
    }
    console.log(`╰${'─'.repeat(95)}`);
  }

  console.log(`${'─'.repeat(100)}\n`);
}
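
// Example "summary"-level line (illustrative, following the template above):
//   ✅ [LLM] PRIMARY | 12:34:56.789 | Reply: Here's a quick rundown of today's agenda...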

// ------------------------------------
// Export the main call helper
// ------------------------------------
// Try each configured backend in order (primary → secondary → cloud → fallback)
// and return the first successful result.
export async function callSpeechLLM(messages) {
  const backends = [
    { key: "primary",   type: "vllm",     url: process.env.LLM_PRIMARY_URL,   model: process.env.LLM_PRIMARY_MODEL },
    { key: "secondary", type: "ollama",   url: process.env.LLM_SECONDARY_URL, model: process.env.LLM_SECONDARY_MODEL },
    { key: "cloud",     type: "openai",   url: process.env.LLM_CLOUD_URL,     model: process.env.LLM_CLOUD_MODEL },
    { key: "fallback",  type: "llamacpp", url: process.env.LLM_FALLBACK_URL,  model: process.env.LLM_FALLBACK_MODEL },
  ];

  const failedBackends = [];

  for (const b of backends) {
    if (!b.url || !b.model) continue;

    try {
      const out = await tryBackend(b, messages);
      logLLMCall(b, messages, out);
      return out;
    } catch (err) {
      logLLMCall(b, messages, null, err);
      failedBackends.push({ backend: b.key, error: err.message });
    }
  }

  // All backends failed: log a summary
  console.error(`\n${'='.repeat(100)}`);
  console.error(`🔴 ALL LLM BACKENDS FAILED`);
  console.error(`${'='.repeat(100)}`);
  failedBackends.forEach(({ backend, error }) => {
    console.error(`  ${backend.toUpperCase()}: ${error}`);
  });
  console.error(`${'='.repeat(100)}\n`);

  throw new Error("all_backends_failed");
}
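
// Example usage (hypothetical module path and messages; the backend URLs and
// models are read from the LLM_*_URL / LLM_*_MODEL environment variables above):
//
//   import { callSpeechLLM } from "./llmClient.js";
//
//   const { reply, backend } = await callSpeechLLM([
//     { role: "system", content: "You are a concise voice assistant." },
//     { role: "user", content: "Summarize today's agenda in two sentences." },
//   ]);
//   console.log(`[${backend}] ${reply}`);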