From fe86759cfd25f109c70c70065531b402c5d18bcd Mon Sep 17 00:00:00 2001
From: serversdwn
Date: Fri, 12 Dec 2025 02:58:23 -0500
Subject: [PATCH] v0.5.2 - fixed: llm router async, relay-UI mismatch, intake
 summarization failure, among others. Memory relevance thresh. increased.

---
 CHANGELOG.md             | 49 +++++++++++++++++++++++++++++++
 core/relay/server.js     |  2 +-
 cortex/intake/intake.py  |  6 ++++
 cortex/llm/llm_router.py | 63 +++++++++++++++++++++++++++++++---------
 4 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab30ad6..c895d52 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,55 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Se
 
 ---
 
+## [0.5.2] - 2025-12-12
+
+### Fixed - LLM Router & Async HTTP
+- **Critical**: Replaced synchronous `requests` with async `httpx` in LLM router [cortex/llm/llm_router.py](cortex/llm/llm_router.py)
+  - Event loop blocking was causing timeouts and empty responses
+  - All three providers (MI50, Ollama, OpenAI) now use `await http_client.post()`
+  - Fixes "Expecting value: line 1 column 1 (char 0)" JSON parsing errors in intake
+- **Critical**: Fixed missing `backend` parameter in intake summarization [cortex/intake/intake.py:285](cortex/intake/intake.py#L285)
+  - Was defaulting to PRIMARY (MI50) instead of respecting `INTAKE_LLM=SECONDARY`
+  - Now correctly uses configured backend (Ollama on 3090)
+- **Relay**: Fixed session ID case mismatch [core/relay/server.js:87](core/relay/server.js#L87)
+  - UI sends `sessionId` (camelCase) but relay expected `session_id` (snake_case)
+  - Now accepts both variants: `req.body.session_id || req.body.sessionId`
+  - Custom session IDs now properly tracked instead of defaulting to "default"
+
+### Added - Error Handling & Diagnostics
+- Added comprehensive error handling in LLM router for all providers
+  - HTTPError, JSONDecodeError, KeyError, and generic Exception handling
+  - Detailed error messages with exception type and description
+  - Provider-specific error logging (mi50, ollama, openai)
+- Added debug logging in intake summarization
+  - Logs LLM response length and preview
+  - Validates non-empty responses before JSON parsing
+  - Helps diagnose empty or malformed responses
+
+### Added - Session Management
+- Added session persistence endpoints in relay [core/relay/server.js:160-171](core/relay/server.js#L160-L171)
+  - `GET /sessions/:id` - Retrieve session history
+  - `POST /sessions/:id` - Save session history
+  - In-memory storage using Map (ephemeral, resets on container restart)
+  - Fixes UI "Failed to load session" errors
+
+### Changed - Provider Configuration
+- Added `mi50` provider support for llama.cpp server [cortex/llm/llm_router.py:62-81](cortex/llm/llm_router.py#L62-L81)
+  - Uses `/completion` endpoint with `n_predict` parameter
+  - Extracts `content` field from response
+  - Configured for MI50 GPU with DeepSeek model
+- Increased memory retrieval threshold from 0.78 to 0.90 [cortex/.env:20](cortex/.env#L20)
+  - Filters out low-relevance memories (only returns 90%+ similarity)
+  - Reduces noise in context retrieval
+
+### Technical Improvements
+- Unified async HTTP handling across all LLM providers
+- Better separation of concerns between provider implementations
+- Improved error messages for debugging LLM API failures
+- Consistent timeout handling (120 seconds for all providers)
+
+---
+
 ## [0.5.1] - 2025-12-11
 
 ### Fixed - Intake Integration
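The relay change in the next diff accepts the camelCase `sessionId` the UI sends; together with the session endpoints noted in the changelog, it can be exercised with a short client-side check. This is a hedged sketch, not part of the patch: the relay port (7078) and the shape of the `/sessions/:id` response are assumptions, so adjust them to your deployment.

```python
# Hedged smoke-test sketch for the relay session handling (not part of the patch).
# Assumptions: relay reachable at localhost:7078; /sessions/:id returns JSON text.
import httpx

RELAY = "http://localhost:7078"  # assumed port

# camelCase sessionId (as the UI sends it) should now be honored by the relay
chat = httpx.post(
    f"{RELAY}/v1/chat/completions",
    json={
        "sessionId": "smoke-test-1",
        "messages": [{"role": "user", "content": "ping"}],
    },
    timeout=120.0,
)
chat.raise_for_status()

# the session persistence endpoint should return history for the same id
history = httpx.get(f"{RELAY}/sessions/smoke-test-1", timeout=10.0)
print(history.status_code, history.text[:200])
```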
diff --git a/core/relay/server.js b/core/relay/server.js
index 357ca14..c0e7c2a 100644
--- a/core/relay/server.js
+++ b/core/relay/server.js
@@ -84,7 +84,7 @@ app.get("/_health", (_, res) => {
 // -----------------------------------------------------
 app.post("/v1/chat/completions", async (req, res) => {
   try {
-    const session_id = req.body.session_id || req.body.user || "default";
+    const session_id = req.body.session_id || req.body.sessionId || req.body.user || "default";
     const messages = req.body.messages || [];
     const lastMessage = messages[messages.length - 1];
     const user_msg = lastMessage?.content || "";
diff --git a/cortex/intake/intake.py b/cortex/intake/intake.py
index 50b192d..f5d9cba 100644
--- a/cortex/intake/intake.py
+++ b/cortex/intake/intake.py
@@ -282,11 +282,17 @@ JSON only. No text outside JSON.
 
     try:
         llm_response = await call_llm(
             prompt,
+            backend=INTAKE_LLM,
             temperature=0.2
         )
+        print(f"[Intake] LLM response length: {len(llm_response) if llm_response else 0}")
+        print(f"[Intake] LLM response preview: {llm_response[:200] if llm_response else '(empty)'}")
 
         # LLM should return JSON, parse it
+        if not llm_response or not llm_response.strip():
+            raise ValueError("Empty response from LLM")
+
         summary = json.loads(llm_response)
 
         return {
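The intake hunk above now routes summarization to the configured backend and rejects empty LLM output before `json.loads`. The same guard, written as a standalone helper, looks roughly like this; a hedged sketch only — `parse_summary` is hypothetical, and the code-fence stripping is an optional extension this patch does not perform.

```python
# Hypothetical helper mirroring the validation added in intake.py.
# parse_summary is not part of the codebase; the fence-stripping branch is an
# optional extension for models that wrap their JSON in ``` blocks.
import json

def parse_summary(llm_response: str) -> dict:
    if not llm_response or not llm_response.strip():
        raise ValueError("Empty response from LLM")
    text = llm_response.strip()
    if text.startswith("```"):
        # drop surrounding backticks and a possible "json" language tag
        text = text.strip("`")
        if "\n" in text:
            text = text.split("\n", 1)[1]
    return json.loads(text)
```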
logger.error(f"Unexpected error calling ollama: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"Unexpected error (ollama): {type(e).__name__}: {str(e)}") # ------------------------------- @@ -103,9 +129,20 @@ async def call_llm( "temperature": temperature, "max_tokens": max_tokens, } - r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120) - data = r.json() - return data["choices"][0]["message"]["content"] + try: + r = await http_client.post(f"{url}/chat/completions", json=payload, headers=headers) + r.raise_for_status() + data = r.json() + return data["choices"][0]["message"]["content"] + except httpx.HTTPError as e: + logger.error(f"HTTP error calling openai: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"LLM API error (openai): {type(e).__name__}: {str(e)}") + except (KeyError, json.JSONDecodeError) as e: + logger.error(f"Response parsing error from openai: {e}") + raise RuntimeError(f"Invalid response format (openai): {e}") + except Exception as e: + logger.error(f"Unexpected error calling openai: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"Unexpected error (openai): {type(e).__name__}: {str(e)}") # ------------------------------- # Unknown provider