From fe86759cfd25f109c70c70065531b402c5d18bcd Mon Sep 17 00:00:00 2001
From: serversdwn
Date: Fri, 12 Dec 2025 02:58:23 -0500
Subject: [PATCH] v0.5.2 - fixed: llm router async, relay-UI mismatch, intake
 summarization failure, among others. Memory relevance thresh. increased.

---
 CHANGELOG.md             | 49 +++++++++++++++++++++++++++++++
 core/relay/server.js     |  2 +-
 cortex/intake/intake.py  |  6 ++++
 cortex/llm/llm_router.py | 63 +++++++++++++++++++++++++++++++---------
 4 files changed, 106 insertions(+), 14 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ab30ad6..c895d52 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,55 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Se
 
 ---
 
+## [0.5.2] - 2025-12-12
+
+### Fixed - LLM Router & Async HTTP
+- **Critical**: Replaced synchronous `requests` with async `httpx` in LLM router [cortex/llm/llm_router.py](cortex/llm/llm_router.py)
+  - Event loop blocking was causing timeouts and empty responses
+  - All three providers (MI50, Ollama, OpenAI) now use `await http_client.post()`
+  - Fixes "Expecting value: line 1 column 1 (char 0)" JSON parsing errors in intake
+- **Critical**: Fixed missing `backend` parameter in intake summarization [cortex/intake/intake.py:285](cortex/intake/intake.py#L285)
+  - Was defaulting to PRIMARY (MI50) instead of respecting `INTAKE_LLM=SECONDARY`
+  - Now correctly uses configured backend (Ollama on 3090)
+- **Relay**: Fixed session ID case mismatch [core/relay/server.js:87](core/relay/server.js#L87)
+  - UI sends `sessionId` (camelCase) but relay expected `session_id` (snake_case)
+  - Now accepts both variants: `req.body.session_id || req.body.sessionId`
+  - Custom session IDs now properly tracked instead of defaulting to "default"
+
+### Added - Error Handling & Diagnostics
+- Added comprehensive error handling in LLM router for all providers
+  - HTTPError, JSONDecodeError, KeyError, and generic Exception handling
+  - Detailed error messages with exception type and description
+  - Provider-specific error logging (mi50, ollama, openai)
+- Added debug logging in intake summarization
+  - Logs LLM response length and preview
+  - Validates non-empty responses before JSON parsing
+  - Helps diagnose empty or malformed responses
+
+### Added - Session Management
+- Added session persistence endpoints in relay [core/relay/server.js:160-171](core/relay/server.js#L160-L171)
+  - `GET /sessions/:id` - Retrieve session history
+  - `POST /sessions/:id` - Save session history
+  - In-memory storage using Map (ephemeral, resets on container restart)
+  - Fixes UI "Failed to load session" errors
+
+### Changed - Provider Configuration
+- Added `mi50` provider support for llama.cpp server [cortex/llm/llm_router.py:62-81](cortex/llm/llm_router.py#L62-L81)
+  - Uses `/completion` endpoint with `n_predict` parameter
+  - Extracts `content` field from response
+  - Configured for MI50 GPU with DeepSeek model
+- Increased memory retrieval threshold from 0.78 to 0.90 [cortex/.env:20](cortex/.env#L20)
+  - Filters out low-relevance memories (only returns 90%+ similarity)
+  - Reduces noise in context retrieval
+
+### Technical Improvements
+- Unified async HTTP handling across all LLM providers
+- Better separation of concerns between provider implementations
+- Improved error messages for debugging LLM API failures
+- Consistent timeout handling (120 seconds for all providers)
+
+---
+
 ## [0.5.1] - 2025-12-11
 
 ### Fixed - Intake Integration
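The relay change in the next diff accepts the camelCase `sessionId` the UI sends; together with the session endpoints noted in the changelog, it can be exercised with a short client-side check. This is a hedged sketch, not part of the patch: the relay port (7078) and the shape of the `/sessions/:id` response are assumptions, so adjust them to your deployment.

```python
# Hedged smoke-test sketch for the relay session handling (not part of the patch).
# Assumptions: relay reachable at localhost:7078; /sessions/:id returns JSON text.
import httpx

RELAY = "http://localhost:7078"  # assumed port

# camelCase sessionId (as the UI sends it) should now be honored by the relay
chat = httpx.post(
    f"{RELAY}/v1/chat/completions",
    json={
        "sessionId": "smoke-test-1",
        "messages": [{"role": "user", "content": "ping"}],
    },
    timeout=120.0,
)
chat.raise_for_status()

# the session persistence endpoint should return history for the same id
history = httpx.get(f"{RELAY}/sessions/smoke-test-1", timeout=10.0)
print(history.status_code, history.text[:200])
```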
diff --git a/core/relay/server.js b/core/relay/server.js
index 357ca14..c0e7c2a 100644
--- a/core/relay/server.js
+++ b/core/relay/server.js
@@ -84,7 +84,7 @@ app.get("/_health", (_, res) => {
 // -----------------------------------------------------
 app.post("/v1/chat/completions", async (req, res) => {
   try {
-    const session_id = req.body.session_id || req.body.user || "default";
+    const session_id = req.body.session_id || req.body.sessionId || req.body.user || "default";
     const messages = req.body.messages || [];
     const lastMessage = messages[messages.length - 1];
     const user_msg = lastMessage?.content || "";
diff --git a/cortex/intake/intake.py b/cortex/intake/intake.py
index 50b192d..f5d9cba 100644
--- a/cortex/intake/intake.py
+++ b/cortex/intake/intake.py
@@ -282,11 +282,17 @@ JSON only. No text outside JSON.
 
     try:
         llm_response = await call_llm(
             prompt,
+            backend=INTAKE_LLM,
             temperature=0.2
         )
+        print(f"[Intake] LLM response length: {len(llm_response) if llm_response else 0}")
+        print(f"[Intake] LLM response preview: {llm_response[:200] if llm_response else '(empty)'}")
 
         # LLM should return JSON, parse it
+        if not llm_response or not llm_response.strip():
+            raise ValueError("Empty response from LLM")
+
         summary = json.loads(llm_response)
 
         return {
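The intake hunk above now routes summarization to the configured backend and rejects empty LLM output before `json.loads`. The same guard, written as a standalone helper, looks roughly like this; a hedged sketch only — `parse_summary` is hypothetical, and the code-fence stripping is an optional extension this patch does not perform.

```python
# Hypothetical helper mirroring the validation added in intake.py.
# parse_summary is not part of the codebase; the fence-stripping branch is an
# optional extension for models that wrap their JSON in ``` blocks.
import json

def parse_summary(llm_response: str) -> dict:
    if not llm_response or not llm_response.strip():
        raise ValueError("Empty response from LLM")
    text = llm_response.strip()
    if text.startswith("```"):
        # drop surrounding backticks and a possible "json" language tag
        text = text.strip("`")
        if "\n" in text:
            text = text.split("\n", 1)[1]
    return json.loads(text)
```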
logger.error(f"Unexpected error calling ollama: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"Unexpected error (ollama): {type(e).__name__}: {str(e)}") # ------------------------------- @@ -103,9 +129,20 @@ async def call_llm( "temperature": temperature, "max_tokens": max_tokens, } - r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120) - data = r.json() - return data["choices"][0]["message"]["content"] + try: + r = await http_client.post(f"{url}/chat/completions", json=payload, headers=headers) + r.raise_for_status() + data = r.json() + return data["choices"][0]["message"]["content"] + except httpx.HTTPError as e: + logger.error(f"HTTP error calling openai: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"LLM API error (openai): {type(e).__name__}: {str(e)}") + except (KeyError, json.JSONDecodeError) as e: + logger.error(f"Response parsing error from openai: {e}") + raise RuntimeError(f"Invalid response format (openai): {e}") + except Exception as e: + logger.error(f"Unexpected error calling openai: {type(e).__name__}: {str(e)}") + raise RuntimeError(f"Unexpected error (openai): {type(e).__name__}: {str(e)}") # ------------------------------- # Unknown provider