Update to v0.9.1 #1

CHANGELOG.md
@@ -9,6 +9,55 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/) and [Se

---

## [0.5.2] - 2025-12-12

### Fixed - LLM Router & Async HTTP

- **Critical**: Replaced synchronous `requests` with async `httpx` in LLM router [cortex/llm/llm_router.py](cortex/llm/llm_router.py)
  - Event loop blocking was causing timeouts and empty responses (see the sketch after this list)
  - All three providers (MI50, Ollama, OpenAI) now use `await http_client.post()`
  - Fixes "Expecting value: line 1 column 1 (char 0)" JSON parsing errors in intake
- **Critical**: Fixed missing `backend` parameter in intake summarization [cortex/intake/intake.py:285](cortex/intake/intake.py#L285)
  - Was defaulting to PRIMARY (MI50) instead of respecting `INTAKE_LLM=SECONDARY`
  - Now correctly uses configured backend (Ollama on 3090)
- **Relay**: Fixed session ID case mismatch [core/relay/server.js:87](core/relay/server.js#L87)
  - UI sends `sessionId` (camelCase) but relay expected `session_id` (snake_case)
  - Now accepts both variants: `req.body.session_id || req.body.sessionId`
  - Custom session IDs now properly tracked instead of defaulting to "default"
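
A minimal sketch of the pattern behind the `httpx` change, using illustrative names (`call_backend`, `url`, `payload`) rather than the router's real ones; the actual change is in the llm_router.py diff further down:

```python
import httpx

# Shared client, mirroring the module-level AsyncClient the router now uses
http_client = httpx.AsyncClient(timeout=120.0)

async def call_backend(url: str, payload: dict) -> dict:
    # A synchronous requests.post() here would block the whole event loop
    # until the LLM answered (up to the full 120 s timeout), which is what
    # surfaced as relay timeouts and empty responses. The awaited call
    # yields control to the loop while the request is in flight.
    r = await http_client.post(f"{url}/api/chat", json=payload)
    r.raise_for_status()
    return r.json()
```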

### Added - Error Handling & Diagnostics

- Added comprehensive error handling in LLM router for all providers (a caller-side usage sketch follows this list)
  - HTTPError, JSONDecodeError, KeyError, and generic Exception handling
  - Detailed error messages with exception type and description
  - Provider-specific error logging (mi50, ollama, openai)
- Added debug logging in intake summarization
  - Logs LLM response length and preview
  - Validates non-empty responses before JSON parsing
  - Helps diagnose empty or malformed responses
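
Because the router now logs provider details and re-raises failures as `RuntimeError`, callers can treat every provider the same way. A hedged caller-side sketch; `summarize_or_fallback` and the import path are assumptions, not code from this PR:

```python
import logging

from cortex.llm.llm_router import call_llm  # import path assumed from the repo layout

logger = logging.getLogger(__name__)

async def summarize_or_fallback(prompt: str, backend: str) -> str:
    # One except clause now covers HTTP failures, bad JSON, and missing
    # response fields for mi50, ollama, and openai alike.
    try:
        return await call_llm(prompt, backend=backend, temperature=0.2)
    except RuntimeError as e:
        logger.error(f"LLM call failed on {backend}: {e}")
        return ""  # hypothetical fallback; real intake code may degrade differently
```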

### Added - Session Management

- Added session persistence endpoints in relay [core/relay/server.js:160-171](core/relay/server.js#L160-L171) (usage sketch after this list)
  - `GET /sessions/:id` - Retrieve session history
  - `POST /sessions/:id` - Save session history
  - In-memory storage using Map (ephemeral, resets on container restart)
  - Fixes UI "Failed to load session" errors
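
The endpoint implementation itself is not part of the diffs shown below, so this is only a client-side sketch of how the new routes would be exercised; the relay address and the `{"messages": [...]}` payload shape are assumptions:

```python
import asyncio
import httpx

RELAY_URL = "http://localhost:3000"  # assumed relay address; use your actual host/port

async def main() -> None:
    async with httpx.AsyncClient(timeout=10.0) as client:
        # Save a session's history; the relay stores whatever JSON it receives
        # for this id in its in-memory Map.
        await client.post(
            f"{RELAY_URL}/sessions/demo-session",
            json={"messages": [{"role": "user", "content": "hello"}]},
        )
        # Read it back. Because storage is ephemeral, this history disappears
        # whenever the relay container restarts.
        r = await client.get(f"{RELAY_URL}/sessions/demo-session")
        print(r.status_code, r.json())

asyncio.run(main())
```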

### Changed - Provider Configuration

- Added `mi50` provider support for llama.cpp server [cortex/llm/llm_router.py:62-81](cortex/llm/llm_router.py#L62-L81) (request/response sketch after this list)
  - Uses `/completion` endpoint with `n_predict` parameter
  - Extracts `content` field from response
  - Configured for MI50 GPU with DeepSeek model
- Increased memory retrieval threshold from 0.78 to 0.90 [cortex/.env:20](cortex/.env#L20) (filter sketch after this list)
  - Filters out low-relevance memories (only returns 90%+ similarity)
  - Reduces noise in context retrieval
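
A minimal sketch of the llama.cpp-server exchange the `mi50` provider relies on, separate from the router plumbing in the diff below; `mi50_complete` and the default token cap are illustrative:

```python
import httpx

async def mi50_complete(url: str, prompt: str, max_tokens: int = 256) -> str:
    # llama.cpp's built-in HTTP server exposes POST /completion; "n_predict"
    # caps the generated tokens and the reply carries the text in "content".
    payload = {"prompt": prompt, "n_predict": max_tokens, "temperature": 0.2}
    async with httpx.AsyncClient(timeout=120.0) as client:
        r = await client.post(f"{url}/completion", json=payload)
        r.raise_for_status()
        return r.json().get("content", "")
```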
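
The threshold itself is a one-line `.env` change; how retrieval applies it is not shown in this PR, so the filter below is a sketch under the assumption that memories are scored by similarity and anything under the configured value is dropped (the env var name is also an assumption):

```python
import os

# cortex/.env:20 holds the real key, set to 0.90; this name is illustrative.
MEMORY_SIMILARITY_THRESHOLD = float(os.getenv("MEMORY_SIMILARITY_THRESHOLD", "0.90"))

def filter_memories(scored: list[tuple[str, float]]) -> list[str]:
    # Keep only memories at or above the threshold (0.90 after this change,
    # up from 0.78), trimming low-relevance context before prompting.
    return [text for text, score in scored if score >= MEMORY_SIMILARITY_THRESHOLD]
```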

### Technical Improvements

- Unified async HTTP handling across all LLM providers (shared-client sketch below)
- Better separation of concerns between provider implementations
- Improved error messages for debugging LLM API failures
- Consistent timeout handling (120 seconds for all providers)
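
The unified handling and the 120-second timeout both come from the single module-level `httpx.AsyncClient(timeout=120.0)` added in the router diff below. One caveat, sketched here as a suggestion rather than something this PR implements: a shared client should be closed when the service shuts down.

```python
import httpx

# One client, one timeout policy, shared by the mi50, ollama, and openai calls.
http_client = httpx.AsyncClient(timeout=120.0)

async def shutdown() -> None:
    # Hypothetical shutdown hook (not part of this PR): closing the shared
    # client releases pooled connections cleanly when the service stops.
    await http_client.aclose()
```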

---

## [0.5.1] - 2025-12-11

### Fixed - Intake Integration

core/relay/server.js

@@ -84,7 +84,7 @@ app.get("/_health", (_, res) => {
 // -----------------------------------------------------
 app.post("/v1/chat/completions", async (req, res) => {
   try {
-    const session_id = req.body.session_id || req.body.user || "default";
+    const session_id = req.body.session_id || req.body.sessionId || req.body.user || "default";
     const messages = req.body.messages || [];
     const lastMessage = messages[messages.length - 1];
     const user_msg = lastMessage?.content || "";

cortex/intake/intake.py

@@ -282,11 +282,17 @@ JSON only. No text outside JSON.
     try:
         llm_response = await call_llm(
             prompt,
+            backend=INTAKE_LLM,
             temperature=0.2
         )
 
+        print(f"[Intake] LLM response length: {len(llm_response) if llm_response else 0}")
+        print(f"[Intake] LLM response preview: {llm_response[:200] if llm_response else '(empty)'}")
+
         # LLM should return JSON, parse it
+        if not llm_response or not llm_response.strip():
+            raise ValueError("Empty response from LLM")
 
         summary = json.loads(llm_response)
 
         return {

cortex/llm/llm_router.py

@@ -1,7 +1,10 @@
 # llm_router.py
 import os
-import requests
+import httpx
 import json
+import logging
+
+logger = logging.getLogger(__name__)
 
 # ------------------------------------------------------------
 # Load backend registry from root .env
@@ -33,6 +36,9 @@ BACKENDS = {
 
 DEFAULT_BACKEND = "PRIMARY"
 
+# Reusable async HTTP client
+http_client = httpx.AsyncClient(timeout=120.0)
+
 
 # ------------------------------------------------------------
 # Public call
@@ -65,9 +71,20 @@ async def call_llm(
             "n_predict": max_tokens,
             "temperature": temperature
         }
-        r = requests.post(f"{url}/completion", json=payload, timeout=120)
-        data = r.json()
-        return data["content"]
+        try:
+            r = await http_client.post(f"{url}/completion", json=payload)
+            r.raise_for_status()
+            data = r.json()
+            return data.get("content", "")
+        except httpx.HTTPError as e:
+            logger.error(f"HTTP error calling mi50: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"LLM API error (mi50): {type(e).__name__}: {str(e)}")
+        except (KeyError, json.JSONDecodeError) as e:
+            logger.error(f"Response parsing error from mi50: {e}")
+            raise RuntimeError(f"Invalid response format (mi50): {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error calling mi50: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"Unexpected error (mi50): {type(e).__name__}: {str(e)}")
 
     # -------------------------------
     # Provider: OLLAMA (your 3090)
@@ -78,13 +95,22 @@ async def call_llm(
             "messages": [
                 {"role": "user", "content": prompt}
             ],
-            "stream": False # <-- critical fix
+            "stream": False
         }
-        r = requests.post(f"{url}/api/chat", json=payload, timeout=120)
-        data = r.json()
-        return data["message"]["content"]
+        try:
+            r = await http_client.post(f"{url}/api/chat", json=payload)
+            r.raise_for_status()
+            data = r.json()
+            return data["message"]["content"]
+        except httpx.HTTPError as e:
+            logger.error(f"HTTP error calling ollama: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"LLM API error (ollama): {type(e).__name__}: {str(e)}")
+        except (KeyError, json.JSONDecodeError) as e:
+            logger.error(f"Response parsing error from ollama: {e}")
+            raise RuntimeError(f"Invalid response format (ollama): {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error calling ollama: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"Unexpected error (ollama): {type(e).__name__}: {str(e)}")
 
 
     # -------------------------------
@@ -103,9 +129,20 @@ async def call_llm(
             "temperature": temperature,
             "max_tokens": max_tokens,
         }
-        r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120)
-        data = r.json()
-        return data["choices"][0]["message"]["content"]
+        try:
+            r = await http_client.post(f"{url}/chat/completions", json=payload, headers=headers)
+            r.raise_for_status()
+            data = r.json()
+            return data["choices"][0]["message"]["content"]
+        except httpx.HTTPError as e:
+            logger.error(f"HTTP error calling openai: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"LLM API error (openai): {type(e).__name__}: {str(e)}")
+        except (KeyError, json.JSONDecodeError) as e:
+            logger.error(f"Response parsing error from openai: {e}")
+            raise RuntimeError(f"Invalid response format (openai): {e}")
+        except Exception as e:
+            logger.error(f"Unexpected error calling openai: {type(e).__name__}: {str(e)}")
+            raise RuntimeError(f"Unexpected error (openai): {type(e).__name__}: {str(e)}")
 
     # -------------------------------
     # Unknown provider