Update to v0.9.1 #1
@@ -326,11 +326,33 @@ def bg_summarize(session_id: str):
 # ─────────────────────────────
 # Internal entrypoint for Cortex
 # ─────────────────────────────
+def get_recent_messages(session_id: str, limit: int = 20) -> list:
+    """
+    Get recent raw messages from the session buffer.
+
+    Args:
+        session_id: Session identifier
+        limit: Maximum number of messages to return (default 20)
+
+    Returns:
+        List of message dicts with 'role' and 'content' fields
+    """
+    if session_id not in SESSIONS:
+        return []
+
+    buffer = SESSIONS[session_id]["buffer"]
+
+    # Convert buffer to list and get last N messages
+    messages = list(buffer)[-limit:]
+
+    return messages
+
+
 def add_exchange_internal(exchange: dict):
     """
     Direct internal call — bypasses FastAPI request handling.
     Cortex uses this to feed user/assistant turns directly
-    into Intake’s buffer and trigger full summarization.
+    into Intake's buffer and trigger full summarization.
     """
     session_id = exchange.get("session_id")
     if not session_id:
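For context, a minimal in-process usage sketch of the new helper. It assumes Intake is importable as intake.intake (as the Cortex hunk further down does) and that a session with the hypothetical id "demo" already has turns in its buffer:

    from intake.intake import get_recent_messages

    # Returns [] for unknown sessions, otherwise the last `limit` buffered dicts
    recent = get_recent_messages("demo", limit=5)
    for msg in recent:
        print(f"{msg['role']}: {msg['content'][:60]}")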
@@ -44,11 +44,22 @@ http_client = httpx.AsyncClient(timeout=120.0)
 # Public call
 # ------------------------------------------------------------
 async def call_llm(
-    prompt: str,
+    prompt: str = None,
+    messages: list = None,
     backend: str | None = None,
     temperature: float = 0.7,
     max_tokens: int = 512,
 ):
+    """
+    Call an LLM backend.
+
+    Args:
+        prompt: String prompt (for completion-style APIs like mi50)
+        messages: List of message dicts (for chat-style APIs like Ollama/OpenAI)
+        backend: Which backend to use (PRIMARY, SECONDARY, OPENAI, etc.)
+        temperature: Sampling temperature
+        max_tokens: Maximum tokens to generate
+    """
     backend = (backend or DEFAULT_BACKEND).upper()
 
     if backend not in BACKENDS:
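With the widened signature, callers can pass either a flat prompt or a chat-style messages list. A rough sketch of both styles, inside an async function; the PRIMARY/SECONDARY names and their mapping to Ollama and the mi50 follow the docstring above and are assumptions about the deployed backend config:

    # Completion-style backend (assumed here to be the mi50 path): flat prompt string.
    text = await call_llm(prompt="Summarize the session so far.", backend="SECONDARY")

    # Chat-style backend (e.g. Ollama/OpenAI): messages list instead of a prompt.
    text = await call_llm(
        messages=[{"role": "user", "content": "Summarize the session so far."}],
        backend="PRIMARY",
        temperature=0.3,
        max_tokens=256,
    )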
@@ -69,7 +80,8 @@ async def call_llm(
         payload = {
             "prompt": prompt,
             "n_predict": max_tokens,
-            "temperature": temperature
+            "temperature": temperature,
+            "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
         }
         try:
             r = await http_client.post(f"{url}/completion", json=payload)
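For a flattened prompt, this branch now posts roughly the following payload (values illustrative, using the signature defaults); the added stop list keeps the completion backend from generating extra "User:" / "Assistant:" turns of its own:

    payload = {
        "prompt": "User: Hi there\n\nAssistant: ",
        "n_predict": 512,
        "temperature": 0.7,
        "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
    }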
@@ -90,12 +102,20 @@ async def call_llm(
     # Provider: OLLAMA (your 3090)
     # -------------------------------
     if provider == "ollama":
+        # Use messages array if provided, otherwise convert prompt to single user message
+        if messages:
+            chat_messages = messages
+        else:
+            chat_messages = [{"role": "user", "content": prompt}]
+
         payload = {
             "model": model,
-            "messages": [
-                {"role": "user", "content": prompt}
-            ],
-            "stream": False
+            "messages": chat_messages,
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens
+            }
         }
         try:
             r = await http_client.post(f"{url}/api/chat", json=payload)
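Because of the chat_messages fallback, a prompt-only call still works against /api/chat. For example, call_llm(prompt="Hello", backend=...) routed to this branch would build roughly the following (model comes from the backend config, the option values from the signature defaults):

    payload = {
        "model": model,
        "messages": [{"role": "user", "content": "Hello"}],
        "stream": False,
        "options": {"temperature": 0.7, "num_predict": 512}
    }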
@@ -351,17 +351,34 @@ async def run_simple(req: ReasonRequest):
     logger.info(f"📝 User: {req.user_prompt[:150]}...")
     logger.info(f"{'-'*100}\n")
 
-    # Get conversation history from context
+    # Get conversation history from context and intake buffer
     context_state = await collect_context(req.session_id, req.user_prompt)
 
-    # Build simple conversation history
-    messages = []
-    if context_state.get("recent_messages"):
-        for msg in context_state["recent_messages"]:
+    # Get recent messages from Intake buffer
+    from intake.intake import get_recent_messages
+    recent_msgs = get_recent_messages(req.session_id, limit=20)
+    logger.info(f"📋 Retrieved {len(recent_msgs)} recent messages from Intake buffer")
+
+    # Build simple conversation history with system message
+    system_message = {
+        "role": "system",
+        "content": (
+            "You are a helpful AI assistant. Provide direct, concise responses to the user's questions. "
+            "Maintain context from previous messages in the conversation."
+        )
+    }
+
+    messages = [system_message]
+
+    # Add conversation history
+
+    if recent_msgs:
+        for msg in recent_msgs:
             messages.append({
                 "role": msg.get("role", "user"),
                 "content": msg.get("content", "")
             })
+            logger.info(f" - {msg.get('role')}: {msg.get('content', '')[:50]}...")
 
     # Add current user message
     messages.append({
@@ -369,30 +386,24 @@ async def run_simple(req: ReasonRequest):
         "content": req.user_prompt
     })
 
-    # Format messages into a simple prompt for the LLM
-    conversation = ""
-    for msg in messages:
-        role = msg["role"]
-        content = msg["content"]
-        if role == "user":
-            conversation += f"User: {content}\n\n"
-        elif role == "assistant":
-            conversation += f"Assistant: {content}\n\n"
-
-    conversation += "Assistant: "
+    logger.info(f"📨 Total messages being sent to LLM: {len(messages)} (including system message)")
 
     # Get backend from env (default to OPENAI for standard mode)
     backend = os.getenv("STANDARD_MODE_LLM", "OPENAI")
     temperature = req.temperature if req.temperature is not None else 0.7
 
-    # Direct LLM call
+    # Direct LLM call with messages (works for Ollama/OpenAI chat APIs)
     try:
-        response = await call_llm(
-            prompt=conversation,
+        raw_response = await call_llm(
+            messages=messages,
             backend=backend,
             temperature=temperature,
             max_tokens=2048
         )
+
+        # Clean response - just strip whitespace
+        response = raw_response.strip()
+
     except Exception as e:
         logger.error(f"❌ LLM call failed: {e}")
         response = f"Error: {str(e)}"
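Taken together, the two run_simple hunks mean call_llm now receives a chat-style list rather than a flattened string. Roughly, with the history entries illustrative and drawn from the Intake buffer retrieved above:

    messages = [
        {"role": "system", "content": "You are a helpful AI assistant. ..."},
        {"role": "user", "content": "What's the capital of France?"},   # from Intake buffer
        {"role": "assistant", "content": "Paris."},                     # from Intake buffer
        {"role": "user", "content": req.user_prompt},                   # current turn
    ]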