simple context added to standard mode

serversdwn
2025-12-21 13:01:00 -05:00
parent d09425c37b
commit ceb60119fb
3 changed files with 79 additions and 26 deletions

View File

@@ -326,11 +326,33 @@ def bg_summarize(session_id: str):
 # ─────────────────────────────
 # Internal entrypoint for Cortex
 # ─────────────────────────────
+def get_recent_messages(session_id: str, limit: int = 20) -> list:
+    """
+    Get recent raw messages from the session buffer.
+
+    Args:
+        session_id: Session identifier
+        limit: Maximum number of messages to return (default 20)
+
+    Returns:
+        List of message dicts with 'role' and 'content' fields
+    """
+    if session_id not in SESSIONS:
+        return []
+
+    buffer = SESSIONS[session_id]["buffer"]
+
+    # Convert buffer to list and get last N messages
+    messages = list(buffer)[-limit:]
+    return messages
+
+
 def add_exchange_internal(exchange: dict):
     """
     Direct internal call — bypasses FastAPI request handling.
     Cortex uses this to feed user/assistant turns directly
-    into Intakes buffer and trigger full summarization.
+    into Intake's buffer and trigger full summarization.
     """
     session_id = exchange.get("session_id")
     if not session_id:
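For context, a minimal sketch of the session-buffer shape the new get_recent_messages helper relies on. The deque-backed buffer and the SESSIONS layout shown here are inferred from this diff, not copied from intake.py:

from collections import deque

# Assumed layout: each session holds a bounded deque of role/content dicts.
SESSIONS = {
    "demo-session": {
        "buffer": deque(
            [
                {"role": "user", "content": "hi"},
                {"role": "assistant", "content": "hello!"},
            ],
            maxlen=100,
        )
    }
}

def get_recent_messages(session_id: str, limit: int = 20) -> list:
    # Unknown sessions return an empty history rather than raising.
    if session_id not in SESSIONS:
        return []
    # A list copy of the deque, sliced to the last `limit` turns in order.
    return list(SESSIONS[session_id]["buffer"])[-limit:]

print(get_recent_messages("demo-session", limit=5))
# [{'role': 'user', 'content': 'hi'}, {'role': 'assistant', 'content': 'hello!'}]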

View File

@@ -44,11 +44,22 @@ http_client = httpx.AsyncClient(timeout=120.0)
 # Public call
 # ------------------------------------------------------------
 async def call_llm(
-    prompt: str,
+    prompt: str = None,
+    messages: list = None,
     backend: str | None = None,
     temperature: float = 0.7,
     max_tokens: int = 512,
 ):
+    """
+    Call an LLM backend.
+
+    Args:
+        prompt: String prompt (for completion-style APIs like mi50)
+        messages: List of message dicts (for chat-style APIs like Ollama/OpenAI)
+        backend: Which backend to use (PRIMARY, SECONDARY, OPENAI, etc.)
+        temperature: Sampling temperature
+        max_tokens: Maximum tokens to generate
+    """
     backend = (backend or DEFAULT_BACKEND).upper()

     if backend not in BACKENDS:
@@ -69,7 +80,8 @@ async def call_llm(
         payload = {
             "prompt": prompt,
             "n_predict": max_tokens,
-            "temperature": temperature
+            "temperature": temperature,
+            "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
         }
         try:
             r = await http_client.post(f"{url}/completion", json=payload)
@@ -90,12 +102,20 @@ async def call_llm(
     # Provider: OLLAMA (your 3090)
     # -------------------------------
     if provider == "ollama":
+        # Use messages array if provided, otherwise convert prompt to single user message
+        if messages:
+            chat_messages = messages
+        else:
+            chat_messages = [{"role": "user", "content": prompt}]
+
         payload = {
             "model": model,
-            "messages": [
-                {"role": "user", "content": prompt}
-            ],
-            "stream": False
+            "messages": chat_messages,
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens
+            }
         }
         try:
             r = await http_client.post(f"{url}/api/chat", json=payload)
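With this change, call_llm accepts either a raw prompt (completion-style backends such as the mi50 llama.cpp server) or a messages list (chat-style backends such as Ollama/OpenAI). A hypothetical caller, assuming the function is imported from the proxy module shown above and that the PRIMARY/OPENAI backend names from the docstring exist in BACKENDS:

import asyncio
# from llm_proxy import call_llm  # module path is an assumption

async def demo():
    # Completion-style backend: pass a plain prompt string.
    summary = await call_llm(
        prompt="Summarize the last exchange in one sentence.",
        backend="PRIMARY",
        max_tokens=256,
    )

    # Chat-style backend: pass a messages list; prompt can be omitted.
    reply = await call_llm(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": "What did we just talk about?"},
        ],
        backend="OPENAI",
        temperature=0.7,
        max_tokens=512,
    )
    return summary, reply

# asyncio.run(demo())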

View File

@@ -351,17 +351,34 @@ async def run_simple(req: ReasonRequest):
     logger.info(f"📝 User: {req.user_prompt[:150]}...")
     logger.info(f"{'-'*100}\n")

-    # Get conversation history from context
+    # Get conversation history from context and intake buffer
     context_state = await collect_context(req.session_id, req.user_prompt)

-    # Build simple conversation history
-    messages = []
-    if context_state.get("recent_messages"):
-        for msg in context_state["recent_messages"]:
+    # Get recent messages from Intake buffer
+    from intake.intake import get_recent_messages
+    recent_msgs = get_recent_messages(req.session_id, limit=20)
+    logger.info(f"📋 Retrieved {len(recent_msgs)} recent messages from Intake buffer")
+
+    # Build simple conversation history with system message
+    system_message = {
+        "role": "system",
+        "content": (
+            "You are a helpful AI assistant. Provide direct, concise responses to the user's questions. "
+            "Maintain context from previous messages in the conversation."
+        )
+    }
+    messages = [system_message]
+
+    # Add conversation history
+    if recent_msgs:
+        for msg in recent_msgs:
             messages.append({
                 "role": msg.get("role", "user"),
                 "content": msg.get("content", "")
             })
+            logger.info(f"   - {msg.get('role')}: {msg.get('content', '')[:50]}...")

     # Add current user message
     messages.append({
@@ -369,30 +386,24 @@ async def run_simple(req: ReasonRequest):
         "content": req.user_prompt
     })

-    # Format messages into a simple prompt for the LLM
-    conversation = ""
-    for msg in messages:
-        role = msg["role"]
-        content = msg["content"]
-        if role == "user":
-            conversation += f"User: {content}\n\n"
-        elif role == "assistant":
-            conversation += f"Assistant: {content}\n\n"
-    conversation += "Assistant: "
+    logger.info(f"📨 Total messages being sent to LLM: {len(messages)} (including system message)")

     # Get backend from env (default to OPENAI for standard mode)
     backend = os.getenv("STANDARD_MODE_LLM", "OPENAI")
     temperature = req.temperature if req.temperature is not None else 0.7

-    # Direct LLM call
+    # Direct LLM call with messages (works for Ollama/OpenAI chat APIs)
     try:
-        response = await call_llm(
-            prompt=conversation,
+        raw_response = await call_llm(
+            messages=messages,
             backend=backend,
             temperature=temperature,
             max_tokens=2048
         )
+
+        # Clean response - just strip whitespace
+        response = raw_response.strip()
     except Exception as e:
         logger.error(f"❌ LLM call failed: {e}")
         response = f"Error: {str(e)}"
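After these changes, the messages list that run_simple hands to call_llm ends up shaped roughly as follows; the contents are illustrative example data, not values from the repository:

# Illustrative shape of the payload assembled by run_simple; example data only.
messages = [
    {"role": "system", "content": "You are a helpful AI assistant. ..."},
    # up to 20 recent turns pulled from the Intake buffer, oldest first
    {"role": "user", "content": "earlier question"},
    {"role": "assistant", "content": "earlier answer"},
    # the current request
    {"role": "user", "content": "<req.user_prompt>"},
]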