diff --git a/cortex/intake/intake.py b/cortex/intake/intake.py
index ce0e592..da3e973 100644
--- a/cortex/intake/intake.py
+++ b/cortex/intake/intake.py
@@ -326,11 +326,33 @@ def bg_summarize(session_id: str):
 # ─────────────────────────────
 # Internal entrypoint for Cortex
 # ─────────────────────────────
+def get_recent_messages(session_id: str, limit: int = 20) -> list:
+    """
+    Get recent raw messages from the session buffer.
+
+    Args:
+        session_id: Session identifier
+        limit: Maximum number of messages to return (default 20)
+
+    Returns:
+        List of message dicts with 'role' and 'content' fields
+    """
+    if session_id not in SESSIONS:
+        return []
+
+    buffer = SESSIONS[session_id]["buffer"]
+
+    # Convert buffer to list and get last N messages
+    messages = list(buffer)[-limit:]
+
+    return messages
+
+
 def add_exchange_internal(exchange: dict):
     """
     Direct internal call — bypasses FastAPI request handling.
     Cortex uses this to feed user/assistant turns directly
-    into Intake’s buffer and trigger full summarization.
+    into Intake's buffer and trigger full summarization.
     """
     session_id = exchange.get("session_id")
     if not session_id:
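For context, a quick usage sketch of the new helper. This is hedged: it assumes the module-level `SESSIONS` layout the diff implies (a per-session dict holding a deque-style `"buffer"` of `{"role", "content"}` entries), and the `"demo"` session is invented for illustration.

```python
# Usage sketch only; the "demo" session and its contents are made up here.
from collections import deque
from intake.intake import SESSIONS, get_recent_messages

SESSIONS["demo"] = {"buffer": deque()}
for i in range(30):
    role = "user" if i % 2 == 0 else "assistant"
    SESSIONS["demo"]["buffer"].append({"role": role, "content": f"turn {i}"})

print(len(get_recent_messages("demo")))   # 20 (capped by the default limit)
print(get_recent_messages("missing"))     # [] (unknown sessions return an empty list)
```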
diff --git a/cortex/llm/llm_router.py b/cortex/llm/llm_router.py
index 7b7c173..c75acc8 100644
--- a/cortex/llm/llm_router.py
+++ b/cortex/llm/llm_router.py
@@ -44,11 +44,22 @@ http_client = httpx.AsyncClient(timeout=120.0)
 # Public call
 # ------------------------------------------------------------
 async def call_llm(
-    prompt: str,
+    prompt: str = None,
+    messages: list = None,
     backend: str | None = None,
     temperature: float = 0.7,
     max_tokens: int = 512,
 ):
+    """
+    Call an LLM backend.
+
+    Args:
+        prompt: String prompt (for completion-style APIs like mi50)
+        messages: List of message dicts (for chat-style APIs like Ollama/OpenAI)
+        backend: Which backend to use (PRIMARY, SECONDARY, OPENAI, etc.)
+        temperature: Sampling temperature
+        max_tokens: Maximum tokens to generate
+    """
     backend = (backend or DEFAULT_BACKEND).upper()
 
     if backend not in BACKENDS:
@@ -69,7 +80,8 @@ async def call_llm(
         payload = {
             "prompt": prompt,
             "n_predict": max_tokens,
-            "temperature": temperature
+            "temperature": temperature,
+            "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
         }
         try:
             r = await http_client.post(f"{url}/completion", json=payload)
@@ -90,12 +102,20 @@ async def call_llm(
     # -------------------------------
     # Provider: OLLAMA (your 3090)
     # -------------------------------
     if provider == "ollama":
+        # Use messages array if provided, otherwise convert prompt to single user message
+        if messages:
+            chat_messages = messages
+        else:
+            chat_messages = [{"role": "user", "content": prompt}]
+
         payload = {
             "model": model,
-            "messages": [
-                {"role": "user", "content": prompt}
-            ],
-            "stream": False
+            "messages": chat_messages,
+            "stream": False,
+            "options": {
+                "temperature": temperature,
+                "num_predict": max_tokens
+            }
         }
         try:
             r = await http_client.post(f"{url}/api/chat", json=payload)
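With this change `call_llm` accepts either a raw `prompt` (completion-style backends such as the mi50 server) or a `messages` list (chat-style backends such as Ollama/OpenAI). A hedged usage sketch follows; the backend names are placeholders for whatever keys the `BACKENDS` config actually defines, and the import path assumes the same package root as the `from intake.intake import ...` line used in router.py.

```python
# Sketch only: "PRIMARY"/"SECONDARY" stand in for whichever BACKENDS entries
# map to the mi50 (completion) and Ollama (chat) providers in your config.
import asyncio
from llm.llm_router import call_llm

async def main():
    # Completion-style call: a single prompt string.
    summary = await call_llm(
        prompt="Summarize the Cortex intake pipeline in one sentence.",
        backend="PRIMARY",
        max_tokens=128,
    )

    # Chat-style call: a full messages array, including prior turns.
    reply = await call_llm(
        messages=[
            {"role": "system", "content": "You are a helpful AI assistant."},
            {"role": "user", "content": "And what does the Intake buffer feed?"},
        ],
        backend="SECONDARY",
        temperature=0.3,
        max_tokens=256,
    )
    print(summary, reply, sep="\n")

asyncio.run(main())
```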
diff --git a/cortex/router.py b/cortex/router.py
index 852f654..7dc3619 100644
--- a/cortex/router.py
+++ b/cortex/router.py
@@ -351,17 +351,34 @@ async def run_simple(req: ReasonRequest):
     logger.info(f"📝 User: {req.user_prompt[:150]}...")
     logger.info(f"{'-'*100}\n")
 
-    # Get conversation history from context
+    # Get conversation history from context and intake buffer
     context_state = await collect_context(req.session_id, req.user_prompt)
 
-    # Build simple conversation history
-    messages = []
-    if context_state.get("recent_messages"):
-        for msg in context_state["recent_messages"]:
+    # Get recent messages from Intake buffer
+    from intake.intake import get_recent_messages
+    recent_msgs = get_recent_messages(req.session_id, limit=20)
+    logger.info(f"📋 Retrieved {len(recent_msgs)} recent messages from Intake buffer")
+
+    # Build simple conversation history with system message
+    system_message = {
+        "role": "system",
+        "content": (
+            "You are a helpful AI assistant. Provide direct, concise responses to the user's questions. "
+            "Maintain context from previous messages in the conversation."
+        )
+    }
+
+    messages = [system_message]
+
+    # Add conversation history
+
+    if recent_msgs:
+        for msg in recent_msgs:
             messages.append({
                 "role": msg.get("role", "user"),
                 "content": msg.get("content", "")
             })
+            logger.info(f" - {msg.get('role')}: {msg.get('content', '')[:50]}...")
 
     # Add current user message
     messages.append({
@@ -369,30 +386,24 @@ async def run_simple(req: ReasonRequest):
         "content": req.user_prompt
     })
 
-    # Format messages into a simple prompt for the LLM
-    conversation = ""
-    for msg in messages:
-        role = msg["role"]
-        content = msg["content"]
-        if role == "user":
-            conversation += f"User: {content}\n\n"
-        elif role == "assistant":
-            conversation += f"Assistant: {content}\n\n"
-
-    conversation += "Assistant: "
+    logger.info(f"📨 Total messages being sent to LLM: {len(messages)} (including system message)")
 
     # Get backend from env (default to OPENAI for standard mode)
     backend = os.getenv("STANDARD_MODE_LLM", "OPENAI")
     temperature = req.temperature if req.temperature is not None else 0.7
 
-    # Direct LLM call
+    # Direct LLM call with messages (works for Ollama/OpenAI chat APIs)
     try:
-        response = await call_llm(
-            prompt=conversation,
+        raw_response = await call_llm(
+            messages=messages,
             backend=backend,
             temperature=temperature,
             max_tokens=2048
         )
+
+        # Clean response - just strip whitespace
+        response = raw_response.strip()
+
     except Exception as e:
         logger.error(f"❌ LLM call failed: {e}")
         response = f"Error: {str(e)}"
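The net effect in `run_simple` is that the LLM now receives a structured chat transcript instead of a flattened "User:/Assistant:" string. Roughly, the list has the shape sketched below; the content is illustrative only, since the real history comes from the Intake buffer and the incoming request.

```python
# Illustrative shape of the messages list run_simple builds; every entry here
# is placeholder content, not what the code actually produces.
messages = [
    {"role": "system", "content": (
        "You are a helpful AI assistant. Provide direct, concise responses "
        "to the user's questions. Maintain context from previous messages "
        "in the conversation."
    )},
    # ...up to 20 buffered turns from Intake, oldest first...
    {"role": "user", "content": "Which backend handles standard mode?"},
    {"role": "assistant", "content": "Whatever STANDARD_MODE_LLM points at, defaulting to OPENAI."},
    # The current user prompt is always appended last.
    {"role": "user", "content": "OK, send this turn through it."},
]

# For the Ollama provider this list is forwarded as-is inside:
# {"model": <model>, "messages": messages, "stream": False,
#  "options": {"temperature": temperature, "num_predict": max_tokens}}
```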