feat: Refactor LLM router and integrate health check endpoint

- Simplified LLM call logic in llm_router.py, removing tool adapter complexity and enhancing error handling.
- Added health check endpoint to main.py for system status verification.
- Cleaned up router.py by removing unused imports and commented-out code, streamlining the structure.
- Updated docker-compose.yml to unify services under a single Lyra container, enhancing deployment simplicity.
- Created Dockerfile for unified container setup, including both Relay and Cortex services.
- Added QUICKSTART.md for improved onboarding and usage instructions.
- Implemented start.sh script to manage service startup and health checks.
This commit is contained in:
2026-05-29 18:20:56 -04:00
parent 376b8114ad
commit 5f53fb32a4
14 changed files with 802 additions and 1665 deletions
-1
View File
@@ -1 +0,0 @@
# Ingest module - handles communication with Intake service
-33
View File
@@ -1,33 +0,0 @@
# ingest_handler.py
import os
import httpx
NEOMEM_URL = os.getenv("NEOMEM_API", "http://nvgram-api:7077")
async def handle_ingest(payload):
"""
Pass user+assistant turns to NeoMem.
Minimal version. Does not process or annotate.
"""
data = {
"messages": [],
"user_id": "brian" # default for now
}
if payload.user:
data["messages"].append({"role": "user", "content": payload.user})
if payload.assistant:
data["messages"].append({"role": "assistant", "content": payload.assistant})
try:
async with httpx.AsyncClient() as client:
r = await client.post(
f"{NEOMEM_URL}/memories",
json=data,
timeout=5
)
if r.status_code != 200:
print(f"[Ingest] NeoMem returned {r.status_code}: {r.text}")
except Exception as e:
print(f"[Ingest] Failed to send to NeoMem: {e}")
-45
View File
@@ -1,45 +0,0 @@
# cortex/intake_client.py
import os, httpx, logging
from typing import Dict, Any, Optional
logger = logging.getLogger(__name__)
class IntakeClient:
"""Handles short-term / episodic summaries from Intake service."""
def __init__(self):
self.base_url = os.getenv("INTAKE_API_URL", "http://intake:7080")
async def summarize_turn(self, session_id: str, user_msg: str, assistant_msg: Optional[str] = None) -> Dict[str, Any]:
"""
DEPRECATED: Intake v0.2 removed the /summarize endpoint.
Use add_exchange() instead, which auto-summarizes in the background.
This method is kept for backwards compatibility but will fail.
"""
payload = {
"session_id": session_id,
"turns": [{"role": "user", "content": user_msg}]
}
if assistant_msg:
payload["turns"].append({"role": "assistant", "content": assistant_msg})
async with httpx.AsyncClient(timeout=30) as client:
try:
r = await client.post(f"{self.base_url}/summarize", json=payload)
r.raise_for_status()
return r.json()
except Exception as e:
logger.warning(f"Intake summarize_turn failed (endpoint removed in v0.2): {e}")
return {}
async def get_context(self, session_id: str) -> str:
"""Get summarized context for a session from Intake."""
async with httpx.AsyncClient(timeout=15) as client:
try:
r = await client.get(f"{self.base_url}/summaries", params={"session_id": session_id})
r.raise_for_status()
data = r.json()
return data.get("summary_text", "")
except Exception as e:
logger.warning(f"Intake get_context failed: {e}")
return ""
+66 -28
View File
@@ -33,8 +33,8 @@ INTAKE_LLM = os.getenv("INTAKE_LLM", "PRIMARY").upper()
SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "200"))
SUMMARY_TEMPERATURE = float(os.getenv("SUMMARY_TEMPERATURE", "0.3"))
NEOMEM_API = os.getenv("NEOMEM_API")
NEOMEM_KEY = os.getenv("NEOMEM_KEY")
NEBULA_API = os.getenv("NEBULA_API", "http://localhost:7090")
NEBULA_KEY = os.getenv("NEBULA_KEY")
# ─────────────────────────────
# Internal history for L10/L20/L30
@@ -120,7 +120,7 @@ async def summarize_L5(buf: List[Dict[str, Any]]) -> str:
async def summarize_L10(session_id: str, buf: List[Dict[str, Any]]) -> str:
# Reality Check for last 10 exchanges
# "Reality Check" for last 10 exchanges
text = _format_exchanges(buf[-10:])
prompt = f"""
@@ -138,6 +138,9 @@ Reality Check:
L10_HISTORY.setdefault(session_id, [])
L10_HISTORY[session_id].append(summary)
# Send to Nebula
await send_to_nebula(summary, session_id, "L10")
return summary
@@ -165,6 +168,9 @@ Overview:
L20_HISTORY.setdefault(session_id, [])
L20_HISTORY[session_id].append(summary)
# Send to Nebula
await send_to_nebula(summary, session_id, "L20")
return summary
@@ -187,45 +193,77 @@ noting major themes, persistent goals, and shifts.
Continuity Report:
"""
return await _llm(prompt)
summary = await _llm(prompt)
# Send to Nebula
await send_to_nebula(summary, session_id, "L30")
return summary
# ─────────────────────────────
# NeoMem push
# Nebula push
# ─────────────────────────────
def push_to_neomem(summary: str, session_id: str, level: str) -> None:
async def send_to_nebula(summary: str, session_id: str, level: str) -> None:
"""
Fire-and-forget push of a summary into NeoMem.
Send summary to Nebula vector memory system.
Falls back to disk storage if Nebula is not available.
"""
if not NEOMEM_API or not summary:
if not summary:
return
headers = {"Content-Type": "application/json"}
if NEOMEM_KEY:
headers["Authorization"] = f"Bearer {NEOMEM_KEY}"
payload = {
"messages": [{"role": "assistant", "content": summary}],
"user_id": "brian",
"metadata": {
"source": "intake",
"session_id": session_id,
"level": level,
},
"summary": summary,
"session_id": session_id,
"level": level,
"timestamp": datetime.now().isoformat(),
"source": "intake",
}
# Try HTTP POST to Nebula first
try:
import requests
requests.post(
f"{NEOMEM_API}/memories",
json=payload,
headers=headers,
timeout=20,
).raise_for_status()
print(f"🧠 NeoMem updated ({level}) for {session_id}")
import httpx
headers = {"Content-Type": "application/json"}
if NEBULA_KEY:
headers["Authorization"] = f"Bearer {NEBULA_KEY}"
async with httpx.AsyncClient() as client:
response = await client.post(
f"{NEBULA_API}/summaries",
json=payload,
headers=headers,
timeout=10.0,
)
response.raise_for_status()
print(f"🌌 Nebula updated ({level}) for {session_id}")
return
except Exception as e:
print(f"NeoMem push failed ({level}, {session_id}): {e}")
print(f"⚠️ Nebula unavailable, falling back to disk: {e}")
# Fallback: Write to disk
try:
fallback_dir = os.path.join(os.path.dirname(__file__), "../../.nebula_fallback")
os.makedirs(fallback_dir, exist_ok=True)
# Create session directory
session_dir = os.path.join(fallback_dir, session_id)
os.makedirs(session_dir, exist_ok=True)
# Write summary to timestamped file
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
filename = f"{level}_{timestamp}.json"
filepath = os.path.join(session_dir, filename)
import json
with open(filepath, "w") as f:
json.dump(payload, f, indent=2)
print(f"💾 Saved to disk: {filepath}")
except Exception as e:
print(f"❌ Failed to save summary to disk: {e}")
# ─────────────────────────────
+69 -205
View File
@@ -1,15 +1,15 @@
# llm_router.py
import os
import httpx
import json
import logging
from typing import Optional, List, Dict
from autonomy.tools.adapters import OpenAIAdapter, OllamaAdapter, LlamaCppAdapter
logger = logging.getLogger(__name__)
# ------------------------------------------------------------
# Load backend registry from root .env
# Backend Configuration
# ------------------------------------------------------------
BACKENDS = {
@@ -38,50 +38,25 @@ BACKENDS = {
DEFAULT_BACKEND = "PRIMARY"
# Reusable async HTTP client
http_client = httpx.AsyncClient(timeout=120.0)
# Tool adapters for each backend
TOOL_ADAPTERS = {
"OPENAI": OpenAIAdapter(),
"OLLAMA": OllamaAdapter(),
"MI50": LlamaCppAdapter(), # MI50 uses llama.cpp
"PRIMARY": None, # Determined at runtime
"SECONDARY": None, # Determined at runtime
"FALLBACK": None, # Determined at runtime
}
# ------------------------------------------------------------
# Public call
# Public LLM Call
# ------------------------------------------------------------
async def call_llm(
prompt: str = None,
messages: list = None,
backend: str | None = None,
prompt: Optional[str] = None,
messages: Optional[List[Dict]] = None,
backend: Optional[str] = None,
temperature: float = 0.7,
max_tokens: int = 512,
tools: Optional[List[Dict]] = None,
tool_choice: Optional[str] = None,
return_adapter_response: bool = False,
):
"""
Call an LLM backend with optional tool calling support.
Args:
prompt: String prompt (for completion-style APIs like mi50)
messages: List of message dicts (for chat-style APIs like Ollama/OpenAI)
backend: Which backend to use (PRIMARY, SECONDARY, OPENAI, etc.)
temperature: Sampling temperature
max_tokens: Maximum tokens to generate
tools: List of Lyra tool definitions (provider-agnostic)
tool_choice: How to use tools ("auto", "required", "none")
return_adapter_response: If True, return dict with content and tool_calls
Returns:
str (default) or dict (if return_adapter_response=True):
{"content": str, "tool_calls": [...] or None}
Simple LLM call.
Supports: ollama, mi50 (llama.cpp), openai.
Returns plain text response.
"""
backend = (backend or DEFAULT_BACKEND).upper()
if backend not in BACKENDS:
@@ -95,207 +70,96 @@ async def call_llm(
if not url or not model:
raise RuntimeError(f"Backend '{backend}' missing url/model in env")
# If tools are requested, use adapter to prepare request
if tools:
# Get adapter for this backend
adapter = TOOL_ADAPTERS.get(backend)
# Convert prompt → messages if needed
if not messages:
messages = [{"role": "user", "content": prompt or ""}]
# For PRIMARY/SECONDARY/FALLBACK, determine adapter based on provider
if adapter is None and backend in ["PRIMARY", "SECONDARY", "FALLBACK"]:
if provider == "openai":
adapter = TOOL_ADAPTERS["OPENAI"]
elif provider == "ollama":
adapter = TOOL_ADAPTERS["OLLAMA"]
elif provider == "mi50":
adapter = TOOL_ADAPTERS["MI50"]
if adapter:
# Use messages array if provided, otherwise convert prompt to messages
if not messages:
messages = [{"role": "user", "content": prompt}]
# Prepare request through adapter
adapted_request = await adapter.prepare_request(messages, tools, tool_choice)
messages = adapted_request["messages"]
# Extract tools in provider format if present
provider_tools = adapted_request.get("tools")
provider_tool_choice = adapted_request.get("tool_choice")
else:
logger.warning(f"No adapter available for backend {backend}, ignoring tools")
provider_tools = None
provider_tool_choice = None
else:
provider_tools = None
provider_tool_choice = None
# -------------------------------
# Provider: MI50 (llama.cpp server)
# -------------------------------
if provider == "mi50":
# If tools requested, convert messages to prompt with tool instructions
if messages and tools:
# Combine messages into a prompt
prompt_parts = []
for msg in messages:
role = msg.get("role", "user")
content = msg.get("content", "")
prompt_parts.append(f"{role.capitalize()}: {content}")
prompt = "\n".join(prompt_parts) + "\nAssistant:"
payload = {
"prompt": prompt,
"n_predict": max_tokens,
"temperature": temperature,
"stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
}
try:
r = await http_client.post(f"{url}/completion", json=payload)
r.raise_for_status()
data = r.json()
response_content = data.get("content", "")
# If caller wants adapter response with tool calls, parse and return
if return_adapter_response and tools:
adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["MI50"]
return await adapter.parse_response(response_content)
else:
return response_content
except httpx.HTTPError as e:
logger.error(f"HTTP error calling mi50: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"LLM API error (mi50): {type(e).__name__}: {str(e)}")
except (KeyError, json.JSONDecodeError) as e:
logger.error(f"Response parsing error from mi50: {e}")
raise RuntimeError(f"Invalid response format (mi50): {e}")
except Exception as e:
logger.error(f"Unexpected error calling mi50: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"Unexpected error (mi50): {type(e).__name__}: {str(e)}")
# -------------------------------
# Provider: OLLAMA (your 3090)
# -------------------------------
logger.info(f"🔍 LLM Router: provider={provider}, checking if ollama...")
# ------------------------------------------------------------
# OLLAMA
# ------------------------------------------------------------
if provider == "ollama":
logger.info(f"🔍 LLM Router: Matched ollama provider, tools={bool(tools)}, return_adapter_response={return_adapter_response}")
# Use messages array if provided, otherwise convert prompt to single user message
if messages:
chat_messages = messages
else:
chat_messages = [{"role": "user", "content": prompt}]
payload = {
"model": model,
"messages": chat_messages,
"messages": messages,
"stream": False,
"options": {
"temperature": temperature,
"num_predict": max_tokens
}
}
try:
r = await http_client.post(f"{url}/api/chat", json=payload)
r.raise_for_status()
data = r.json()
response_content = data["message"]["content"]
return data["message"]["content"]
# If caller wants adapter response with tool calls, parse and return
if return_adapter_response and tools:
logger.info(f"🔍 Ollama: return_adapter_response=True, calling adapter.parse_response")
adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["OLLAMA"]
logger.info(f"🔍 Ollama: Using adapter {adapter.__class__.__name__}")
result = await adapter.parse_response(response_content)
logger.info(f"🔍 Ollama: Adapter returned {result}")
return result
else:
return response_content
except httpx.HTTPError as e:
logger.error(f"HTTP error calling ollama: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"LLM API error (ollama): {type(e).__name__}: {str(e)}")
except (KeyError, json.JSONDecodeError) as e:
logger.error(f"Response parsing error from ollama: {e}")
raise RuntimeError(f"Invalid response format (ollama): {e}")
except Exception as e:
logger.error(f"Unexpected error calling ollama: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"Unexpected error (ollama): {type(e).__name__}: {str(e)}")
logger.error(f"Ollama error: {e}")
raise RuntimeError(f"Ollama API error: {e}")
# ------------------------------------------------------------
# MI50 (llama.cpp server)
# ------------------------------------------------------------
if provider == "mi50":
# -------------------------------
# Provider: OPENAI
# -------------------------------
# Convert messages to plain prompt
prompt_parts = []
for msg in messages:
role = msg.get("role", "user")
content = msg.get("content", "")
prompt_parts.append(f"{role.capitalize()}: {content}")
full_prompt = "\n".join(prompt_parts) + "\nAssistant:"
payload = {
"prompt": full_prompt,
"n_predict": max_tokens,
"temperature": temperature,
"stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
}
try:
r = await http_client.post(f"{url}/completion", json=payload)
r.raise_for_status()
data = r.json()
return data.get("content", "")
except Exception as e:
logger.error(f"MI50 error: {e}")
raise RuntimeError(f"MI50 API error: {e}")
# ------------------------------------------------------------
# OPENAI
# ------------------------------------------------------------
if provider == "openai":
headers = {
"Authorization": f"Bearer {cfg['api_key']}",
"Authorization": f"Bearer {cfg.get('api_key')}",
"Content-Type": "application/json"
}
# Use messages array if provided, otherwise convert prompt to single user message
if messages:
chat_messages = messages
else:
chat_messages = [{"role": "user", "content": prompt}]
payload = {
"model": model,
"messages": chat_messages,
"messages": messages,
"temperature": temperature,
"max_tokens": max_tokens,
}
# Add tools if available (OpenAI native function calling)
if provider_tools:
payload["tools"] = provider_tools
if provider_tool_choice:
payload["tool_choice"] = provider_tool_choice
try:
r = await http_client.post(f"{url}/chat/completions", json=payload, headers=headers)
r = await http_client.post(
f"{url}/chat/completions",
json=payload,
headers=headers
)
r.raise_for_status()
data = r.json()
return data["choices"][0]["message"]["content"]
# If caller wants adapter response with tool calls, parse and return
if return_adapter_response and tools:
# Create mock response object for adapter
class MockChoice:
def __init__(self, message_data):
self.message = type('obj', (object,), {})()
self.message.content = message_data.get("content")
# Convert tool_calls dicts to objects
raw_tool_calls = message_data.get("tool_calls")
if raw_tool_calls:
self.message.tool_calls = []
for tc in raw_tool_calls:
tool_call_obj = type('obj', (object,), {})()
tool_call_obj.id = tc.get("id")
tool_call_obj.function = type('obj', (object,), {})()
tool_call_obj.function.name = tc.get("function", {}).get("name")
tool_call_obj.function.arguments = tc.get("function", {}).get("arguments")
self.message.tool_calls.append(tool_call_obj)
else:
self.message.tool_calls = None
class MockResponse:
def __init__(self, data):
self.choices = [MockChoice(data["choices"][0]["message"])]
mock_resp = MockResponse(data)
adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["OPENAI"]
return await adapter.parse_response(mock_resp)
else:
return data["choices"][0]["message"]["content"]
except httpx.HTTPError as e:
logger.error(f"HTTP error calling openai: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"LLM API error (openai): {type(e).__name__}: {str(e)}")
except (KeyError, json.JSONDecodeError) as e:
logger.error(f"Response parsing error from openai: {e}")
raise RuntimeError(f"Invalid response format (openai): {e}")
except Exception as e:
logger.error(f"Unexpected error calling openai: {type(e).__name__}: {str(e)}")
raise RuntimeError(f"Unexpected error (openai): {type(e).__name__}: {str(e)}")
logger.error(f"OpenAI error: {e}")
raise RuntimeError(f"OpenAI API error: {e}")
# -------------------------------
# Unknown provider
# -------------------------------
raise RuntimeError(f"Provider '{provider}' not implemented.")
# ------------------------------------------------------------
# Unknown Provider
# ------------------------------------------------------------
raise RuntimeError(f"Provider '{provider}' not implemented.")
+5
View File
@@ -13,4 +13,9 @@ app.add_middleware(
allow_headers=["*"],
)
# Health check endpoint
@app.get("/_health")
async def health_check():
return {"status": "ok"}
app.include_router(cortex_router)
+1 -392
View File
@@ -6,21 +6,8 @@ import asyncio
from fastapi import APIRouter
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from reasoning.reasoning import reason_check
from reasoning.reflection import reflect_notes
from reasoning.refine import refine_answer
from persona.speak import speak
from persona.identity import load_identity
from context import collect_context, update_last_assistant_message
from intake.intake import add_exchange_internal
from autonomy.monologue.monologue import InnerMonologue
from autonomy.self.state import load_self_state
from autonomy.tools.stream_events import get_stream_manager
# -------------------------------------------------------------------
# Setup
# -------------------------------------------------------------------
LOG_DETAIL_LEVEL = os.getenv("LOG_DETAIL_LEVEL", "summary").lower()
@@ -35,10 +22,7 @@ console_handler.setFormatter(logging.Formatter(
))
logger.addHandler(console_handler)
cortex_router = APIRouter()
inner_monologue = InnerMonologue()
# -------------------------------------------------------------------
# Models
@@ -49,292 +33,6 @@ class ReasonRequest(BaseModel):
temperature: float | None = None
backend: str | None = None
# -------------------------------------------------------------------
# /reason endpoint
# -------------------------------------------------------------------
@cortex_router.post("/reason")
async def run_reason(req: ReasonRequest):
from datetime import datetime
pipeline_start = datetime.now()
stage_timings = {}
# Show pipeline start in detailed/verbose mode
if LOG_DETAIL_LEVEL in ["detailed", "verbose"]:
logger.info(f"\n{'='*100}")
logger.info(f"🚀 PIPELINE START | Session: {req.session_id} | {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")
logger.info(f"{'='*100}")
logger.info(f"📝 User: {req.user_prompt[:150]}...")
logger.info(f"{'-'*100}\n")
# ----------------------------------------------------------------
# STAGE 0 — Context
# ----------------------------------------------------------------
stage_start = datetime.now()
context_state = await collect_context(req.session_id, req.user_prompt)
stage_timings["context"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 0.5 — Identity
# ----------------------------------------------------------------
stage_start = datetime.now()
identity_block = load_identity(req.session_id)
stage_timings["identity"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 0.6 — Inner Monologue (observer-only)
# ----------------------------------------------------------------
stage_start = datetime.now()
inner_result = None
try:
self_state = load_self_state()
mono_context = {
"user_message": req.user_prompt,
"session_id": req.session_id,
"self_state": self_state,
"context_summary": context_state,
}
inner_result = await inner_monologue.process(mono_context)
logger.info(f"🧠 Monologue | {inner_result.get('intent', 'unknown')} | Tone: {inner_result.get('tone', 'neutral')}")
# Store in context for downstream use
context_state["monologue"] = inner_result
except Exception as e:
logger.warning(f"⚠️ Monologue failed: {e}")
stage_timings["monologue"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 0.7 — Executive Planning (conditional)
# ----------------------------------------------------------------
stage_start = datetime.now()
executive_plan = None
if inner_result and inner_result.get("consult_executive"):
try:
from autonomy.executive.planner import plan_execution
executive_plan = await plan_execution(
user_prompt=req.user_prompt,
intent=inner_result.get("intent", "unknown"),
context_state=context_state,
identity_block=identity_block
)
logger.info(f"🎯 Executive plan: {executive_plan.get('summary', 'N/A')[:80]}...")
except Exception as e:
logger.warning(f"⚠️ Executive planning failed: {e}")
executive_plan = None
stage_timings["executive"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 0.8 — Autonomous Tool Invocation
# ----------------------------------------------------------------
stage_start = datetime.now()
tool_results = None
autonomous_enabled = os.getenv("ENABLE_AUTONOMOUS_TOOLS", "true").lower() == "true"
tool_confidence_threshold = float(os.getenv("AUTONOMOUS_TOOL_CONFIDENCE_THRESHOLD", "0.6"))
if autonomous_enabled and inner_result:
try:
from autonomy.tools.decision_engine import ToolDecisionEngine
from autonomy.tools.orchestrator import ToolOrchestrator
# Analyze which tools to invoke
decision_engine = ToolDecisionEngine()
tool_decision = await decision_engine.analyze_tool_needs(
user_prompt=req.user_prompt,
monologue=inner_result,
context_state=context_state,
available_tools=["RAG", "WEB", "WEATHER", "CODEBRAIN"]
)
# Execute tools if confidence threshold met
if tool_decision["should_invoke_tools"] and tool_decision["confidence"] >= tool_confidence_threshold:
orchestrator = ToolOrchestrator(tool_timeout=30)
tool_results = await orchestrator.execute_tools(
tools_to_invoke=tool_decision["tools_to_invoke"],
context_state=context_state
)
# Format results for context injection
tool_context = orchestrator.format_results_for_context(tool_results)
context_state["autonomous_tool_results"] = tool_context
summary = tool_results.get("execution_summary", {})
logger.info(f"🛠️ Tools executed: {summary.get('successful', [])} succeeded")
else:
logger.info(f"🛠️ No tools invoked (confidence: {tool_decision.get('confidence', 0):.2f})")
except Exception as e:
logger.warning(f"⚠️ Autonomous tool invocation failed: {e}")
if LOG_DETAIL_LEVEL == "verbose":
import traceback
traceback.print_exc()
stage_timings["tools"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 1-5 — Core Reasoning Pipeline
# ----------------------------------------------------------------
stage_start = datetime.now()
# Extract intake summary
intake_summary = "(no context available)"
if context_state.get("intake"):
l20 = context_state["intake"].get("L20")
if isinstance(l20, dict):
intake_summary = l20.get("summary", intake_summary)
elif isinstance(l20, str):
intake_summary = l20
# Reflection
try:
reflection = await reflect_notes(intake_summary, identity_block=identity_block)
reflection_notes = reflection.get("notes", [])
except Exception as e:
reflection_notes = []
logger.warning(f"⚠️ Reflection failed: {e}")
stage_timings["reflection"] = (datetime.now() - stage_start).total_seconds() * 1000
# Reasoning (draft)
stage_start = datetime.now()
draft = await reason_check(
req.user_prompt,
identity_block=identity_block,
rag_block=context_state.get("rag", []),
reflection_notes=reflection_notes,
context=context_state,
monologue=inner_result,
executive_plan=executive_plan
)
stage_timings["reasoning"] = (datetime.now() - stage_start).total_seconds() * 1000
# Refinement
stage_start = datetime.now()
result = await refine_answer(
draft_output=draft,
reflection_notes=reflection_notes,
identity_block=identity_block,
rag_block=context_state.get("rag", []),
)
final_neutral = result["final_output"]
stage_timings["refinement"] = (datetime.now() - stage_start).total_seconds() * 1000
# Persona
stage_start = datetime.now()
tone = inner_result.get("tone", "neutral") if inner_result else "neutral"
depth = inner_result.get("depth", "medium") if inner_result else "medium"
persona_answer = await speak(final_neutral, tone=tone, depth=depth)
stage_timings["persona"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 6 — Session update
# ----------------------------------------------------------------
update_last_assistant_message(req.session_id, persona_answer)
# ----------------------------------------------------------------
# STAGE 6.5 — Self-state update & Pattern Learning
# ----------------------------------------------------------------
stage_start = datetime.now()
try:
from autonomy.self.analyzer import analyze_and_update_state
await analyze_and_update_state(
monologue=inner_result or {},
user_prompt=req.user_prompt,
response=persona_answer,
context=context_state
)
except Exception as e:
logger.warning(f"⚠️ Self-state update failed: {e}")
try:
from autonomy.learning.pattern_learner import get_pattern_learner
learner = get_pattern_learner()
await learner.learn_from_interaction(
user_prompt=req.user_prompt,
response=persona_answer,
monologue=inner_result or {},
context=context_state
)
except Exception as e:
logger.warning(f"⚠️ Pattern learning failed: {e}")
stage_timings["learning"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# STAGE 7 — Proactive Monitoring & Suggestions
# ----------------------------------------------------------------
stage_start = datetime.now()
proactive_enabled = os.getenv("ENABLE_PROACTIVE_MONITORING", "true").lower() == "true"
proactive_min_priority = float(os.getenv("PROACTIVE_SUGGESTION_MIN_PRIORITY", "0.6"))
if proactive_enabled:
try:
from autonomy.proactive.monitor import get_proactive_monitor
monitor = get_proactive_monitor(min_priority=proactive_min_priority)
self_state = load_self_state()
suggestion = await monitor.analyze_session(
session_id=req.session_id,
context_state=context_state,
self_state=self_state
)
if suggestion:
suggestion_text = monitor.format_suggestion(suggestion)
persona_answer += suggestion_text
logger.info(f"💡 Proactive suggestion: {suggestion['type']} (priority: {suggestion['priority']:.2f})")
except Exception as e:
logger.warning(f"⚠️ Proactive monitoring failed: {e}")
stage_timings["proactive"] = (datetime.now() - stage_start).total_seconds() * 1000
# ----------------------------------------------------------------
# PIPELINE COMPLETE — Summary
# ----------------------------------------------------------------
total_duration = (datetime.now() - pipeline_start).total_seconds() * 1000
# Always show pipeline completion
logger.info(f"\n{'='*100}")
logger.info(f"✨ PIPELINE COMPLETE | Session: {req.session_id} | Total: {total_duration:.0f}ms")
logger.info(f"{'='*100}")
# Show timing breakdown in detailed/verbose mode
if LOG_DETAIL_LEVEL in ["detailed", "verbose"]:
logger.info("⏱️ Stage Timings:")
for stage, duration in stage_timings.items():
pct = (duration / total_duration) * 100 if total_duration > 0 else 0
logger.info(f" {stage:15s}: {duration:6.0f}ms ({pct:5.1f}%)")
logger.info(f"📤 Output: {len(persona_answer)} chars")
logger.info(f"{'='*100}\n")
# ----------------------------------------------------------------
# RETURN
# ----------------------------------------------------------------
return {
"draft": draft,
"neutral": final_neutral,
"persona": persona_answer,
"reflection": reflection_notes,
"session_id": req.session_id,
"context_summary": {
"rag_results": len(context_state.get("rag", [])),
"minutes_since_last": context_state.get("minutes_since_last_msg"),
"message_count": context_state.get("message_count"),
"mode": context_state.get("mode"),
}
}
# -------------------------------------------------------------------
# /simple endpoint - Standard chatbot mode (no reasoning pipeline)
# -------------------------------------------------------------------
@@ -346,7 +44,6 @@ async def run_simple(req: ReasonRequest):
"""
from datetime import datetime
from llm.llm_router import call_llm
from autonomy.tools.function_caller import FunctionCaller
start_time = datetime.now()
@@ -356,9 +53,6 @@ async def run_simple(req: ReasonRequest):
logger.info(f"📝 User: {req.user_prompt[:150]}...")
logger.info(f"{'-'*100}\n")
# Get conversation history from context and intake buffer
context_state = await collect_context(req.session_id, req.user_prompt)
# Get recent messages from Intake buffer
from intake.intake import get_recent_messages
recent_msgs = get_recent_messages(req.session_id, limit=20)
@@ -400,31 +94,10 @@ async def run_simple(req: ReasonRequest):
temperature = req.temperature if req.temperature is not None else 0.7
# Check if tools are enabled
enable_tools = os.getenv("STANDARD_MODE_ENABLE_TOOLS", "false").lower() == "true"
# Call LLM with or without tools
try:
if enable_tools:
# Use FunctionCaller for tool-enabled conversation
logger.info(f"🛠️ Tool calling enabled for Standard Mode")
logger.info(f"🔍 Creating FunctionCaller with backend={backend}, temp={temperature}")
function_caller = FunctionCaller(backend, temperature)
logger.info(f"🔍 FunctionCaller created, calling call_with_tools...")
result = await function_caller.call_with_tools(
messages=messages,
max_tokens=2048,
session_id=req.session_id # Pass session_id for streaming
)
logger.info(f"🔍 call_with_tools returned: iterations={result.get('iterations')}, tool_calls={len(result.get('tool_calls', []))}")
# Log tool usage
if result.get("tool_calls"):
tool_names = [tc["name"] for tc in result["tool_calls"]]
logger.info(f"🔧 Tools used: {', '.join(tool_names)} ({result['iterations']} iterations)")
response = result["content"].strip()
else:
# Direct LLM call without tools (original behavior)
raw_response = await call_llm(
messages=messages,
@@ -440,7 +113,6 @@ async def run_simple(req: ReasonRequest):
# Update session with the exchange
try:
update_last_assistant_message(req.session_id, response)
add_exchange_internal({
"session_id": req.session_id,
"role": "user",
@@ -473,64 +145,6 @@ async def run_simple(req: ReasonRequest):
}
}
# -------------------------------------------------------------------
# /stream/thinking endpoint - SSE stream for "show your work"
# -------------------------------------------------------------------
@cortex_router.get("/stream/thinking/{session_id}")
async def stream_thinking(session_id: str):
"""
Server-Sent Events stream for tool calling "show your work" feature.
Streams real-time updates about:
- Thinking/planning steps
- Tool calls being made
- Tool execution results
- Final completion
"""
stream_manager = get_stream_manager()
queue = stream_manager.subscribe(session_id)
async def event_generator():
try:
# Send initial connection message
import json
connected_event = json.dumps({"type": "connected", "session_id": session_id})
yield f"data: {connected_event}\n\n"
while True:
# Wait for events with timeout to send keepalive
try:
event = await asyncio.wait_for(queue.get(), timeout=30.0)
# Format as SSE
event_data = json.dumps(event)
yield f"data: {event_data}\n\n"
# If it's a "done" event, close the stream
if event.get("type") == "done":
break
except asyncio.TimeoutError:
# Send keepalive comment
yield ": keepalive\n\n"
except asyncio.CancelledError:
logger.info(f"Stream cancelled for session {session_id}")
finally:
stream_manager.unsubscribe(session_id, queue)
return StreamingResponse(
event_generator(),
media_type="text/event-stream",
headers={
"Cache-Control": "no-cache",
"Connection": "keep-alive",
"X-Accel-Buffering": "no" # Disable nginx buffering
}
)
# -------------------------------------------------------------------
# /ingest endpoint (internal)
# -------------------------------------------------------------------
@@ -542,11 +156,6 @@ class IngestPayload(BaseModel):
@cortex_router.post("/ingest")
async def ingest(payload: IngestPayload):
try:
update_last_assistant_message(payload.session_id, payload.assistant_msg)
except Exception as e:
logger.warning(f"[INGEST] Session update failed: {e}")
try:
add_exchange_internal({
"session_id": payload.session_id,