From 5f53fb32a4d379fb74e18b12c0c8bd950201e84f Mon Sep 17 00:00:00 2001
From: serversdown <brian@serversdown.net>
Date: Fri, 29 May 2026 18:20:56 -0400
Subject: [PATCH] feat: Refactor LLM router and integrate health check endpoint

- Simplified LLM call logic in llm_router.py, removing tool adapter complexity and enhancing error handling.
- Added health check endpoint to main.py for system status verification.
- Cleaned up router.py by removing unused imports and commented-out code, streamlining the structure.
- Updated docker-compose.yml to unify services under a single Lyra container, enhancing deployment simplicity.
- Created Dockerfile for unified container setup, including both Relay and Cortex services.
- Added QUICKSTART.md for improved onboarding and usage instructions.
- Implemented start.sh script to manage service startup and health checks.
---
 .dockerignore                   |   52 ++
 Dockerfile                      |   48 ++
 QUICKSTART.md                   |  124 ++++
 README.md                       | 1175 ++++++++++---------------------
 core/relay/server.js            |    6 +-
 cortex/ingest/__init__.py       |    1 -
 cortex/ingest/ingest_handler.py |   33 -
 cortex/ingest/intake_client.py  |   45 --
 cortex/intake/intake.py         |   94 ++-
 cortex/llm/llm_router.py        |  274 ++-----
 cortex/main.py                  |    5 +
 cortex/router.py                |  393 +----------
 docker-compose.yml              |  183 +----
 start.sh                        |   34 +
 14 files changed, 802 insertions(+), 1665 deletions(-)
 create mode 100644 .dockerignore
 create mode 100644 Dockerfile
 create mode 100644 QUICKSTART.md
 delete mode 100644 cortex/ingest/__init__.py
 delete mode 100644 cortex/ingest/ingest_handler.py
 delete mode 100644 cortex/ingest/intake_client.py
 create mode 100644 start.sh

diff --git a/.dockerignore b/.dockerignore
new file mode 100644
index 0000000..8687a3e
--- /dev/null
+++ b/.dockerignore
@@ -0,0 +1,52 @@
+# Git
+.git
+.gitignore
+
+# Docker
+docker-compose.yml
+Dockerfile*
+
+# Python ("**/" = any depth; bare patterns only match at the build-context root)
+**/__pycache__
+**/*.pyc
+**/*.pyo
+*.pyd
+.Python
+*.so
+*.egg
+*.egg-info
+dist
+build
+**/.venv
+**/venv
+
+# Node ("**/" keeps a host core/relay/node_modules from overwriting the image's install)
+**/node_modules
+**/npm-debug.log
+**/yarn-error.log
+
+# IDE
+.vscode
+.idea
+**/*.swp
+**/*.swo
+
+# Logs
+**/*.log
+logs
+
+# Environment (local overrides may hold secrets; exclude them at any depth)
+**/.env.local
+**/.env.*.local
+
+# Backup directories
+*-old
+*-backup*
+
+# OS
+**/.DS_Store
+**/Thumbs.db
+
+# Temp
+**/*.tmp
+tmp
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..382059e
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,48 @@
+# Unified Lyra Container - Relay (Node) + Cortex (Python)
+FROM python:3.11-slim
+
+# Run RUN steps under bash with pipefail so a failed NodeSource download aborts
+# the build instead of being masked by the exit status of `bash -`.
+SHELL ["/bin/bash", "-o", "pipefail", "-c"]
+
+# Install Node.js, npm, and docker CLI (skip "recommended" extras to keep the image small)
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    curl \
+    docker.io \
+    && curl -fsSL https://deb.nodesource.com/setup_18.x | bash - \
+    && apt-get install -y --no-install-recommends nodejs \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /app
+
+# ============================================================
+# Install Python dependencies (Cortex) - manifest only, so the layer caches across code edits
+# ============================================================
+COPY cortex/requirements.txt /app/cortex/requirements.txt
+RUN pip install --no-cache-dir -r /app/cortex/requirements.txt
+
+# ============================================================
+# Install Node dependencies (Relay) - manifest only, so the layer caches across code edits
+# ============================================================
+COPY core/relay/package*.json /app/relay/
+WORKDIR /app/relay
+RUN npm install
+
+# ============================================================
+# Copy application code (after dependency layers so edits do not trigger reinstalls)
+# ============================================================
+WORKDIR /app
+COPY cortex/ /app/cortex/
+COPY core/relay/ /app/relay/
+
+# ============================================================
+# Copy startup script
+# ============================================================
+COPY start.sh /app/start.sh
+RUN chmod +x /app/start.sh
+
+# ============================================================
+# Expose ports and start both services
+# ============================================================
+EXPOSE 7078 7081
+CMD ["/app/start.sh"]
diff --git a/QUICKSTART.md b/QUICKSTART.md
new file mode 100644
index 0000000..a91ca26
--- /dev/null
+++ b/QUICKSTART.md
@@ -0,0 +1,124 @@
+# Lyra Quickstart
+
+## Architecture
+
+Lyra is now a **unified container** running:
+- **Relay** (Node.js on port 7078) - User-facing API with OpenAI-compatible endpoints
+- **Cortex** (Python on port 7081) - Brain with Intake summarization pipeline
+- **Intake** (embedded in Cortex) - Multi-level summarization (L1-L30) whose summaries are sent to Nebula
+
+## Running Lyra
+
+### 1. Start the system
+```bash
+docker-compose up -d
+```
+
+### 2. Check logs
+```bash
+# Follow live logs
+docker-compose logs -f lyra
+
+# Print the logs so far (including startup)
+docker-compose logs lyra
+```
+
+### 3. Verify it's running
+```bash
+# Check Relay
+curl http://localhost:7078/_health
+
+# Check Cortex
+curl http://localhost:7081/_health
+
+# View UI
+open http://localhost:8081
+```
+
+## Making Changes
+
+### Restart after code changes
+```bash
+docker-compose restart lyra
+```
+
+### Rebuild after dependency changes
+```bash
+docker-compose up -d --build lyra
+```
+
+## Architecture Details
+
+```
+┌─────────────────────────────────────┐
+│   Unified Container (lyra)          │
+│                                      │
+│  ┌──────────────┐  ┌─────────────┐  │
+│  │ Relay :7078  │  │Cortex :7081 │  │
+│  │  (Node.js)   │─→│  (Python)   │  │
+│  └──────────────┘  └─────────────┘  │
+│                         │            │
+│                         ↓            │
+│                    ┌─────────┐       │
+│                    │ Intake  │       │
+│                    │Summarize│       │
+│                    └─────────┘       │
+│                         │            │
+└─────────────────────────┼────────────┘
+                          ↓
+                    ┌──────────┐
+                    │  Nebula  │  (external, to be built)
+                    │  (vector │
+                    │ storage) │
+                    └──────────┘
+```
+
+## Endpoints
+
+### Relay (Port 7078)
+- `POST /chat` - Lyra-native chat endpoint
+- `POST /v1/chat/completions` - OpenAI-compatible endpoint
+- `GET /sessions` - List sessions
+- `GET /_health` - Health check
+
+### Cortex (Port 7081)
+- `POST /reason` - Full reasoning pipeline
+- `POST /simple` - Simple chat mode
+- `POST /ingest` - Internal intake endpoint
+- `GET /_health` - Health check
+
+## Environment Variables
+
+Key variables in `.env`:
+```bash
+# LLM Configuration
+PRIMARY_LLM_PROVIDER=anthropic
+ANTHROPIC_API_KEY=sk-...
+
+# Nebula (when available)
+NEBULA_API=http://nebula:7090
+NEBULA_KEY=your-key
+
+# Intake Settings
+INTAKE_LLM=PRIMARY
+SUMMARY_MAX_TOKENS=200
+SUMMARY_TEMPERATURE=0.3
+```
+
+## Data Persistence
+
+Until Nebula is running, summaries are saved to:
+```
+.nebula_fallback/
+  └── {session_id}/
+      ├── L10_20260223_203045.json
+      ├── L20_20260223_204512.json
+      └── L30_20260223_210030.json
+```
+
+Sessions are saved to:
+```
+core/relay/sessions/
+  ├── {session_id}.json
+  └── {session_id}.meta.json
+```
diff --git a/README.md b/README.md
index 37dcab5..ac606c8 100644
--- a/README.md
+++ b/README.md
@@ -1,578 +1,120 @@
-# Project Lyra - README v0.9.1
+# Project Lyra
 
-Lyra is a modular persistent AI companion system with advanced reasoning capabilities and autonomous decision-making.
-It provides memory-backed chat using **Relay** + **Cortex** with integrated **Autonomy System**,
-featuring a multi-stage reasoning pipeline powered by HTTP-based LLM backends.
+**A streamlined AI conversation system with intelligent summarization and memory**
 
-**NEW in v0.9.0:** Trilium Notes integration - Search and create notes from conversations
+Lyra is a unified conversational AI system that processes your thoughts, summarizes conversations at multiple levels, and prepares them for semantic memory storage. Think of it as your personal thought processor—you dump ideas, it makes sense of them, and stores both the raw conversation and progressive summaries.
 
-**Current Version:** v0.9.1 (2025-12-29)
+**Current Version:** v1.0.0 (2026-02-23)
 
-> **Note:** As of v0.6.0, NeoMem is **disabled by default** while we work out integration hiccups in the pipeline. The autonomy system is being refined independently before full memory integration.
+---
 
 ## Mission Statement
 
-The point of Project Lyra is to give an AI chatbot more abilities than a typical chatbot. Typical chatbots are essentially amnesic and forget evertything about your project. Lyra helps keep projects organized and remembers everything you have done. Think of her abilities as a notepad/schedule/database/co-creator/collaborator all with its own executive function. Say something in passing, Lyra remembers it then reminds you of it later.
+Project Lyra is designed to be your **external brain**. Unlike typical chatbots that forget everything, Lyra:
+- **Captures** everything you say in raw form
+- **Summarizes** conversations at multiple granularities (L1-L30)
+- **Stores** both raw and summarized data for future retrieval
+- **Prepares** everything for semantic search via vector embeddings (Nebula, coming soon)
+
+You can vomit ideas at it, and Lyra will organize, summarize, and remember.
 
 ---
 
 ## Architecture Overview
 
-Project Lyra operates as a **single docker-compose deployment** with multiple Docker containers networked together in a microservices architecture. Like how the brain has regions, Lyra has modules:
+Lyra runs as a **unified Docker container** with a clean separation of concerns:
 
-### Core Services
+```
+┌─────────────────────────────────────────────┐
+│   Unified Container (lyra)                  │
+│                                              │
+│  ┌──────────────┐  ┌──────────────────────┐ │
+│  │ Relay :7078  │  │   Cortex :7081       │ │
+│  │  (Node.js)   │→ │   (Python FastAPI)   │ │
+│  │              │  │                       │ │
+│  │ - API Gateway│  │ - /reason (full)     │ │
+│  │ - Sessions   │  │ - /simple (fast)     │ │
+│  │ - OpenAI API │  │ - /ingest (intake)   │ │
+│  └──────────────┘  └──────────────────────┘ │
+│                            │                 │
+│                            ↓                 │
+│                    ┌──────────────┐          │
+│                    │   Intake     │          │
+│                    │  (embedded)  │          │
+│                    │              │          │
+│                    │ - L1-L30     │          │
+│                    │ - Summary    │          │
+│                    │ - Buffer     │          │
+│                    └──────────────┘          │
+│                            │                 │
+└────────────────────────────┼─────────────────┘
+                             ↓
+                      ┌─────────────┐
+                      │   Nebula    │  (coming soon)
+                      │  (vector    │
+                      │   storage)  │
+                      └─────────────┘
+```
 
-**1. Relay** (Node.js/Express) - Port 7078
-- Main orchestrator and message router
-- Coordinates all module interactions
+### Components
+
+**1. Relay (Node.js - Port 7078)**
+- User-facing API gateway
 - OpenAI-compatible endpoint: `POST /v1/chat/completions`
-- Internal endpoint: `POST /chat`
-- Dual-mode routing: Standard Mode (simple chat) or Cortex Mode (full reasoning)
-- Server-side session persistence with file-based storage
-- Session management API: `GET/POST/PATCH/DELETE /sessions`
-- Manages async calls to Cortex ingest
-- *(NeoMem integration currently disabled in v0.6.0)*
+- Session management (save, load, rename, delete)
+- Proxies requests to Cortex
 
-**2. UI** (Static HTML) - Port 8081 (nginx)
-- Browser-based chat interface with cyberpunk theme
-- Mode selector (Standard/Cortex) in header
-- Settings modal with backend selection and session management
-- Light/Dark mode toggle (dark by default)
-- **NEW in v0.8.0:** "🧠 Show Work" button for real-time thinking stream
-  - Opens popup window with live SSE connection
-  - Color-coded events: thinking, tool calls, results, completion
-  - Auto-scrolling with animations
-  - Session-aware (matches current chat session)
-- Server-synced session management (persists across browsers and reboots)
-- OpenAI-compatible message format
+**2. Cortex (Python - Port 7081)**
+- Main reasoning and processing brain
+- Multi-stage reasoning pipeline
+- LLM routing to different backends
+- Embedded Intake module
 
-**3. NeoMem** (Python/FastAPI) - Port 7077 - **DISABLED IN v0.6.0**
-- Long-term memory database (fork of Mem0 OSS)
-- Vector storage (PostgreSQL + pgvector) + Graph storage (Neo4j)
-- RESTful API: `/memories`, `/search`
-- Semantic memory updates and retrieval
-- No external SDK dependencies - fully local
-- **Status:** Currently disabled while pipeline integration is refined
+**3. Intake (Python Module - Embedded)**
+- Short-term memory buffer (200 messages per session)
+- Multi-level summarization:
+  - **L1** (5 messages): Ultra-short summary
+  - **L5** (10 messages): Short overview
+  - **L10** (10 messages): "Reality Check" - tone, intent, direction
+  - **L20** (merged L10s): "Session Overview" - progress and themes
+  - **L30** (merged L20s): "Continuity Report" - high-level reflection
+- Sends summaries to Nebula (HTTP POST with disk fallback)
 
-### Reasoning Layer
-
-**4. Cortex** (Python/FastAPI) - Port 7081
-- Primary reasoning engine with multi-stage pipeline and autonomy system
-- **Includes embedded Intake module** (no separate service as of v0.5.1)
-- **Integrated Autonomy System** (NEW in v0.6.0) - See Autonomy System section below
-- **Tool Calling System** (NEW in v0.8.0) - Agentic execution for Standard Mode
-  - Sandboxed code execution (Python, JavaScript, Bash)
-  - Web search via Tavily API
-  - **Trilium knowledge base integration** (NEW in v0.9.0)
-  - Multi-iteration autonomous tool use (max 5 iterations)
-  - Real-time thinking stream via SSE
-- **Dual Operating Modes:**
-  - **Standard Mode** (v0.7.0) - Simple chatbot with context retention + tool calling (v0.8.0)
-    - Bypasses reflection, reasoning, refinement stages
-    - Direct LLM call with conversation history
-    - User-selectable backend (SECONDARY, OPENAI, or custom)
-    - **NEW:** Autonomous tool calling for code execution, web search, knowledge queries
-    - **NEW:** "Show Your Work" real-time thinking stream
-    - Faster responses for coding and practical tasks
-  - **Cortex Mode** - Full 4-stage reasoning pipeline
-    1. **Reflection** - Generates meta-awareness notes about conversation
-    2. **Reasoning** - Creates initial draft answer using context
-    3. **Refinement** - Polishes and improves the draft
-    4. **Persona** - Applies Lyra's personality and speaking style
-- Integrates with Intake for short-term context via internal Python imports
-- Flexible LLM router supporting multiple backends via HTTP
-- **Endpoints:**
-  - `POST /reason` - Main reasoning pipeline (Cortex Mode)
-  - `POST /simple` - Direct LLM chat with tool calling (Standard Mode)
-  - `GET /stream/thinking/{session_id}` - SSE stream for thinking events **NEW in v0.8.0**
-  - `POST /ingest` - Receives conversation exchanges from Relay
-  - `GET /health` - Service health check
-  - `GET /debug/sessions` - Inspect in-memory SESSIONS state
-  - `GET /debug/summary` - Test summarization for a session
-
-**5. Intake** (Python Module) - **Embedded in Cortex**
-- **No longer a standalone service** - runs as Python module inside Cortex container
-- Short-term memory management with session-based circular buffer
-- In-memory SESSIONS dictionary: `session_id → {buffer: deque(maxlen=200), created_at: timestamp}`
-- Multi-level summarization (L1/L5/L10/L20/L30) produced by `summarize_context()`
-- Deferred summarization - actual summary generation happens during `/reason` call
-- Internal Python API:
-  - `add_exchange_internal(exchange)` - Direct function call from Cortex
-  - `summarize_context(session_id, exchanges)` - Async LLM-based summarization
-  - `SESSIONS` - Module-level global state (requires single Uvicorn worker)
-
-### LLM Backends (HTTP-based)
-
-**All LLM communication is done via HTTP APIs:**
-- **PRIMARY**: llama.cpp server (`http://10.0.0.44:8080`) - AMD MI50 GPU backend
-- **SECONDARY**: Ollama server (`http://10.0.0.3:11434`) - RTX 3090 backend
-  - Model: qwen2.5:7b-instruct-q4_K_M
-- **CLOUD**: OpenAI API (`https://api.openai.com/v1`) - Cloud-based models
-  - Model: gpt-4o-mini
-- **FALLBACK**: Local backup (`http://10.0.0.41:11435`) - Emergency fallback
-  - Model: llama-3.2-8b-instruct
-
-Each module can be configured to use a different backend via environment variables.
-
-### Autonomy System (NEW in v0.6.0)
-
-**Cortex Autonomy Subsystems** - Multi-layered autonomous decision-making and learning
-- **Executive Layer** [cortex/autonomy/executive/](cortex/autonomy/executive/)
-  - High-level planning and goal setting
-  - Multi-step reasoning for complex objectives
-  - Strategic decision making
-- **Decision Engine** [cortex/autonomy/tools/decision_engine.py](cortex/autonomy/tools/decision_engine.py)
-  - Autonomous decision-making framework
-  - Option evaluation and selection
-  - Coordinated decision orchestration
-- **Autonomous Actions** [cortex/autonomy/actions/](cortex/autonomy/actions/)
-  - Self-initiated action execution
-  - Context-aware behavior implementation
-  - Action logging and tracking
-- **Pattern Learning** [cortex/autonomy/learning/](cortex/autonomy/learning/)
-  - Learns from interaction patterns
-  - Identifies recurring user needs
-  - Adaptive behavior refinement
-- **Proactive Monitoring** [cortex/autonomy/proactive/](cortex/autonomy/proactive/)
-  - System state monitoring
-  - Intervention opportunity detection
-  - Background awareness capabilities
-- **Self-Analysis** [cortex/autonomy/self/](cortex/autonomy/self/)
-  - Performance tracking and analysis
-  - Cognitive pattern identification
-  - Self-state persistence in [cortex/data/self_state.json](cortex/data/self_state.json)
-- **Orchestrator** [cortex/autonomy/tools/orchestrator.py](cortex/autonomy/tools/orchestrator.py)
-  - Coordinates all autonomy subsystems
-  - Manages tool selection and execution
-  - Handles external integrations (with enable/disable controls)
-
-**Autonomy Architecture:**
-The autonomy system operates in coordinated layers, all maintaining state in `self_state.json`:
-1. Executive Layer → Planning and goals
-2. Decision Layer → Evaluation and choices
-3. Action Layer → Execution
-4. Learning Layer → Pattern adaptation
-5. Monitoring Layer → Proactive awareness
+**4. Nebula (Future - Port 7090)**
+- Vector database for semantic memory
+- RAG (Retrieval-Augmented Generation)
+- Memory resurfacing based on similarity
 
 ---
 
-## Data Flow Architecture (v0.7.0)
+## What Makes Lyra Different?
 
-### Standard Mode Flow (NEW in v0.7.0):
+### Progressive Summarization
+Most chatbots either keep raw history (expensive) or forget everything (useless). Lyra keeps both the raw history and compact summaries:
+- **Raw storage**: Every conversation turn saved
+- **L1-L30 summaries**: Multiple granularities for different use cases
+  - L1: "What just happened?" (immediate context)
+  - L10: "What's the vibe?" (tone and direction)
+  - L20: "What did we accomplish?" (session overview)
+  - L30: "What's the big picture?" (continuity across sessions)
 
+### Nebula-Ready Architecture
+Summaries are sent via HTTP to Nebula (when available), with automatic disk fallback:
 ```
-User (UI) → POST /v1/chat/completions {mode: "standard", backend: "SECONDARY"}
-  ↓
-Relay (7078)
-  ↓ POST /simple
-Cortex (7081)
-  ↓ (internal Python call)
-Intake module → get_recent_messages() (last 20 messages)
-  ↓
-Direct LLM call (user-selected backend: SECONDARY/OPENAI/custom)
-  ↓
-Returns simple response to Relay
-  ↓
-Relay → POST /ingest (async)
-  ↓
-Cortex → add_exchange_internal() → SESSIONS buffer
-  ↓
-Relay → POST /sessions/:id (save session to file)
-  ↓
-Relay → UI (returns final response)
-
-Note: Bypasses reflection, reasoning, refinement, persona stages
+.nebula_fallback/
+  └── {session_id}/
+      ├── L10_20260223_203045.json
+      ├── L20_20260223_204512.json
+      └── L30_20260223_210030.json
 ```
 
-### Cortex Mode Flow (Full Reasoning):
-
-```
-User (UI) → POST /v1/chat/completions {mode: "cortex"}
-  ↓
-Relay (7078)
-  ↓ POST /reason
-Cortex (7081)
-  ↓ (internal Python call)
-Intake module → summarize_context()
-  ↓
-Autonomy System → Decision evaluation & pattern learning
-  ↓
-Cortex processes (4 stages):
-  1. reflection.py → meta-awareness notes (CLOUD backend)
-  2. reasoning.py → draft answer (PRIMARY backend, autonomy-aware)
-  3. refine.py → refined answer (PRIMARY backend)
-  4. persona/speak.py → Lyra personality (CLOUD backend, autonomy-aware)
-  ↓
-Returns persona answer to Relay
-  ↓
-Relay → POST /ingest (async)
-  ↓
-Cortex → add_exchange_internal() → SESSIONS buffer
-  ↓
-Autonomy System → Update self_state.json (pattern tracking)
-  ↓
-Relay → POST /sessions/:id (save session to file)
-  ↓
-Relay → UI (returns final response)
-
-Note: NeoMem integration disabled in v0.6.0
-```
-
-### Session Persistence Flow (NEW in v0.7.0):
-
-```
-UI loads → GET /sessions → Relay → List all sessions from files → UI dropdown
-User sends message → POST /sessions/:id → Relay → Save to sessions/*.json
-User renames session → PATCH /sessions/:id/metadata → Relay → Update *.meta.json
-User deletes session → DELETE /sessions/:id → Relay → Remove session files
-
-Sessions stored in: core/relay/sessions/
-- {sessionId}.json (conversation history)
-- {sessionId}.meta.json (name, timestamps, metadata)
-```
-
-### Cortex 4-Stage Reasoning Pipeline:
-
-1. **Reflection** (`reflection.py`) - Cloud LLM (OpenAI)
-   - Analyzes user intent and conversation context
-   - Generates meta-awareness notes
-   - "What is the user really asking?"
-
-2. **Reasoning** (`reasoning.py`) - Primary LLM (llama.cpp)
-   - Retrieves short-term context from Intake module
-   - Creates initial draft answer
-   - Integrates context, reflection notes, and user prompt
-
-3. **Refinement** (`refine.py`) - Primary LLM (llama.cpp)
-   - Polishes the draft answer
-   - Improves clarity and coherence
-   - Ensures factual consistency
-
-4. **Persona** (`speak.py`) - Cloud LLM (OpenAI)
-   - Applies Lyra's personality and speaking style
-   - Natural, conversational output
-   - Final answer returned to user
-
----
-
-## Features
-
-### Core Services
-
-**Relay**:
-- Main orchestrator and message router
-- OpenAI-compatible endpoint: `POST /v1/chat/completions`
-- Internal endpoint: `POST /chat`
-- Health check: `GET /_health`
-- **NEW:** Dual-mode routing (Standard/Cortex)
-- **NEW:** Server-side session persistence with CRUD API
-- **NEW:** Session management endpoints:
-  - `GET /sessions` - List all sessions
-  - `GET /sessions/:id` - Retrieve session history
-  - `POST /sessions/:id` - Save session history
-  - `PATCH /sessions/:id/metadata` - Update session metadata
-  - `DELETE /sessions/:id` - Delete session
-- Async non-blocking calls to Cortex
-- Shared request handler for code reuse
-- Comprehensive error handling
-
-**NeoMem (Memory Engine)**:
-- Forked from Mem0 OSS - fully independent
-- Drop-in compatible API (`/memories`, `/search`)
-- Local-first: runs on FastAPI with Postgres + Neo4j
-- No external SDK dependencies
-- Semantic memory updates - compares embeddings and performs in-place updates
-- Default service: `neomem-api` (port 7077)
-
-**UI**:
-- Lightweight static HTML chat interface
-- Cyberpunk theme with light/dark mode toggle
-- **NEW:** Mode selector (Standard/Cortex) in header
-- **NEW:** Settings modal (⚙ button) with:
-  - Backend selection for Standard Mode (SECONDARY/OPENAI/custom)
-  - Session management (view, delete sessions)
-  - Theme toggle (dark mode default)
-- **NEW:** Server-synced session management
-  - Sessions persist across browsers and reboots
-  - Rename sessions with custom names
-  - Delete sessions with confirmation
-  - Automatic session save on every message
-- OpenAI message format support
-
-### Reasoning Layer
-
-**Cortex** (v0.7.0):
-- **NEW:** Dual operating modes:
-  - **Standard Mode** - Simple chat with context (`/simple` endpoint)
-    - User-selectable backend (SECONDARY, OPENAI, or custom)
-    - Full conversation history via Intake integration
-    - Bypasses reasoning pipeline for faster responses
-  - **Cortex Mode** - Full reasoning pipeline (`/reason` endpoint)
-    - Multi-stage processing: reflection → reasoning → refine → persona
-    - Per-stage backend selection
-    - Autonomy system integration
-- Flexible LLM backend routing via HTTP
-- Async processing throughout
-- Embedded Intake module for short-term context
-- `/reason`, `/simple`, `/ingest`, `/health`, `/debug/sessions`, `/debug/summary` endpoints
-- Lenient error handling - never fails the chat pipeline
-
-**Intake** (Embedded Module):
-- **Architectural change**: Now runs as Python module inside Cortex container
-- In-memory SESSIONS management (session_id → buffer)
-- Multi-level summarization: L1 (ultra-short), L5 (short), L10 (medium), L20 (detailed), L30 (full)
-- Deferred summarization strategy - summaries generated during `/reason` call
-- `bg_summarize()` is a logging stub - actual work deferred
-- **Single-worker constraint**: SESSIONS requires single Uvicorn worker or Redis/shared storage
-
-**LLM Router**:
-- Dynamic backend selection via HTTP
-- Environment-driven configuration
-- Support for llama.cpp, Ollama, OpenAI, custom endpoints
-- Per-module backend preferences:
-  - `CORTEX_LLM=SECONDARY` (Ollama for reasoning)
-  - `INTAKE_LLM=PRIMARY` (llama.cpp for summarization)
-  - `SPEAK_LLM=OPENAI` (Cloud for persona)
-  - `NEOMEM_LLM=PRIMARY` (llama.cpp for memory operations)
-
-### Beta Lyrae (RAG Memory DB) - Currently Disabled
-
-- **RAG Knowledge DB - Beta Lyrae (sheliak)**
-  - This module implements the **Retrieval-Augmented Generation (RAG)** layer for Project Lyra.
-  - It serves as the long-term searchable memory store that Cortex and Relay can query for relevant context before reasoning or response generation.
-  - **Status**: Disabled in docker-compose.yml (v0.5.1)
-
-The system uses:
-- **ChromaDB** for persistent vector storage
-- **OpenAI Embeddings (`text-embedding-3-small`)** for semantic similarity
-- **FastAPI** (port 7090) for the `/rag/search` REST endpoint
-
-Directory Layout:
-```
-rag/
-├── rag_chat_import.py    # imports JSON chat logs
-├── rag_docs_import.py    # (planned) PDF/EPUB/manual importer
-├── rag_build.py          # legacy single-folder builder
-├── rag_query.py          # command-line query helper
-├── rag_api.py            # FastAPI service providing /rag/search
-├── chromadb/             # persistent vector store
-├── chatlogs/             # organized source data
-│   ├── poker/
-│   ├── work/
-│   ├── lyra/
-│   ├── personal/
-│   └── ...
-└── import.log            # progress log for batch runs
-```
-
-**OpenAI chatlog importer features:**
-- Recursive folder indexing with **category detection** from directory name
-- Smart chunking for long messages (5,000 chars per slice)
-- Automatic deduplication using SHA-1 hash of file + chunk
-- Timestamps for both file modification and import time
-- Full progress logging via tqdm
-- Safe to run in background with `nohup … &`
-
----
-
-## Docker Deployment
-
-All services run in a single docker-compose stack with the following containers:
-
-**Active Services:**
-- **relay** - Main orchestrator (port 7078)
-- **cortex** - Reasoning engine with embedded Intake and Autonomy System (port 7081)
-
-**Disabled Services (v0.6.0):**
-- **neomem-postgres** - PostgreSQL with pgvector extension (port 5432) - *disabled while refining pipeline*
-- **neomem-neo4j** - Neo4j graph database (ports 7474, 7687) - *disabled while refining pipeline*
-- **neomem-api** - NeoMem memory service (port 7077) - *disabled while refining pipeline*
-- **intake** - No longer needed (embedded in Cortex as of v0.5.1)
-- **rag** - Beta Lyrae RAG service (port 7090) - currently disabled
-
-All containers communicate via the `lyra_net` Docker bridge network.
-
-## External LLM Services
-
-The following LLM backends are accessed via HTTP (not part of docker-compose):
-
-- **llama.cpp Server** (`http://10.0.0.44:8080`)
-  - AMD MI50 GPU-accelerated inference
-  - Primary backend for reasoning and refinement stages
-  - Model path: `/model`
-
-- **Ollama Server** (`http://10.0.0.3:11434`)
-  - RTX 3090 GPU-accelerated inference
-  - Secondary/configurable backend
-  - Model: qwen2.5:7b-instruct-q4_K_M
-
-- **OpenAI API** (`https://api.openai.com/v1`)
-  - Cloud-based inference
-  - Used for reflection and persona stages
-  - Model: gpt-4o-mini
-
-- **Fallback Server** (`http://10.0.0.41:11435`)
-  - Emergency backup endpoint
-  - Local llama-3.2-8b-instruct model
-
----
-
-## Version History
-
-### v0.9.0 (2025-12-29) - Current Release
-**Major Feature: Trilium Notes Integration**
-- ✅ Added Trilium ETAPI integration for knowledge base access
-- ✅ `search_notes()` tool for searching personal notes during conversations
-- ✅ `create_note()` tool for capturing insights and information
-- ✅ ETAPI authentication with secure token management
-- ✅ Complete setup documentation and API reference
-- ✅ Environment configuration with feature flag (`ENABLE_TRILIUM`)
-- ✅ Automatic parent note handling (defaults to "root")
-- ✅ Connection error handling and user-friendly messages
-
-**Key Capabilities:**
-- Search your Trilium notes during conversations for context
-- Create new notes from conversation insights automatically
-- Cross-reference information between chat and knowledge base
-- Future: Find duplicates, suggest organization, summarize notes
-
-**Documentation:**
-- Added [TRILIUM_SETUP.md](TRILIUM_SETUP.md) - Complete setup guide
-- Added [docs/TRILIUM_API.md](docs/TRILIUM_API.md) - Full API reference
-
-### v0.8.0 (2025-12-26)
-**Major Feature: Agentic Tool Calling + "Show Your Work"**
-- ✅ Added tool calling system for Standard Mode
-- ✅ Real-time thinking stream visualization
-- ✅ Sandboxed code execution (Python, JavaScript, Bash)
-- ✅ Web search integration via Tavily API
-- ✅ Server-Sent Events (SSE) for live tool execution updates
-
-### v0.7.0 (2025-12-21)
-**Major Features: Standard Mode + Backend Selection + Session Persistence**
-- ✅ Added Standard Mode for simple chatbot functionality
-- ✅ UI mode selector (Standard/Cortex) in header
-- ✅ Settings modal with backend selection for Standard Mode
-- ✅ Server-side session persistence with file-based storage
-- ✅ Session management UI (view, rename, delete sessions)
-- ✅ Light/Dark mode toggle (dark by default)
-- ✅ Context retention in Standard Mode via Intake integration
-- ✅ Fixed modal positioning and z-index issues
-- ✅ Cortex `/simple` endpoint for direct LLM calls
-- ✅ Session CRUD API in Relay
-- ✅ Full backward compatibility - Cortex Mode unchanged
-
-**Key Changes:**
-- Standard Mode bypasses 6 of 7 reasoning stages for faster responses
-- Sessions now sync across browsers and survive container restarts
-- User can select SECONDARY (Ollama), OPENAI, or custom backend for Standard Mode
-- Theme preference and backend selection persisted in localStorage
-- Session files stored in `core/relay/sessions/` directory
-
-### v0.6.0 (2025-12-18)
-**Major Feature: Autonomy System (Phase 1, 2, and 2.5)**
-- ✅ Added autonomous decision-making framework
-- ✅ Implemented executive planning and goal-setting layer
-- ✅ Added pattern learning system for adaptive behavior
-- ✅ Implemented proactive monitoring capabilities
-- ✅ Created self-analysis and performance tracking system
-- ✅ Integrated self-state persistence (`cortex/data/self_state.json`)
-- ✅ Built decision engine with orchestrator coordination
-- ✅ Added autonomous action execution framework
-- ✅ Integrated autonomy into reasoning and persona layers
-- ✅ Created comprehensive test suites for autonomy features
-- ✅ Added complete system breakdown documentation
-
-**Architecture Changes:**
-- Autonomy system integrated into Cortex reasoning pipeline
-- Multi-layered autonomous decision-making architecture
-- Self-state tracking across sessions
-- NeoMem disabled by default while refining pipeline integration
-- Enhanced orchestrator with flexible service controls
-
-**Documentation:**
-- Added [PROJECT_LYRA_COMPLETE_BREAKDOWN.md](docs/PROJECT_LYRA_COMPLETE_BREAKDOWN.md)
-- Updated changelog with comprehensive autonomy system details
-
-### v0.5.1 (2025-12-11)
-**Critical Intake Integration Fixes:**
-- ✅ Fixed `bg_summarize()` NameError preventing SESSIONS persistence
-- ✅ Fixed `/ingest` endpoint unreachable code
-- ✅ Added `cortex/intake/__init__.py` for proper package structure
-- ✅ Added diagnostic logging to verify SESSIONS singleton behavior
-- ✅ Added `/debug/sessions` and `/debug/summary` endpoints
-- ✅ Documented single-worker constraint in Dockerfile
-- ✅ Implemented lenient error handling (never fails chat pipeline)
-- ✅ Intake now embedded in Cortex - no longer standalone service
-
-**Architecture Changes:**
-- Intake module runs inside Cortex container as pure Python import
-- No HTTP calls between Cortex and Intake (internal function calls)
-- SESSIONS persist correctly in Uvicorn worker
-- Deferred summarization strategy (summaries generated during `/reason`)
-
-### v0.5.0 (2025-11-28)
-- ✅ Fixed all critical API wiring issues
-- ✅ Added OpenAI-compatible endpoint to Relay (`/v1/chat/completions`)
-- ✅ Fixed Cortex → Intake integration
-- ✅ Added missing Python package `__init__.py` files
-- ✅ End-to-end message flow verified and working
-
-### Infrastructure v1.0.0 (2025-11-26)
-- Consolidated 9 scattered `.env` files into single source of truth
-- Multi-backend LLM strategy implemented
-- Docker Compose consolidation
-- Created `.env.example` security templates
-
-### v0.4.x (Major Rewire)
-- Cortex multi-stage reasoning pipeline
-- LLM router with multi-backend support
-- Major architectural restructuring
-
-### v0.3.x
-- Beta Lyrae RAG system
-- NeoMem integration
-- Basic Cortex reasoning loop
-
----
-
-## Known Issues (v0.7.0)
-
-### Temporarily Disabled
-- **NeoMem disabled by default** - Being refined independently before full integration
-  - PostgreSQL + pgvector storage inactive
-  - Neo4j graph database inactive
-  - Memory persistence endpoints not active
-- RAG service (Beta Lyrae) currently disabled in docker-compose.yml
-
-### Standard Mode Limitations
-- No reflection, reasoning, or refinement stages (by design)
-- DeepSeek R1 not recommended for Standard Mode (generates reasoning artifacts)
-- No RAG integration (same as Cortex Mode - currently disabled)
-- No NeoMem memory storage (same as Cortex Mode - currently disabled)
-
-### Session Management Limitations
-- Sessions stored in container filesystem - requires volume mount for true persistence
-- No session import/export functionality yet
-- No session search or filtering
-- Old localStorage sessions don't automatically migrate to server
-
-### Operational Notes
-- **Single-worker constraint**: Cortex must run with single Uvicorn worker to maintain SESSIONS state
-  - Multi-worker scaling requires migrating SESSIONS to Redis or shared storage
-- Diagnostic endpoints (`/debug/sessions`, `/debug/summary`) available for troubleshooting
-- Backend selection only affects Standard Mode - Cortex Mode uses environment-configured backends
-
-### Future Enhancements
-- Re-enable NeoMem integration after pipeline refinement
-- Full autonomy system maturation and optimization
-- Re-enable RAG service integration
-- Session import/export functionality
-- Session search and filtering UI
-- Migrate SESSIONS to Redis for multi-worker support
-- Add request correlation IDs for tracing
-- Comprehensive health checks across all services
-- Enhanced pattern learning with long-term memory integration
+### Dual Mode Operation
+- **Simple Mode** (`/simple`; selected through Relay with `"mode": "standard"`): Fast, direct LLM responses
+- **Cortex Mode** (`/reason`): Full 4-stage reasoning pipeline
+  1. Reflection (meta-awareness)
+  2. Reasoning (draft)
+  3. Refinement (polish)
+  4. Persona (Lyra's voice)
 
 ---
 
@@ -580,323 +122,362 @@ The following LLM backends are accessed via HTTP (not part of docker-compose):
 
 ### Prerequisites
 - Docker + Docker Compose
-- At least one HTTP-accessible LLM endpoint (llama.cpp, Ollama, or OpenAI API key)
+- At least one LLM backend (llama.cpp, Ollama, OpenAI API)
 
-### Setup
-1. Copy `.env.example` to `.env` and configure your LLM backend URLs and API keys:
-   ```bash
-   # Required: Configure at least one LLM backend
-   LLM_PRIMARY_URL=http://10.0.0.44:8080       # llama.cpp
-   LLM_SECONDARY_URL=http://10.0.0.3:11434     # Ollama
-   OPENAI_API_KEY=sk-...                        # OpenAI
-   ```
+### Run It
 
-2. Start all services with docker-compose:
-   ```bash
-   docker-compose up -d
-   ```
-
-3. Check service health:
-   ```bash
-   # Relay health
-   curl http://localhost:7078/_health
-
-   # Cortex health
-   curl http://localhost:7081/health
-
-   # NeoMem health
-   curl http://localhost:7077/health
-   ```
-
-4. Access the UI at `http://localhost:8081`
-
-### Using the UI
-
-**Mode Selection:**
-- Use the **Mode** dropdown in the header to switch between:
-  - **Standard** - Simple chatbot for coding and practical tasks
-  - **Cortex** - Full reasoning pipeline with autonomy features
-
-**Settings Menu:**
-1. Click the **⚙ Settings** button in the header
-2. **Backend Selection** (Standard Mode only):
-   - Choose **SECONDARY** (Ollama/Qwen on 3090) - Fast, local
-   - Choose **OPENAI** (GPT-4o-mini) - Cloud-based, high quality
-   - Enter custom backend name for advanced configurations
-3. **Session Management**:
-   - View all saved sessions with message counts and timestamps
-   - Click 🗑️ to delete unwanted sessions
-4. **Theme Toggle**:
-   - Click **🌙 Dark Mode** or **☀️ Light Mode** to switch themes
-
-**Session Management:**
-- Sessions automatically save on every message
-- Use the **Session** dropdown to switch between sessions
-- Click **➕ New** to create a new session
-- Click **✏️ Rename** to rename the current session
-- Sessions persist across browsers and container restarts
-
-### Test
-
-**Test Standard Mode:**
 ```bash
+# 1. Create .env file with your LLM backend
+cp .env.example .env
+# Edit .env with your LLM URLs and API keys
+
+# 2. Build and start
+docker-compose up -d --build
+
+# 3. Check health
+curl http://localhost:7078/_health  # Relay
+curl http://localhost:7081/_health  # Cortex
+
+# 4. Open UI
+open http://localhost:8081  # macOS; on Linux use xdg-open
+```
+
+### Test It
+
+```bash
+# Simple chat
 curl -X POST http://localhost:7078/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "mode": "standard",
-    "backend": "SECONDARY",
     "messages": [{"role": "user", "content": "Hello!"}],
     "sessionId": "test"
   }'
-```
 
-**Test Cortex Mode (Full Reasoning):**
-```bash
+# Full reasoning pipeline
 curl -X POST http://localhost:7078/v1/chat/completions \
   -H "Content-Type: application/json" \
   -d '{
     "mode": "cortex",
-    "messages": [{"role": "user", "content": "Hello Lyra!"}],
+    "messages": [{"role": "user", "content": "Explain quantum computing"}],
     "sessionId": "test"
   }'
 ```
 
-**Test Cortex /ingest endpoint:**
-```bash
-curl -X POST http://localhost:7081/ingest \
-  -H "Content-Type: application/json" \
-  -d '{
-    "session_id": "test",
-    "user_msg": "Hello",
-    "assistant_msg": "Hi there!"
-  }'
+---
+
+## Data Flow
+
+### Simple Mode (Fast Path)
+```
+User → Relay → Cortex (/simple) → Direct LLM → Response
+                  ↓
+              Intake (buffer + summarize on triggers)
+                  ↓
+              Nebula (summaries only)
 ```
 
-**Inspect SESSIONS state:**
-```bash
-curl http://localhost:7081/debug/sessions
+### Cortex Mode (Full Pipeline)
 ```
-
-**Get summary for a session:**
-```bash
-curl "http://localhost:7081/debug/summary?session_id=test"
+User → Relay → Cortex (/reason)
+                  ↓
+              1. Reflection (what's being asked?)
+                  ↓
+              2. Reasoning (draft answer)
+                  ↓
+              3. Refinement (polish)
+                  ↓
+              4. Persona (Lyra's voice)
+                  ↓
+              Intake (buffer + multi-level summaries)
+                  ↓
+              Nebula (raw + summaries)
+                  ↓
+              Response
 ```
 
-**List all sessions:**
-```bash
-curl http://localhost:7078/sessions
-```
-
-**Get session history:**
-```bash
-curl http://localhost:7078/sessions/sess-abc123
-```
-
-**Delete a session:**
-```bash
-curl -X DELETE http://localhost:7078/sessions/sess-abc123
-```
-
-All backend databases (PostgreSQL and Neo4j) are automatically started as part of the docker-compose stack.
-
 ---
 
-## Environment Variables
+## Configuration
 
-### LLM Backend Configuration
+### Environment Variables
 
-**Backend URLs (Full API endpoints):**
+**LLM Backends:**
 ```bash
-LLM_PRIMARY_URL=http://10.0.0.44:8080           # llama.cpp
+# Primary backend (llama.cpp on AMD MI50)
+LLM_PRIMARY_URL=http://10.0.0.44:8080
 LLM_PRIMARY_MODEL=/model
 
-LLM_SECONDARY_URL=http://10.0.0.3:11434         # Ollama
+# Secondary backend (Ollama on RTX 3090)
+LLM_SECONDARY_URL=http://10.0.0.3:11434
 LLM_SECONDARY_MODEL=qwen2.5:7b-instruct-q4_K_M
 
+# Cloud backend (OpenAI)
 LLM_OPENAI_URL=https://api.openai.com/v1
 LLM_OPENAI_MODEL=gpt-4o-mini
 OPENAI_API_KEY=sk-...
 ```
 
-**Module-specific backend selection:**
+**Module-Specific Backend Selection:**
 ```bash
-CORTEX_LLM=SECONDARY         # Use Ollama for reasoning
-INTAKE_LLM=PRIMARY           # Use llama.cpp for summarization
-SPEAK_LLM=OPENAI             # Use OpenAI for persona
-NEOMEM_LLM=PRIMARY           # Use llama.cpp for memory
-UI_LLM=OPENAI                # Use OpenAI for UI
-RELAY_LLM=PRIMARY            # Use llama.cpp for relay
-STANDARD_MODE_LLM=SECONDARY  # Default backend for Standard Mode (NEW in v0.7.0)
+CORTEX_LLM=PRIMARY       # Reasoning engine
+INTAKE_LLM=PRIMARY       # Summarization
+SPEAK_LLM=OPENAI         # Persona (final voice)
+STANDARD_MODE_LLM=SECONDARY  # Simple mode default
 ```
 
-### Database Configuration
+**Nebula Integration:**
 ```bash
-POSTGRES_USER=neomem
-POSTGRES_PASSWORD=neomempass
-POSTGRES_DB=neomem
-POSTGRES_HOST=neomem-postgres
-POSTGRES_PORT=5432
-
-NEO4J_URI=bolt://neomem-neo4j:7687
-NEO4J_USERNAME=neo4j
-NEO4J_PASSWORD=neomemgraph
+NEBULA_API=http://localhost:7090  # When Nebula is running
+NEBULA_KEY=your-api-key           # Optional auth
 ```
 
-### Service URLs (Internal Docker Network)
+**Intake Settings:**
 ```bash
-NEOMEM_API=http://neomem-api:7077
-CORTEX_API=http://cortex:7081
-CORTEX_REASON_URL=http://cortex:7081/reason
-CORTEX_SIMPLE_URL=http://cortex:7081/simple      # NEW in v0.7.0
-CORTEX_INGEST_URL=http://cortex:7081/ingest
-RELAY_URL=http://relay:7078
+INTAKE_LLM=PRIMARY
+SUMMARY_MAX_TOKENS=200
+SUMMARY_TEMPERATURE=0.3
 ```
 
-### Feature Flags
-```bash
-CORTEX_ENABLED=true
-MEMORY_ENABLED=true
-PERSONA_ENABLED=false
-DEBUG_PROMPT=true
-VERBOSE_DEBUG=true
-ENABLE_TRILIUM=true          # NEW in v0.9.0
-```
-
-For complete environment variable reference, see [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md).
-
 ---
 
-## Documentation
+## API Reference
 
-- [CHANGELOG.md](CHANGELOG.md) - Detailed version history
-- [PROJECT_SUMMARY.md](PROJECT_SUMMARY.md) - Comprehensive project overview for AI context
-- [ENVIRONMENT_VARIABLES.md](ENVIRONMENT_VARIABLES.md) - Environment variable reference
-- [DEPRECATED_FILES.md](DEPRECATED_FILES.md) - Deprecated files and migration guide
+### Relay Endpoints (Port 7078)
+
+**Chat (OpenAI-compatible):**
+```bash
+POST /v1/chat/completions
+{
+  "mode": "standard" | "cortex",
+  "messages": [{"role": "user", "content": "..."}],
+  "sessionId": "session-123"
+}
+```
+
+**Sessions:**
+```bash
+GET    /sessions           # List all sessions
+GET    /sessions/:id       # Get session history
+POST   /sessions/:id       # Save session
+PATCH  /sessions/:id/metadata  # Rename session
+DELETE /sessions/:id       # Delete session
+```
+
+**Health:**
+```bash
+GET /_health
+```
+
+### Cortex Endpoints (Port 7081)
+
+**Reasoning:**
+```bash
+POST /reason
+{
+  "session_id": "session-123",
+  "user_prompt": "Your question here"
+}
+```
+
+**Simple Mode:**
+```bash
+POST /simple
+{
+  "session_id": "session-123",
+  "user_prompt": "Your question here",
+  "backend": "SECONDARY"  # Optional
+}
+```
+
+**Intake:**
+```bash
+POST /ingest
+{
+  "session_id": "session-123",
+  "user_msg": "User message",
+  "assistant_msg": "Assistant response"
+}
+```
+
+**Health:**
+```bash
+GET /_health
+```
+
+---
+
+## File Structure
+
+```
+project-lyra/
+├── Dockerfile              # Unified container (Node + Python)
+├── docker-compose.yml      # Single lyra service + UI
+├── start.sh                # Startup script (Cortex → Relay)
+├── .dockerignore
+├── QUICKSTART.md           # Quick reference
+│
+├── core/
+│   └── relay/              # Node.js API gateway
+│       ├── server.js
+│       ├── lib/
+│       │   ├── cortex.js   # Cortex HTTP client
+│       │   └── llm.js      # LLM routing
+│       └── sessions/       # Session storage (volume)
+│
+├── cortex/                 # Python reasoning engine
+│   ├── main.py             # FastAPI app
+│   ├── router.py           # /reason, /simple, /ingest
+│   ├── context.py          # Session context
+│   ├── llm/
+│   │   └── llm_router.py   # Multi-backend LLM routing
+│   ├── intake/
+│   │   └── intake.py       # Summarization module
+│   ├── reasoning/
+│   │   ├── reflection.py
+│   │   ├── reasoning.py
+│   │   └── refine.py
+│   └── persona/
+│       └── speak.py
+│
+└── .nebula_fallback/       # Disk storage until Nebula runs
+    └── {session_id}/
+        ├── L10_*.json
+        ├── L20_*.json
+        └── L30_*.json
+```
+
+---
+
+## Roadmap
+
+### ✅ Phase 1 (Complete)
+- Unified container architecture
+- Multi-level summarization (L1-L30)
+- HTTP client for Nebula (with disk fallback)
+- Session management
+- Dual-mode operation
+
+### 🚧 Phase 2 (In Progress)
+- Build Nebula vector database
+- RAG integration
+- Memory resurfacing based on semantic similarity
+
+### 📋 Phase 3 (Planned)
+- Entity extraction from summaries
+- Topic clustering
+- Automatic knowledge graph generation
+- Temporal memory (what happened when)
 
 ---
 
 ## Troubleshooting
 
-### SESSIONS not persisting
-**Symptom:** Intake buffer always shows 0 exchanges, summaries always empty.
+### Container won't start
+```bash
+# Check logs
+docker-compose logs lyra
 
-**Solution (Fixed in v0.5.1):**
-- Ensure `cortex/intake/__init__.py` exists
-- Check Cortex logs for `[Intake Module Init]` message showing SESSIONS object ID
-- Verify single-worker mode (Dockerfile: `uvicorn main:app --workers 1`)
-- Use `/debug/sessions` endpoint to inspect current state
+# Common issues:
+# - Missing .env file
+# - Invalid LLM backend URLs
+# - Port conflicts (7078, 7081)
+```
 
-### Cortex connection errors
-**Symptom:** Relay can't reach Cortex, 502 errors.
+### Summaries not appearing
+```bash
+# Check Nebula fallback directory
+ls -la .nebula_fallback/
 
-**Solution:**
-- Verify Cortex container is running: `docker ps | grep cortex`
-- Check Cortex health: `curl http://localhost:7081/health`
-- Verify environment variables: `CORTEX_REASON_URL=http://cortex:7081/reason`
-- Check docker network: `docker network inspect lyra_net`
+# Verify Cortex is processing
+docker-compose logs lyra | grep "Nebula"
+```
 
-### LLM backend timeouts
-**Symptom:** Reasoning stage hangs or times out.
+### Sessions not persisting
+```bash
+# Check volume mount
+docker-compose exec lyra ls -la /app/relay/sessions/
 
-**Solution:**
-- Verify LLM backend is running and accessible
-- Check LLM backend health: `curl http://10.0.0.44:8080/health`
-- Increase timeout in llm_router.py if using slow models
-- Check logs for specific backend errors
+# Verify session save calls
+curl http://localhost:7078/sessions
+```
+
+---
+
+## Development
+
+### Making Changes
+
+**Code changes (no rebuild needed, just restart the container):**
+```bash
+docker-compose restart lyra
+```
+
+**Dependency changes (rebuild):**
+```bash
+docker-compose up -d --build lyra
+```
+
+**View logs:**
+```bash
+docker-compose logs -f lyra
+```
+
+### Adding a New LLM Backend
+
+1. Add to `.env`:
+```bash
+LLM_CUSTOM_URL=http://your-backend:port
+LLM_CUSTOM_MODEL=model-name
+```
+
+2. Configure module:
+```bash
+CORTEX_LLM=CUSTOM
+```
+
+3. Restart:
+```bash
+docker-compose restart lyra
+```
+
+---
+
+## Version History
+
+### v1.0.0 (2026-02-23) - The Great Simplification
+**Major Refactor:**
+- ✅ Unified Relay + Cortex into single container
+- ✅ Removed NeoMem (replaced by upcoming Nebula)
+- ✅ Removed old ingest_handler and RAG services
+- ✅ Simplified to core flow: intake → summarize → store
+- ✅ Added HTTP client for Nebula with disk fallback
+- ✅ Cleaned docker-compose (2 services instead of 7)
+- ✅ Updated documentation to reflect new architecture
+
+**Architecture Changes:**
+- Intake now sends summaries to Nebula (HTTP POST)
+- Disk fallback writes JSON files to `.nebula_fallback/`
+- Relay and Cortex communicate via localhost (faster)
+- Single build, single deploy, single log stream
 
 ---
 
 ## License
 
-NeoMem is a derivative work based on Mem0 OSS (Apache 2.0).
-© 2025 Terra-Mechanics / ServersDown Labs. All modifications released under Apache 2.0.
+© 2026 Terra-Mechanics / ServersDown Labs. Licensed under the Apache License 2.0.
 
 **Built with Claude Code**
 
 ---
 
-## Integration Notes
+## Credits
 
-- NeoMem API is compatible with Mem0 OSS endpoints (`/memories`, `/search`)
-- All services communicate via Docker internal networking on the `lyra_net` bridge
-- History and entity graphs are managed via PostgreSQL + Neo4j
-- LLM backends are accessed via HTTP and configured in `.env`
-- Intake module is imported internally by Cortex (no HTTP communication)
-- SESSIONS state is maintained in-memory within Cortex container
+Built by Brian with assistance from Claude (Anthropic).
 
----
-
-## Beta Lyrae - RAG Memory System (Currently Disabled)
-
-**Note:** The RAG service is currently disabled in docker-compose.yml
-
-### Requirements
-- Python 3.10+
-- Dependencies: `chromadb openai tqdm python-dotenv fastapi uvicorn`
-- Persistent storage: `./chromadb` or `/mnt/data/lyra_rag_db`
-
-### Setup
-1. Import chat logs (must be in OpenAI message format):
-   ```bash
-   python3 rag/rag_chat_import.py
-   ```
-
-2. Build and start the RAG API server:
-   ```bash
-   cd rag
-   python3 rag_build.py
-   uvicorn rag_api:app --host 0.0.0.0 --port 7090
-   ```
-
-3. Query the RAG system:
-   ```bash
-   curl -X POST http://127.0.0.1:7090/rag/search \
-     -H "Content-Type: application/json" \
-     -d '{
-       "query": "What is the current state of Cortex?",
-       "where": {"category": "lyra"}
-     }'
-   ```
-
----
-
-## Development Notes
-
-### Cortex Architecture (v0.6.0)
-- Cortex contains embedded Intake module at `cortex/intake/`
-- Intake is imported as: `from intake.intake import add_exchange_internal, SESSIONS`
-- SESSIONS is a module-level global dictionary (singleton pattern)
-- Single-worker constraint required to maintain SESSIONS state
-- Diagnostic endpoints available for debugging: `/debug/sessions`, `/debug/summary`
-- **NEW:** Autonomy system integrated at `cortex/autonomy/`
-  - Executive, decision, action, learning, and monitoring layers
-  - Self-state persistence in `cortex/data/self_state.json`
-  - Coordinated via orchestrator with flexible service controls
-
-### Adding New LLM Backends
-1. Add backend URL to `.env`:
-   ```bash
-   LLM_CUSTOM_URL=http://your-backend:port
-   LLM_CUSTOM_MODEL=model-name
-   ```
-
-2. Configure module to use new backend:
-   ```bash
-   CORTEX_LLM=CUSTOM
-   ```
-
-3. Restart Cortex container:
-   ```bash
-   docker-compose restart cortex
-   ```
-
-### Debugging Tips
-- Enable verbose logging: `VERBOSE_DEBUG=true` in `.env`
-- Check Cortex logs: `docker logs cortex -f`
-- Check Relay logs: `docker logs relay -f`
-- Inspect SESSIONS: `curl http://localhost:7081/debug/sessions`
-- Test summarization: `curl "http://localhost:7081/debug/summary?session_id=test"`
-- List sessions: `curl http://localhost:7078/sessions`
-- Test Standard Mode: `curl -X POST http://localhost:7078/v1/chat/completions -H "Content-Type: application/json" -d '{"mode":"standard","backend":"SECONDARY","messages":[{"role":"user","content":"test"}],"sessionId":"test"}'`
-- Monitor Docker network: `docker network inspect lyra_net`
-- Check session files: `ls -la core/relay/sessions/`
+Special thanks to the open source community:
+- FastAPI
+- Express.js
+- Docker
+- llama.cpp
+- Ollama
diff --git a/core/relay/server.js b/core/relay/server.js
index 9d66ed0..2db743e 100644
--- a/core/relay/server.js
+++ b/core/relay/server.js
@@ -21,9 +21,9 @@ app.use(express.json());
 
 const PORT = Number(process.env.PORT || 7078);
 
-// Cortex endpoints
-const CORTEX_REASON = process.env.CORTEX_REASON_URL || "http://cortex:7081/reason";
-const CORTEX_SIMPLE = process.env.CORTEX_SIMPLE_URL || "http://cortex:7081/simple";
+// Cortex endpoints (localhost since they're in the same container now)
+const CORTEX_REASON = process.env.CORTEX_REASON_URL || "http://localhost:7081/reason";
+const CORTEX_SIMPLE = process.env.CORTEX_SIMPLE_URL || "http://localhost:7081/simple";
 
 // -----------------------------------------------------
 // Helper request wrapper
diff --git a/cortex/ingest/__init__.py b/cortex/ingest/__init__.py
deleted file mode 100644
index 0b058b3..0000000
--- a/cortex/ingest/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-# Ingest module - handles communication with Intake service
diff --git a/cortex/ingest/ingest_handler.py b/cortex/ingest/ingest_handler.py
deleted file mode 100644
index f5da113..0000000
--- a/cortex/ingest/ingest_handler.py
+++ /dev/null
@@ -1,33 +0,0 @@
-# ingest_handler.py
-import os
-import httpx
-
-NEOMEM_URL = os.getenv("NEOMEM_API", "http://nvgram-api:7077")
-
-async def handle_ingest(payload):
-    """
-    Pass user+assistant turns to NeoMem.
-    Minimal version. Does not process or annotate.
-    """
-    data = {
-        "messages": [],
-        "user_id": "brian"   # default for now
-    }
-
-    if payload.user:
-        data["messages"].append({"role": "user", "content": payload.user})
-
-    if payload.assistant:
-        data["messages"].append({"role": "assistant", "content": payload.assistant})
-
-    try:
-        async with httpx.AsyncClient() as client:
-            r = await client.post(
-                f"{NEOMEM_URL}/memories",
-                json=data,
-                timeout=5
-            )
-            if r.status_code != 200:
-                print(f"[Ingest] NeoMem returned {r.status_code}: {r.text}")
-    except Exception as e:
-        print(f"[Ingest] Failed to send to NeoMem: {e}")
diff --git a/cortex/ingest/intake_client.py b/cortex/ingest/intake_client.py
deleted file mode 100644
index f0b1760..0000000
--- a/cortex/ingest/intake_client.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# cortex/intake_client.py
-import os, httpx, logging
-from typing import Dict, Any, Optional
-
-logger = logging.getLogger(__name__)
-
-class IntakeClient:
-    """Handles short-term / episodic summaries from Intake service."""
-
-    def __init__(self):
-        self.base_url = os.getenv("INTAKE_API_URL", "http://intake:7080")
-
-    async def summarize_turn(self, session_id: str, user_msg: str, assistant_msg: Optional[str] = None) -> Dict[str, Any]:
-        """
-        DEPRECATED: Intake v0.2 removed the /summarize endpoint.
-        Use add_exchange() instead, which auto-summarizes in the background.
-        This method is kept for backwards compatibility but will fail.
-        """
-        payload = {
-            "session_id": session_id,
-            "turns": [{"role": "user", "content": user_msg}]
-        }
-        if assistant_msg:
-            payload["turns"].append({"role": "assistant", "content": assistant_msg})
-
-        async with httpx.AsyncClient(timeout=30) as client:
-            try:
-                r = await client.post(f"{self.base_url}/summarize", json=payload)
-                r.raise_for_status()
-                return r.json()
-            except Exception as e:
-                logger.warning(f"Intake summarize_turn failed (endpoint removed in v0.2): {e}")
-                return {}
-
-    async def get_context(self, session_id: str) -> str:
-        """Get summarized context for a session from Intake."""
-        async with httpx.AsyncClient(timeout=15) as client:
-            try:
-                r = await client.get(f"{self.base_url}/summaries", params={"session_id": session_id})
-                r.raise_for_status()
-                data = r.json()
-                return data.get("summary_text", "")
-            except Exception as e:
-                logger.warning(f"Intake get_context failed: {e}")
-                return ""
diff --git a/cortex/intake/intake.py b/cortex/intake/intake.py
index da3e973..6ab9b4f 100644
--- a/cortex/intake/intake.py
+++ b/cortex/intake/intake.py
@@ -33,8 +33,8 @@ INTAKE_LLM = os.getenv("INTAKE_LLM", "PRIMARY").upper()
 SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "200"))
 SUMMARY_TEMPERATURE = float(os.getenv("SUMMARY_TEMPERATURE", "0.3"))
 
-NEOMEM_API = os.getenv("NEOMEM_API")
-NEOMEM_KEY = os.getenv("NEOMEM_KEY")
+NEBULA_API = os.getenv("NEBULA_API", "http://localhost:7090")
+NEBULA_KEY = os.getenv("NEBULA_KEY")
 
 # ─────────────────────────────
 # Internal history for L10/L20/L30
@@ -120,7 +120,7 @@ async def summarize_L5(buf: List[Dict[str, Any]]) -> str:
 
 
 async def summarize_L10(session_id: str, buf: List[Dict[str, Any]]) -> str:
-    # “Reality Check” for last 10 exchanges
+    # "Reality Check" for last 10 exchanges
     text = _format_exchanges(buf[-10:])
 
     prompt = f"""
@@ -138,6 +138,9 @@ Reality Check:
     L10_HISTORY.setdefault(session_id, [])
     L10_HISTORY[session_id].append(summary)
 
+    # Send to Nebula
+    await send_to_nebula(summary, session_id, "L10")
+
     return summary
 
 
@@ -165,6 +168,9 @@ Overview:
     L20_HISTORY.setdefault(session_id, [])
     L20_HISTORY[session_id].append(summary)
 
+    # Send to Nebula
+    await send_to_nebula(summary, session_id, "L20")
+
     return summary
 
 
@@ -187,45 +193,77 @@ noting major themes, persistent goals, and shifts.
 
 Continuity Report:
 """
-    return await _llm(prompt)
+    summary = await _llm(prompt)
+
+    # Send to Nebula
+    await send_to_nebula(summary, session_id, "L30")
+
+    return summary
 
 
 # ─────────────────────────────
-# NeoMem push
+# Nebula push
 # ─────────────────────────────
 
-def push_to_neomem(summary: str, session_id: str, level: str) -> None:
+async def send_to_nebula(summary: str, session_id: str, level: str) -> None:
     """
-    Fire-and-forget push of a summary into NeoMem.
+    Send summary to Nebula vector memory system.
+    Falls back to disk storage if Nebula is not available.
     """
-    if not NEOMEM_API or not summary:
+    if not summary:
         return
 
-    headers = {"Content-Type": "application/json"}
-    if NEOMEM_KEY:
-        headers["Authorization"] = f"Bearer {NEOMEM_KEY}"
-
     payload = {
-        "messages": [{"role": "assistant", "content": summary}],
-        "user_id": "brian",
-        "metadata": {
-            "source": "intake",
-            "session_id": session_id,
-            "level": level,
-        },
+        "summary": summary,
+        "session_id": session_id,
+        "level": level,
+        "timestamp": datetime.now().isoformat(),
+        "source": "intake",
     }
 
+    # Try HTTP POST to Nebula first
     try:
-        import requests
-        requests.post(
-            f"{NEOMEM_API}/memories",
-            json=payload,
-            headers=headers,
-            timeout=20,
-        ).raise_for_status()
-        print(f"🧠 NeoMem updated ({level}) for {session_id}")
+        import httpx
+        headers = {"Content-Type": "application/json"}
+        if NEBULA_KEY:
+            headers["Authorization"] = f"Bearer {NEBULA_KEY}"
+
+        async with httpx.AsyncClient() as client:
+            response = await client.post(
+                f"{NEBULA_API}/summaries",
+                json=payload,
+                headers=headers,
+                timeout=10.0,
+            )
+            response.raise_for_status()
+            print(f"🌌 Nebula updated ({level}) for {session_id}")
+            return
+
     except Exception as e:
-        print(f"NeoMem push failed ({level}, {session_id}): {e}")
+        print(f"⚠️  Nebula unavailable, falling back to disk: {e}")
+
+    # Fallback: Write to disk
+    try:
+        fallback_dir = os.path.join(os.path.dirname(__file__), "../../.nebula_fallback")
+        os.makedirs(fallback_dir, exist_ok=True)
+
+        # Create session directory
+        session_dir = os.path.join(fallback_dir, session_id)
+        os.makedirs(session_dir, exist_ok=True)
+
+        # Write summary to timestamped file
+        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+        filename = f"{level}_{timestamp}.json"
+        filepath = os.path.join(session_dir, filename)
+
+        import json
+        with open(filepath, "w") as f:
+            json.dump(payload, f, indent=2)
+
+        print(f"💾 Saved to disk: {filepath}")
+
+    except Exception as e:
+        print(f"❌ Failed to save summary to disk: {e}")
 
 
 # ─────────────────────────────
diff --git a/cortex/llm/llm_router.py b/cortex/llm/llm_router.py
index d929539..08deed4 100644
--- a/cortex/llm/llm_router.py
+++ b/cortex/llm/llm_router.py
@@ -1,15 +1,15 @@
 # llm_router.py
+
 import os
 import httpx
 import json
 import logging
 from typing import Optional, List, Dict
-from autonomy.tools.adapters import OpenAIAdapter, OllamaAdapter, LlamaCppAdapter
 
 logger = logging.getLogger(__name__)
 
 # ------------------------------------------------------------
-# Load backend registry from root .env
+# Backend Configuration
 # ------------------------------------------------------------
 
 BACKENDS = {
@@ -38,50 +38,25 @@ BACKENDS = {
 
 DEFAULT_BACKEND = "PRIMARY"
 
-# Reusable async HTTP client
 http_client = httpx.AsyncClient(timeout=120.0)
 
-# Tool adapters for each backend
-TOOL_ADAPTERS = {
-    "OPENAI": OpenAIAdapter(),
-    "OLLAMA": OllamaAdapter(),
-    "MI50": LlamaCppAdapter(),  # MI50 uses llama.cpp
-    "PRIMARY": None,  # Determined at runtime
-    "SECONDARY": None,  # Determined at runtime
-    "FALLBACK": None,  # Determined at runtime
-}
-
-
 # ------------------------------------------------------------
-# Public call
+# Public LLM Call
 # ------------------------------------------------------------
+
 async def call_llm(
-    prompt: str = None,
-    messages: list = None,
-    backend: str | None = None,
+    prompt: Optional[str] = None,
+    messages: Optional[List[Dict]] = None,
+    backend: Optional[str] = None,
     temperature: float = 0.7,
     max_tokens: int = 512,
-    tools: Optional[List[Dict]] = None,
-    tool_choice: Optional[str] = None,
-    return_adapter_response: bool = False,
 ):
     """
-    Call an LLM backend with optional tool calling support.
-
-    Args:
-        prompt: String prompt (for completion-style APIs like mi50)
-        messages: List of message dicts (for chat-style APIs like Ollama/OpenAI)
-        backend: Which backend to use (PRIMARY, SECONDARY, OPENAI, etc.)
-        temperature: Sampling temperature
-        max_tokens: Maximum tokens to generate
-        tools: List of Lyra tool definitions (provider-agnostic)
-        tool_choice: How to use tools ("auto", "required", "none")
-        return_adapter_response: If True, return dict with content and tool_calls
-
-    Returns:
-        str (default) or dict (if return_adapter_response=True):
-            {"content": str, "tool_calls": [...] or None}
+    Simple LLM call.
+    Supports: ollama, mi50 (llama.cpp), openai.
+    Returns plain text response.
     """
+
     backend = (backend or DEFAULT_BACKEND).upper()
 
     if backend not in BACKENDS:
@@ -95,207 +70,96 @@ async def call_llm(
     if not url or not model:
         raise RuntimeError(f"Backend '{backend}' missing url/model in env")
 
-    # If tools are requested, use adapter to prepare request
-    if tools:
-        # Get adapter for this backend
-        adapter = TOOL_ADAPTERS.get(backend)
+    # Convert prompt → messages if needed
+    if not messages:
+        messages = [{"role": "user", "content": prompt or ""}]
 
-        # For PRIMARY/SECONDARY/FALLBACK, determine adapter based on provider
-        if adapter is None and backend in ["PRIMARY", "SECONDARY", "FALLBACK"]:
-            if provider == "openai":
-                adapter = TOOL_ADAPTERS["OPENAI"]
-            elif provider == "ollama":
-                adapter = TOOL_ADAPTERS["OLLAMA"]
-            elif provider == "mi50":
-                adapter = TOOL_ADAPTERS["MI50"]
-
-        if adapter:
-            # Use messages array if provided, otherwise convert prompt to messages
-            if not messages:
-                messages = [{"role": "user", "content": prompt}]
-
-            # Prepare request through adapter
-            adapted_request = await adapter.prepare_request(messages, tools, tool_choice)
-            messages = adapted_request["messages"]
-
-            # Extract tools in provider format if present
-            provider_tools = adapted_request.get("tools")
-            provider_tool_choice = adapted_request.get("tool_choice")
-        else:
-            logger.warning(f"No adapter available for backend {backend}, ignoring tools")
-            provider_tools = None
-            provider_tool_choice = None
-    else:
-        provider_tools = None
-        provider_tool_choice = None
-
-    # -------------------------------
-    # Provider: MI50 (llama.cpp server)
-    # -------------------------------
-    if provider == "mi50":
-        # If tools requested, convert messages to prompt with tool instructions
-        if messages and tools:
-            # Combine messages into a prompt
-            prompt_parts = []
-            for msg in messages:
-                role = msg.get("role", "user")
-                content = msg.get("content", "")
-                prompt_parts.append(f"{role.capitalize()}: {content}")
-            prompt = "\n".join(prompt_parts) + "\nAssistant:"
-
-        payload = {
-            "prompt": prompt,
-            "n_predict": max_tokens,
-            "temperature": temperature,
-            "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
-        }
-        try:
-            r = await http_client.post(f"{url}/completion", json=payload)
-            r.raise_for_status()
-            data = r.json()
-            response_content = data.get("content", "")
-
-            # If caller wants adapter response with tool calls, parse and return
-            if return_adapter_response and tools:
-                adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["MI50"]
-                return await adapter.parse_response(response_content)
-            else:
-                return response_content
-
-        except httpx.HTTPError as e:
-            logger.error(f"HTTP error calling mi50: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"LLM API error (mi50): {type(e).__name__}: {str(e)}")
-        except (KeyError, json.JSONDecodeError) as e:
-            logger.error(f"Response parsing error from mi50: {e}")
-            raise RuntimeError(f"Invalid response format (mi50): {e}")
-        except Exception as e:
-            logger.error(f"Unexpected error calling mi50: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"Unexpected error (mi50): {type(e).__name__}: {str(e)}")
-
-    # -------------------------------
-    # Provider: OLLAMA (your 3090)
-    # -------------------------------
-    logger.info(f"🔍 LLM Router: provider={provider}, checking if ollama...")
+    # ------------------------------------------------------------
+    # OLLAMA
+    # ------------------------------------------------------------
     if provider == "ollama":
-        logger.info(f"🔍 LLM Router: Matched ollama provider, tools={bool(tools)}, return_adapter_response={return_adapter_response}")
-        # Use messages array if provided, otherwise convert prompt to single user message
-        if messages:
-            chat_messages = messages
-        else:
-            chat_messages = [{"role": "user", "content": prompt}]
-
         payload = {
             "model": model,
-            "messages": chat_messages,
+            "messages": messages,
             "stream": False,
             "options": {
                 "temperature": temperature,
                 "num_predict": max_tokens
             }
         }
+
         try:
             r = await http_client.post(f"{url}/api/chat", json=payload)
             r.raise_for_status()
             data = r.json()
-            response_content = data["message"]["content"]
+            return data["message"]["content"]
 
-            # If caller wants adapter response with tool calls, parse and return
-            if return_adapter_response and tools:
-                logger.info(f"🔍 Ollama: return_adapter_response=True, calling adapter.parse_response")
-                adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["OLLAMA"]
-                logger.info(f"🔍 Ollama: Using adapter {adapter.__class__.__name__}")
-                result = await adapter.parse_response(response_content)
-                logger.info(f"🔍 Ollama: Adapter returned {result}")
-                return result
-            else:
-                return response_content
-
-        except httpx.HTTPError as e:
-            logger.error(f"HTTP error calling ollama: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"LLM API error (ollama): {type(e).__name__}: {str(e)}")
-        except (KeyError, json.JSONDecodeError) as e:
-            logger.error(f"Response parsing error from ollama: {e}")
-            raise RuntimeError(f"Invalid response format (ollama): {e}")
         except Exception as e:
-            logger.error(f"Unexpected error calling ollama: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"Unexpected error (ollama): {type(e).__name__}: {str(e)}")
+            logger.error(f"Ollama error: {e}")
+            raise RuntimeError(f"Ollama API error: {e}")
 
+    # ------------------------------------------------------------
+    # MI50 (llama.cpp server)
+    # ------------------------------------------------------------
+    if provider == "mi50":
 
-    # -------------------------------
-    # Provider: OPENAI
-    # -------------------------------
+        # Flatten messages into a "Role: content" transcript that ends with "Assistant:"
+        prompt_parts = []
+        for msg in messages:
+            role = msg.get("role", "user")
+            content = msg.get("content", "")
+            prompt_parts.append(f"{role.capitalize()}: {content}")
+        full_prompt = "\n".join(prompt_parts) + "\nAssistant:"
+
+        payload = {
+            "prompt": full_prompt,
+            "n_predict": max_tokens,
+            "temperature": temperature,
+            "stop": ["User:", "\nUser:", "Assistant:", "\n\n\n"]
+        }
+
+        try:
+            r = await http_client.post(f"{url}/completion", json=payload)
+            r.raise_for_status()
+            data = r.json()
+            return data.get("content", "")
+
+        except Exception as e:
+            logger.error(f"MI50 error: {e}")
+            raise RuntimeError(f"MI50 API error: {e}")
+
+    # ------------------------------------------------------------
+    # OPENAI
+    # ------------------------------------------------------------
     if provider == "openai":
+
         headers = {
-            "Authorization": f"Bearer {cfg['api_key']}",
+            "Authorization": f"Bearer {cfg.get('api_key')}",
             "Content-Type": "application/json"
         }
 
-        # Use messages array if provided, otherwise convert prompt to single user message
-        if messages:
-            chat_messages = messages
-        else:
-            chat_messages = [{"role": "user", "content": prompt}]
-
         payload = {
             "model": model,
-            "messages": chat_messages,
+            "messages": messages,
             "temperature": temperature,
             "max_tokens": max_tokens,
         }
 
-        # Add tools if available (OpenAI native function calling)
-        if provider_tools:
-            payload["tools"] = provider_tools
-            if provider_tool_choice:
-                payload["tool_choice"] = provider_tool_choice
-
         try:
-            r = await http_client.post(f"{url}/chat/completions", json=payload, headers=headers)
+            r = await http_client.post(
+                f"{url}/chat/completions",
+                json=payload,
+                headers=headers
+            )
             r.raise_for_status()
             data = r.json()
+            return data["choices"][0]["message"]["content"]
 
-            # If caller wants adapter response with tool calls, parse and return
-            if return_adapter_response and tools:
-                # Create mock response object for adapter
-                class MockChoice:
-                    def __init__(self, message_data):
-                        self.message = type('obj', (object,), {})()
-                        self.message.content = message_data.get("content")
-                        # Convert tool_calls dicts to objects
-                        raw_tool_calls = message_data.get("tool_calls")
-                        if raw_tool_calls:
-                            self.message.tool_calls = []
-                            for tc in raw_tool_calls:
-                                tool_call_obj = type('obj', (object,), {})()
-                                tool_call_obj.id = tc.get("id")
-                                tool_call_obj.function = type('obj', (object,), {})()
-                                tool_call_obj.function.name = tc.get("function", {}).get("name")
-                                tool_call_obj.function.arguments = tc.get("function", {}).get("arguments")
-                                self.message.tool_calls.append(tool_call_obj)
-                        else:
-                            self.message.tool_calls = None
-
-                class MockResponse:
-                    def __init__(self, data):
-                        self.choices = [MockChoice(data["choices"][0]["message"])]
-
-                mock_resp = MockResponse(data)
-                adapter = TOOL_ADAPTERS.get(backend) or TOOL_ADAPTERS["OPENAI"]
-                return await adapter.parse_response(mock_resp)
-            else:
-                return data["choices"][0]["message"]["content"]
-
-        except httpx.HTTPError as e:
-            logger.error(f"HTTP error calling openai: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"LLM API error (openai): {type(e).__name__}: {str(e)}")
-        except (KeyError, json.JSONDecodeError) as e:
-            logger.error(f"Response parsing error from openai: {e}")
-            raise RuntimeError(f"Invalid response format (openai): {e}")
         except Exception as e:
-            logger.error(f"Unexpected error calling openai: {type(e).__name__}: {str(e)}")
-            raise RuntimeError(f"Unexpected error (openai): {type(e).__name__}: {str(e)}")
+            logger.error(f"OpenAI error: {e}")
+            raise RuntimeError(f"OpenAI API error: {e}")
 
-    # -------------------------------
-    # Unknown provider
-    # -------------------------------
-    raise RuntimeError(f"Provider '{provider}' not implemented.")
+    # ------------------------------------------------------------
+    # Unknown Provider
+    # ------------------------------------------------------------
+    raise RuntimeError(f"Provider '{provider}' not implemented.")
\ No newline at end of file
diff --git a/cortex/main.py b/cortex/main.py
index 5ff9c92..c4da4c4 100644
--- a/cortex/main.py
+++ b/cortex/main.py
@@ -13,4 +13,9 @@ app.add_middleware(
     allow_headers=["*"],
 )
 
+# Health check endpoint: returns a static {"status": "ok"} payload
+@app.get("/_health")
+async def health_check():
+    return {"status": "ok"}
+
 app.include_router(cortex_router)
\ No newline at end of file
diff --git a/cortex/router.py b/cortex/router.py
index 4e7ff30..112c87f 100644
--- a/cortex/router.py
+++ b/cortex/router.py
@@ -6,21 +6,8 @@ import asyncio
 from fastapi import APIRouter
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
-
-from reasoning.reasoning import reason_check
-from reasoning.reflection import reflect_notes
-from reasoning.refine import refine_answer
-from persona.speak import speak
-from persona.identity import load_identity
-from context import collect_context, update_last_assistant_message
 from intake.intake import add_exchange_internal
 
-from autonomy.monologue.monologue import InnerMonologue
-from autonomy.self.state import load_self_state
-from autonomy.tools.stream_events import get_stream_manager
-
-
-# -------------------------------------------------------------------
 # Setup
 # -------------------------------------------------------------------
 LOG_DETAIL_LEVEL = os.getenv("LOG_DETAIL_LEVEL", "summary").lower()
@@ -35,10 +22,7 @@ console_handler.setFormatter(logging.Formatter(
 ))
 logger.addHandler(console_handler)
 
-
 cortex_router = APIRouter()
-inner_monologue = InnerMonologue()
-
 
 # -------------------------------------------------------------------
 # Models
@@ -49,292 +33,6 @@ class ReasonRequest(BaseModel):
     temperature: float | None = None
     backend: str | None = None
 
-
-# -------------------------------------------------------------------
-# /reason endpoint
-# -------------------------------------------------------------------
-@cortex_router.post("/reason")
-async def run_reason(req: ReasonRequest):
-    from datetime import datetime
-    pipeline_start = datetime.now()
-    stage_timings = {}
-
-    # Show pipeline start in detailed/verbose mode
-    if LOG_DETAIL_LEVEL in ["detailed", "verbose"]:
-        logger.info(f"\n{'='*100}")
-        logger.info(f"🚀 PIPELINE START | Session: {req.session_id} | {datetime.now().strftime('%H:%M:%S.%f')[:-3]}")
-        logger.info(f"{'='*100}")
-        logger.info(f"📝 User: {req.user_prompt[:150]}...")
-        logger.info(f"{'-'*100}\n")
-
-    # ----------------------------------------------------------------
-    # STAGE 0 — Context
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    context_state = await collect_context(req.session_id, req.user_prompt)
-    stage_timings["context"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 0.5 — Identity
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    identity_block = load_identity(req.session_id)
-    stage_timings["identity"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 0.6 — Inner Monologue (observer-only)
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-
-    inner_result = None
-    try:
-        self_state = load_self_state()
-
-        mono_context = {
-            "user_message": req.user_prompt,
-            "session_id": req.session_id,
-            "self_state": self_state,
-            "context_summary": context_state,
-        }
-
-        inner_result = await inner_monologue.process(mono_context)
-        logger.info(f"🧠 Monologue | {inner_result.get('intent', 'unknown')} | Tone: {inner_result.get('tone', 'neutral')}")
-
-        # Store in context for downstream use
-        context_state["monologue"] = inner_result
-
-    except Exception as e:
-        logger.warning(f"⚠️  Monologue failed: {e}")
-
-    stage_timings["monologue"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 0.7 — Executive Planning (conditional)
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    executive_plan = None
-    if inner_result and inner_result.get("consult_executive"):
-
-        try:
-            from autonomy.executive.planner import plan_execution
-            executive_plan = await plan_execution(
-                user_prompt=req.user_prompt,
-                intent=inner_result.get("intent", "unknown"),
-                context_state=context_state,
-                identity_block=identity_block
-            )
-            logger.info(f"🎯 Executive plan: {executive_plan.get('summary', 'N/A')[:80]}...")
-        except Exception as e:
-            logger.warning(f"⚠️  Executive planning failed: {e}")
-            executive_plan = None
-
-    stage_timings["executive"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 0.8 — Autonomous Tool Invocation
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    tool_results = None
-    autonomous_enabled = os.getenv("ENABLE_AUTONOMOUS_TOOLS", "true").lower() == "true"
-    tool_confidence_threshold = float(os.getenv("AUTONOMOUS_TOOL_CONFIDENCE_THRESHOLD", "0.6"))
-
-    if autonomous_enabled and inner_result:
-
-        try:
-            from autonomy.tools.decision_engine import ToolDecisionEngine
-            from autonomy.tools.orchestrator import ToolOrchestrator
-
-            # Analyze which tools to invoke
-            decision_engine = ToolDecisionEngine()
-            tool_decision = await decision_engine.analyze_tool_needs(
-                user_prompt=req.user_prompt,
-                monologue=inner_result,
-                context_state=context_state,
-                available_tools=["RAG", "WEB", "WEATHER", "CODEBRAIN"]
-            )
-
-            # Execute tools if confidence threshold met
-            if tool_decision["should_invoke_tools"] and tool_decision["confidence"] >= tool_confidence_threshold:
-                orchestrator = ToolOrchestrator(tool_timeout=30)
-                tool_results = await orchestrator.execute_tools(
-                    tools_to_invoke=tool_decision["tools_to_invoke"],
-                    context_state=context_state
-                )
-
-                # Format results for context injection
-                tool_context = orchestrator.format_results_for_context(tool_results)
-                context_state["autonomous_tool_results"] = tool_context
-
-                summary = tool_results.get("execution_summary", {})
-                logger.info(f"🛠️  Tools executed: {summary.get('successful', [])} succeeded")
-            else:
-                logger.info(f"🛠️  No tools invoked (confidence: {tool_decision.get('confidence', 0):.2f})")
-
-        except Exception as e:
-            logger.warning(f"⚠️  Autonomous tool invocation failed: {e}")
-            if LOG_DETAIL_LEVEL == "verbose":
-                import traceback
-                traceback.print_exc()
-
-    stage_timings["tools"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 1-5 — Core Reasoning Pipeline
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-
-    # Extract intake summary
-    intake_summary = "(no context available)"
-    if context_state.get("intake"):
-        l20 = context_state["intake"].get("L20")
-        if isinstance(l20, dict):
-            intake_summary = l20.get("summary", intake_summary)
-        elif isinstance(l20, str):
-            intake_summary = l20
-
-    # Reflection
-    try:
-        reflection = await reflect_notes(intake_summary, identity_block=identity_block)
-        reflection_notes = reflection.get("notes", [])
-    except Exception as e:
-        reflection_notes = []
-        logger.warning(f"⚠️  Reflection failed: {e}")
-
-    stage_timings["reflection"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # Reasoning (draft)
-    stage_start = datetime.now()
-    draft = await reason_check(
-        req.user_prompt,
-        identity_block=identity_block,
-        rag_block=context_state.get("rag", []),
-        reflection_notes=reflection_notes,
-        context=context_state,
-        monologue=inner_result,
-        executive_plan=executive_plan
-    )
-    stage_timings["reasoning"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # Refinement
-    stage_start = datetime.now()
-    result = await refine_answer(
-        draft_output=draft,
-        reflection_notes=reflection_notes,
-        identity_block=identity_block,
-        rag_block=context_state.get("rag", []),
-    )
-    final_neutral = result["final_output"]
-    stage_timings["refinement"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # Persona
-    stage_start = datetime.now()
-    tone = inner_result.get("tone", "neutral") if inner_result else "neutral"
-    depth = inner_result.get("depth", "medium") if inner_result else "medium"
-    persona_answer = await speak(final_neutral, tone=tone, depth=depth)
-    stage_timings["persona"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 6 — Session update
-    # ----------------------------------------------------------------
-    update_last_assistant_message(req.session_id, persona_answer)
-
-    # ----------------------------------------------------------------
-    # STAGE 6.5 — Self-state update & Pattern Learning
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    try:
-        from autonomy.self.analyzer import analyze_and_update_state
-        await analyze_and_update_state(
-            monologue=inner_result or {},
-            user_prompt=req.user_prompt,
-            response=persona_answer,
-            context=context_state
-        )
-    except Exception as e:
-        logger.warning(f"⚠️  Self-state update failed: {e}")
-
-    try:
-        from autonomy.learning.pattern_learner import get_pattern_learner
-        learner = get_pattern_learner()
-        await learner.learn_from_interaction(
-            user_prompt=req.user_prompt,
-            response=persona_answer,
-            monologue=inner_result or {},
-            context=context_state
-        )
-    except Exception as e:
-        logger.warning(f"⚠️  Pattern learning failed: {e}")
-
-    stage_timings["learning"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # STAGE 7 — Proactive Monitoring & Suggestions
-    # ----------------------------------------------------------------
-    stage_start = datetime.now()
-    proactive_enabled = os.getenv("ENABLE_PROACTIVE_MONITORING", "true").lower() == "true"
-    proactive_min_priority = float(os.getenv("PROACTIVE_SUGGESTION_MIN_PRIORITY", "0.6"))
-
-    if proactive_enabled:
-        try:
-            from autonomy.proactive.monitor import get_proactive_monitor
-
-            monitor = get_proactive_monitor(min_priority=proactive_min_priority)
-            self_state = load_self_state()
-
-            suggestion = await monitor.analyze_session(
-                session_id=req.session_id,
-                context_state=context_state,
-                self_state=self_state
-            )
-
-            if suggestion:
-                suggestion_text = monitor.format_suggestion(suggestion)
-                persona_answer += suggestion_text
-                logger.info(f"💡 Proactive suggestion: {suggestion['type']} (priority: {suggestion['priority']:.2f})")
-
-        except Exception as e:
-            logger.warning(f"⚠️  Proactive monitoring failed: {e}")
-
-    stage_timings["proactive"] = (datetime.now() - stage_start).total_seconds() * 1000
-
-    # ----------------------------------------------------------------
-    # PIPELINE COMPLETE — Summary
-    # ----------------------------------------------------------------
-    total_duration = (datetime.now() - pipeline_start).total_seconds() * 1000
-
-    # Always show pipeline completion
-    logger.info(f"\n{'='*100}")
-    logger.info(f"✨ PIPELINE COMPLETE | Session: {req.session_id} | Total: {total_duration:.0f}ms")
-    logger.info(f"{'='*100}")
-
-    # Show timing breakdown in detailed/verbose mode
-    if LOG_DETAIL_LEVEL in ["detailed", "verbose"]:
-        logger.info("⏱️  Stage Timings:")
-        for stage, duration in stage_timings.items():
-            pct = (duration / total_duration) * 100 if total_duration > 0 else 0
-            logger.info(f"   {stage:15s}: {duration:6.0f}ms ({pct:5.1f}%)")
-
-    logger.info(f"📤 Output: {len(persona_answer)} chars")
-    logger.info(f"{'='*100}\n")
-
-    # ----------------------------------------------------------------
-    # RETURN
-    # ----------------------------------------------------------------
-    return {
-        "draft": draft,
-        "neutral": final_neutral,
-        "persona": persona_answer,
-        "reflection": reflection_notes,
-        "session_id": req.session_id,
-        "context_summary": {
-            "rag_results": len(context_state.get("rag", [])),
-            "minutes_since_last": context_state.get("minutes_since_last_msg"),
-            "message_count": context_state.get("message_count"),
-            "mode": context_state.get("mode"),
-        }
-    }
-
-
 # -------------------------------------------------------------------
 # /simple endpoint - Standard chatbot mode (no reasoning pipeline)
 # -------------------------------------------------------------------
@@ -346,7 +44,6 @@ async def run_simple(req: ReasonRequest):
     """
     from datetime import datetime
     from llm.llm_router import call_llm
-    from autonomy.tools.function_caller import FunctionCaller
 
     start_time = datetime.now()
 
@@ -356,9 +53,6 @@ async def run_simple(req: ReasonRequest):
     logger.info(f"📝 User: {req.user_prompt[:150]}...")
     logger.info(f"{'-'*100}\n")
 
-    # Get conversation history from context and intake buffer
-    context_state = await collect_context(req.session_id, req.user_prompt)
-
     # Get recent messages from Intake buffer
     from intake.intake import get_recent_messages
     recent_msgs = get_recent_messages(req.session_id, limit=20)
@@ -400,31 +94,10 @@ async def run_simple(req: ReasonRequest):
 
     temperature = req.temperature if req.temperature is not None else 0.7
 
-    # Check if tools are enabled
-    enable_tools = os.getenv("STANDARD_MODE_ENABLE_TOOLS", "false").lower() == "true"
+
 
     # Call LLM with or without tools
     try:
-        if enable_tools:
-            # Use FunctionCaller for tool-enabled conversation
-            logger.info(f"🛠️  Tool calling enabled for Standard Mode")
-            logger.info(f"🔍 Creating FunctionCaller with backend={backend}, temp={temperature}")
-            function_caller = FunctionCaller(backend, temperature)
-            logger.info(f"🔍 FunctionCaller created, calling call_with_tools...")
-            result = await function_caller.call_with_tools(
-                messages=messages,
-                max_tokens=2048,
-                session_id=req.session_id  # Pass session_id for streaming
-            )
-            logger.info(f"🔍 call_with_tools returned: iterations={result.get('iterations')}, tool_calls={len(result.get('tool_calls', []))}")
-
-            # Log tool usage
-            if result.get("tool_calls"):
-                tool_names = [tc["name"] for tc in result["tool_calls"]]
-                logger.info(f"🔧 Tools used: {', '.join(tool_names)} ({result['iterations']} iterations)")
-
-            response = result["content"].strip()
-        else:
             # Direct LLM call without tools (original behavior)
             raw_response = await call_llm(
                 messages=messages,
@@ -440,7 +113,6 @@ async def run_simple(req: ReasonRequest):
 
     # Update session with the exchange
     try:
-        update_last_assistant_message(req.session_id, response)
         add_exchange_internal({
             "session_id": req.session_id,
             "role": "user",
@@ -473,64 +145,6 @@ async def run_simple(req: ReasonRequest):
         }
     }
 
-
-# -------------------------------------------------------------------
-# /stream/thinking endpoint - SSE stream for "show your work"
-# -------------------------------------------------------------------
-@cortex_router.get("/stream/thinking/{session_id}")
-async def stream_thinking(session_id: str):
-    """
-    Server-Sent Events stream for tool calling "show your work" feature.
-
-    Streams real-time updates about:
-    - Thinking/planning steps
-    - Tool calls being made
-    - Tool execution results
-    - Final completion
-    """
-    stream_manager = get_stream_manager()
-    queue = stream_manager.subscribe(session_id)
-
-    async def event_generator():
-        try:
-            # Send initial connection message
-            import json
-            connected_event = json.dumps({"type": "connected", "session_id": session_id})
-            yield f"data: {connected_event}\n\n"
-
-            while True:
-                # Wait for events with timeout to send keepalive
-                try:
-                    event = await asyncio.wait_for(queue.get(), timeout=30.0)
-
-                    # Format as SSE
-                    event_data = json.dumps(event)
-                    yield f"data: {event_data}\n\n"
-
-                    # If it's a "done" event, close the stream
-                    if event.get("type") == "done":
-                        break
-
-                except asyncio.TimeoutError:
-                    # Send keepalive comment
-                    yield ": keepalive\n\n"
-
-        except asyncio.CancelledError:
-            logger.info(f"Stream cancelled for session {session_id}")
-        finally:
-            stream_manager.unsubscribe(session_id, queue)
-
-    return StreamingResponse(
-        event_generator(),
-        media_type="text/event-stream",
-        headers={
-            "Cache-Control": "no-cache",
-            "Connection": "keep-alive",
-            "X-Accel-Buffering": "no"  # Disable nginx buffering
-        }
-    )
-
-
 # -------------------------------------------------------------------
 # /ingest endpoint (internal)
 # -------------------------------------------------------------------
@@ -542,11 +156,6 @@ class IngestPayload(BaseModel):
 
 @cortex_router.post("/ingest")
 async def ingest(payload: IngestPayload):
-    try:
-        update_last_assistant_message(payload.session_id, payload.assistant_msg)
-    except Exception as e:
-        logger.warning(f"[INGEST] Session update failed: {e}")
-
     try:
         add_exchange_internal({
             "session_id": payload.session_id,
diff --git a/docker-compose.yml b/docker-compose.yml
index 04b965f..71e3384 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -3,101 +3,40 @@ networks:
     driver: bridge
 
 volumes:
-  postgres_data:
+  nebula_fallback:
     driver: local
-  neo4j_data:
-    driver: local
-  code_executions:
+  relay_sessions:
     driver: local
 
 services:
 
-  # # ============================================================
-  # # NeoMem: Postgres
-  # # ============================================================
-  # neomem-postgres:
-  #   image: ankane/pgvector:v0.5.1
-  #   container_name: neomem-postgres
-  #   restart: unless-stopped
-  #   environment:
-  #     POSTGRES_USER: neomem
-  #     POSTGRES_PASSWORD: neomempass
-  #     POSTGRES_DB: neomem
-  #   volumes:
-  #     - ./volumes/postgres_data:/var/lib/postgresql/data
-  #   ports:
-  #     - "5432:5432"
-  #   healthcheck:
-  #     test: ["CMD-SHELL", "pg_isready -U neomem -d neomem || exit 1"]
-  #     interval: 5s
-  #     timeout: 5s
-  #     retries: 10
-  #   networks:
-  #     - lyra_net
-
-  # # ============================================================
-  # # NeoMem: Neo4j Graph
-  # # ============================================================
-  # neomem-neo4j:
-  #   image: neo4j:5
-  #   container_name: neomem-neo4j
-  #   restart: unless-stopped
-  #   environment:
-  #     NEO4J_AUTH: "neo4j/neomemgraph"
-  #     NEO4JLABS_PLUGINS: '["graph-data-science"]'
-  #   volumes:
-  #     - ./volumes/neo4j_data:/data
-  #   ports:
-  #     - "7474:7474"
-  #     - "7687:7687"
-  #   healthcheck:
-  #     test: ["CMD-SHELL", "cypher-shell -u neo4j -p neomemgraph 'RETURN 1' || exit 1"]
-  #     interval: 10s
-  #     timeout: 10s
-  #     retries: 10
-  #   networks:
-  #     - lyra_net
-
   # ============================================================
-  # NeoMem API
+  # Lyra (Unified: Relay + Cortex + Intake)
   # ============================================================
-  # neomem-api:
-  #   build:
-  #     context: ./neomem
-  #   image: lyra-neomem:latest
-  #   container_name: neomem-api
-  #   restart: unless-stopped
-  #   env_file:
-  #     - ./neomem/.env
-  #     - ./.env
-  #   volumes:
-  #     - ./neomem_history:/app/history
-  #   ports:
-  #     - "7077:7077"
-  #   depends_on:
-  #     neomem-postgres:
-  #       condition: service_healthy
-  #     neomem-neo4j:
-  #       condition: service_healthy
-  #   networks:
-  #     - lyra_net
-
-  # ============================================================
-  # Relay  (host mode)
-  # ============================================================
-  relay:
+  lyra:
     build:
-      context: ./core/relay
-    container_name: relay
+      context: .
+      dockerfile: Dockerfile
+    container_name: lyra
     restart: unless-stopped
     env_file:
       - ./.env
     volumes:
-      - ./core/relay/sessions:/app/sessions
+      - relay_sessions:/app/relay/sessions  # Named volume declared under top-level `volumes`
+      - nebula_fallback:/app/.nebula_fallback  # Named volume declared under top-level `volumes`
+      - ./cortex:/app/cortex  # Bind-mounts host source for development; start.sh runs uvicorn without --reload, so restart the container to pick up changes
+      - /var/run/docker.sock:/var/run/docker.sock:ro  # NOTE(review): host Docker socket mounted read-only; confirm it is still needed
     ports:
-      - "7078:7078"
+      - "7078:7078"  # Relay API (user-facing)
+      - "7081:7081"  # Cortex API (internal/debug)
     networks:
       - lyra_net
+    healthcheck:  # Probes only the Relay port; the UI service's depends_on waits for this check to report healthy
+      test: ["CMD", "curl", "-f", "http://localhost:7078/_health"]  # NOTE(review): assumes curl is installed in the image; confirm in the Dockerfile
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
 
   # ============================================================
   # UI Server
@@ -112,84 +51,6 @@ services:
       - ./core/ui:/usr/share/nginx/html:ro
     networks:
       - lyra_net
-
-
-  # ============================================================
-  # Cortex
-  # ============================================================
-  cortex:
-    build:
-      context: ./cortex
-    container_name: cortex
-    restart: unless-stopped
-    env_file:
-      - ./cortex/.env
-      - ./.env
-    volumes:
-      - ./cortex:/app
-      - /var/run/docker.sock:/var/run/docker.sock:ro
-    ports:
-      - "7081:7081"
-    networks:
-      - lyra_net
-
-  # ============================================================
-  # Code Sandbox (for tool execution)
-  # ============================================================
-  code-sandbox:
-    build:
-      context: ./sandbox
-    container_name: lyra-code-sandbox
-    restart: unless-stopped
-    security_opt:
-      - no-new-privileges:true
-    cap_drop:
-      - ALL
-    cap_add:
-      - CHOWN
-      - SETUID
-      - SETGID
-    network_mode: "none"
-    volumes:
-      - code_executions:/executions
-    mem_limit: 512m
-    cpus: 1.0
-    pids_limit: 100
-    user: sandbox
-    command: tail -f /dev/null
-
-  # ============================================================
-  # Intake
-  # ============================================================
-#  intake:
-#   build:
-#      context: ./intake
-#    container_name: intake
-#    restart: unless-stopped
-#    env_file:
-#      - ./intake/.env
-#      - ./.env
-#    ports:
-#      - "7080:7080"
-#    volumes:
-#      - ./intake:/app
-#      - ./intake-logs:/app/logs
-#    depends_on:
-#      - cortex
-#    networks:
-#      - lyra_net
-
-  # ============================================================
-  # RAG Service
-  # ============================================================
-  # rag:
-  #   build:
-  #     context: ./rag
-  #   container_name: rag
-  #   restart: unless-stopped
-  #   environment:
-  #     NEOMEM_URL: http://neomem-api:7077
-  #   ports:
-  #     - "7090:7090"
-  #   networks:
-  #     - lyra_net  
\ No newline at end of file
+    depends_on:
+      lyra:
+        condition: service_healthy
diff --git a/start.sh b/start.sh
new file mode 100644
index 0000000..24f2f79
--- /dev/null
+++ b/start.sh
@@ -0,0 +1,34 @@
+#!/bin/bash
+# Unified startup script for Lyra: start Cortex, wait until it is healthy, then run Relay.
+
+set -e
+
+echo "🚀 Starting Lyra unified container..."
+
+# Start Cortex (Python/FastAPI) in the background and remember its PID
+echo "📡 Starting Cortex on port 7081..."
+cd /app/cortex
+uvicorn main:app --host 0.0.0.0 --port 7081 &
+CORTEX_PID=$!
+
+# Poll the Cortex health endpoint once per second, for at most 30 attempts
+echo "⏳ Waiting for Cortex to be ready..."
+for i in {1..30}; do
+    if curl -sf --max-time 2 http://localhost:7081/_health > /dev/null 2>&1; then
+        echo "✅ Cortex is ready!"
+        break
+    fi
+    # Give up at once if uvicorn has already exited, or after the last attempt
+    if ! kill -0 "$CORTEX_PID" 2>/dev/null || [ "$i" -eq 30 ]; then
+        echo "❌ Cortex failed to start (process exited or not healthy after 30 attempts)"
+        exit 1
+    fi
+    sleep 1
+done
+
+# Start Relay (Node.js/Express) in the foreground
+echo "🔌 Starting Relay on port 7078..."
+cd /app/relay
+# exec replaces this shell with node, so signals sent to the script's PID reach Relay.
+# NOTE(review): Cortex stays a background process; nothing here restarts or stops it.
+exec node server.js