Merge pull request #4 from serversdwn/dev

Big clean up to v0.5.0, docs updated, restructured throughout.
2025-11-28 18:14:18 -05:00
parent 3b128ac7f6 211328aba9
commit 1dd84613cf
43 changed files with 11487 additions and 4930 deletions
@@ -0,0 +1,87 @@
+# ====================================
+# 🌌 GLOBAL LYRA CONFIG
+# ====================================
+LOCAL_TZ_LABEL=America/New_York
+DEFAULT_SESSION_ID=default
+
+
+# ====================================
+# 🤖 LLM BACKEND OPTIONS
+# ====================================
+# Services choose which backend to use from these options
+# Primary: vLLM on MI50 GPU
+LLM_PRIMARY_PROVIDER=vllm
+LLM_PRIMARY_URL=http://10.0.0.43:8000
+LLM_PRIMARY_MODEL=/model
+
+# Secondary: Ollama on 3090 GPU
+LLM_SECONDARY_PROVIDER=ollama
+LLM_SECONDARY_URL=http://10.0.0.3:11434
+LLM_SECONDARY_MODEL=qwen2.5:7b-instruct-q4_K_M
+
+# Cloud: OpenAI
+LLM_CLOUD_PROVIDER=openai_chat
+LLM_CLOUD_URL=https://api.openai.com/v1
+LLM_CLOUD_MODEL=gpt-4o-mini
+OPENAI_API_KEY=sk-proj-XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX
+
+# Local Fallback: llama.cpp or LM Studio
+LLM_FALLBACK_PROVIDER=openai_completions
+LLM_FALLBACK_URL=http://10.0.0.41:11435
+LLM_FALLBACK_MODEL=llama-3.2-8b-instruct
+
+# Global LLM controls
+LLM_TEMPERATURE=0.7
+
+
+# ====================================
+# 🗄️ DATABASE CONFIGURATION
+# ====================================
+# Postgres (pgvector for NeoMem)
+POSTGRES_USER=neomem
+POSTGRES_PASSWORD=change_me_in_production
+POSTGRES_DB=neomem
+POSTGRES_HOST=neomem-postgres
+POSTGRES_PORT=5432
+
+# Neo4j Graph Database
+NEO4J_URI=bolt://neomem-neo4j:7687
+NEO4J_USERNAME=neo4j
+NEO4J_PASSWORD=change_me_in_production
+NEO4J_AUTH=neo4j/change_me_in_production
+
+
+# ====================================
+# 🧠 MEMORY SERVICES (NEOMEM)
+# ====================================
+NEOMEM_API=http://neomem-api:7077
+NEOMEM_API_KEY=generate_secure_random_token_here
+NEOMEM_HISTORY_DB=postgresql://neomem:change_me_in_production@neomem-postgres:5432/neomem
+
+# Embeddings configuration (used by NeoMem)
+EMBEDDER_PROVIDER=openai
+EMBEDDER_MODEL=text-embedding-3-small
+
+
+# ====================================
+# 🔌 INTERNAL SERVICE URLS
+# ====================================
+# Using container names for Docker network communication
+INTAKE_API_URL=http://intake:7080
+CORTEX_API=http://cortex:7081
+CORTEX_URL=http://cortex:7081/reflect
+CORTEX_URL_INGEST=http://cortex:7081/ingest
+RAG_API_URL=http://rag:7090
+RELAY_URL=http://relay:7078
+
+# Persona service (optional)
+PERSONA_URL=http://persona-sidecar:7080/current
+
+
+# ====================================
+# 🔧 FEATURE FLAGS
+# ====================================
+CORTEX_ENABLED=true
+MEMORY_ENABLED=true
+PERSONA_ENABLED=false
+DEBUG_PROMPT=true
@@ -5,8 +5,22 @@ __pycache__/
 *.pyc
 *.log

-# Env files (store secrets locally)
+# =============================
+# 🔐 Environment files (NEVER commit secrets!)
+# =============================
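+# Ignore .env files and their .local variants in any directory
+# (other variants such as .env.openai are NOT matched by these patterns)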
 .env
+.env.local
+.env.*.local
+**/.env
+**/.env.local
+
+# BUT track .env.example templates (safe to commit)
+!.env.example
+!**/.env.example
+
+# Ignore backup directory
+.env-backups/

 # =============================
 # 🐳 Docker volumes (HUGE)
@@ -1,12 +1,174 @@
 # Project Lyra — Modular Changelog
-All notable changes to Project Lyra are organized by component.  
-The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)  
+All notable changes to Project Lyra are organized by component.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/)
 and adheres to [Semantic Versioning](https://semver.org/).
-# Last Updated: 11-13-25
+# Last Updated: 11-28-25
 ---

 ## 🧠 Lyra-Core ##############################################################################

+## [Project Lyra v0.5.0] - 2025-11-28
+
+### 🔧 Fixed - Critical API Wiring & Integration
+After the major architectural rewire (v0.4.x), this release fixes all critical endpoint mismatches and ensures end-to-end system connectivity.
+
+#### Cortex → Intake Integration ✅
+- **Fixed** `IntakeClient` to use correct Intake v0.2 API endpoints
+  - Changed `GET /context/{session_id}` → `GET /summaries?session_id={session_id}`
+  - Updated JSON response parsing to extract `summary_text` field
+  - Fixed environment variable name: `INTAKE_API` → `INTAKE_API_URL`
+  - Corrected default port: `7083` → `7080`
+  - Added deprecation warning to `summarize_turn()` method (endpoint removed in Intake v0.2)
+
+#### Relay → UI Compatibility ✅
+- **Added** OpenAI-compatible endpoint `POST /v1/chat/completions`
+  - Accepts standard OpenAI format with `messages[]` array
+  - Returns OpenAI-compatible response structure with `choices[]`
+  - Extracts last message content from messages array
+  - Includes usage metadata (stub values for compatibility)
+- **Refactored** Relay to use shared `handleChatRequest()` function
+  - Both `/chat` and `/v1/chat/completions` use same core logic
+  - Eliminates code duplication
+  - Consistent error handling across endpoints
+
+#### Relay → Intake Connection ✅
+- **Fixed** Intake URL fallback in Relay server configuration
+  - Corrected port: `7082` → `7080`
+  - Updated endpoint: `/summary` → `/add_exchange`
+  - Now properly sends exchanges to Intake for summarization
+
+#### Code Quality & Python Package Structure ✅
+- **Added** missing `__init__.py` files to all Cortex subdirectories
+  - `cortex/llm/__init__.py`
+  - `cortex/reasoning/__init__.py`
+  - `cortex/persona/__init__.py`
+  - `cortex/ingest/__init__.py`
+  - `cortex/utils/__init__.py`
+  - Improves package imports and IDE support
+- **Removed** unused import in `cortex/router.py`: `from unittest import result`
+- **Deleted** empty file `cortex/llm/resolve_llm_url.py` (was 0 bytes, never implemented)
+
+### ✅ Verified Working
+Complete end-to-end message flow now operational:
+```
+UI → Relay (/v1/chat/completions)
+  ↓
+Relay → Cortex (/reason)
+  ↓
+Cortex → Intake (/summaries) [retrieves context]
+  ↓
+Cortex 4-stage pipeline:
+  1. reflection.py → meta-awareness notes
+  2. reasoning.py → draft answer
+  3. refine.py → polished answer
+  4. persona/speak.py → Lyra personality
+  ↓
+Cortex → Relay (returns persona response)
+  ↓
+Relay → Intake (/add_exchange) [async summary]
+  ↓
+Intake → NeoMem (background memory storage)
+  ↓
+Relay → UI (final response)
+```
+
+### 📝 Documentation
+- **Added** this CHANGELOG entry with comprehensive v0.5.0 notes
+- **Updated** README.md to reflect v0.5.0 architecture
+  - Documented new endpoints
+  - Updated data flow diagrams
+  - Clarified Intake v0.2 changes
+  - Corrected service descriptions
+
+### 🐛 Issues Resolved
+- ❌ Cortex could not retrieve context from Intake (wrong endpoint)
+- ❌ UI could not send messages to Relay (endpoint mismatch)
+- ❌ Relay could not send summaries to Intake (wrong port/endpoint)
+- ❌ Python package imports were implicit (missing `__init__.py`)
+
+### ⚠️ Known Issues (Non-Critical)
+- Session management endpoints not implemented in Relay (`GET/POST /sessions/:id`)
+- RAG service currently disabled in docker-compose.yml
+- Cortex `/ingest` endpoint is a stub returning `{"status": "ok"}`
+
+### 🎯 Migration Notes
+If upgrading from v0.4.x:
+1. Pull latest changes from git
+2. Verify environment variables in `.env` files:
+   - Check `INTAKE_API_URL=http://intake:7080` (not `INTAKE_API`)
+   - Verify all service URLs use correct ports
+3. Restart Docker containers: `docker-compose down && docker-compose up -d`
+4. Test with a simple message through the UI
+
+---
+
+## [Infrastructure v1.0.0] - 2025-11-26
+
+### Changed
+- **Environment Variable Consolidation** - Major reorganization to eliminate duplication and improve maintainability
+  - Consolidated 9 scattered `.env` files into single source of truth architecture
+  - Root `.env` now contains all shared infrastructure (LLM backends, databases, API keys, service URLs)
+  - Service-specific `.env` files minimized to only essential overrides:
+    - `cortex/.env`: Reduced from 42 to 22 lines (operational parameters only)
+    - `neomem/.env`: Reduced from 26 to 14 lines (LLM naming conventions only)
+    - `intake/.env`: Kept at 8 lines (already minimal)
+  - **Result**: ~24% reduction in total configuration lines (197 → ~150)
+
+- **Docker Compose Consolidation**
+  - All services now defined in single root `docker-compose.yml`
+  - Relay service updated with complete configuration (env_file, volumes)
+  - Removed redundant `core/docker-compose.yml` (marked as DEPRECATED)
+  - Standardized network communication to use Docker container names
+
+- **Service URL Standardization**
+  - Internal services use container names: `http://neomem-api:7077`, `http://cortex:7081`
+  - External services use IP addresses: `http://10.0.0.43:8000` (vLLM), `http://10.0.0.3:11434` (Ollama)
+  - Removed IP/container name inconsistencies across files
+
+### Added
+- **Security Templates** - Created `.env.example` files for all services
+  - Root `.env.example` with sanitized credentials
+  - Service-specific templates: `cortex/.env.example`, `neomem/.env.example`, `intake/.env.example`, `rag/.env.example`
+  - All `.env.example` files safe to commit to version control
+
+- **Documentation**
+  - `ENVIRONMENT_VARIABLES.md`: Comprehensive reference for all environment variables
+    - Variable descriptions, defaults, and usage examples
+    - Multi-backend LLM strategy documentation
+    - Troubleshooting guide
+    - Security best practices
+  - `DEPRECATED_FILES.md`: Deletion guide for deprecated files with verification steps
+
+- **Enhanced .gitignore**
+  - Ignores all `.env` files (including subdirectories)
+  - Tracks `.env.example` templates for documentation
+  - Ignores `.env-backups/` directory
+
+### Removed
+- `core/.env` - Redundant with root `.env`, now deleted
+- `core/docker-compose.yml` - Consolidated into main compose file (renamed to `core/docker-compose.yml.DEPRECATED` pending deletion)
+
+### Fixed
+- Eliminated duplicate `OPENAI_API_KEY` across 5+ files
+- Eliminated duplicate LLM backend URLs across 4+ files
+- Eliminated duplicate database credentials across 3+ files
+- Resolved Cortex `environment:` section override in docker-compose (now uses env_file)
+
+### Architecture
+- **Multi-Backend LLM Strategy**: Root `.env` provides all backend OPTIONS (PRIMARY, SECONDARY, CLOUD, FALLBACK), services choose which to USE
+  - Cortex → vLLM (PRIMARY) for autonomous reasoning
+  - NeoMem → Ollama (SECONDARY) + OpenAI embeddings
+  - Intake → vLLM (PRIMARY) for summarization
+  - Relay → Fallback chain with user preference
+- Preserves per-service flexibility while eliminating URL duplication
+
+### Migration
+- All original `.env` files backed up to `.env-backups/` with timestamp `20251126_025334`
+- Rollback plan documented in `ENVIRONMENT_VARIABLES.md`
+- Verification steps provided in `DEPRECATED_FILES.md`
+
+---
+
 ## [Lyra_RAG v0.1.0] 2025-11-07
 ### Added
 - Initial standalone RAG module for Project Lyra.
@@ -0,0 +1,91 @@
+# Deprecated Files - Safe to Delete
+
+This file lists all deprecated files that can be safely deleted after verification.
+
+## Files Marked for Deletion
+
+### Docker Compose Files
+
+#### `/core/docker-compose.yml.DEPRECATED`
+- **Status**: DEPRECATED
+- **Reason**: All services consolidated into main `/docker-compose.yml`
+- **Replaced by**: `/docker-compose.yml` (relay service now has complete config)
+- **Safe to delete**: Yes, after verifying main docker-compose works
+
+### Environment Files
+
+All original `.env` files have been consolidated. Backups exist in `.env-backups/` directory.
+
+#### Previously Deleted (Already Done)
+- ✅ `/core/.env` - Deleted (redundant with root .env)
+
+### Experimental/Orphaned Files
+
+#### `/core/env experiments/` (entire directory)
+- **Status**: User will handle separately
+- **Contains**: `.env`, `.env.local`, `.env.openai`
+- **Action**: User to review and clean up
+
+## Verification Steps Before Deleting
+
+Before deleting the deprecated files, verify:
+
+1. **Test main docker-compose.yml works:**
+   ```bash
+   cd /home/serversdown/project-lyra
+   docker-compose down
+   docker-compose up -d
+   docker-compose ps  # All services should be running
+   ```
+
+2. **Verify relay service has correct config:**
+   ```bash
+   docker exec relay env | grep -E "LLM_|NEOMEM_|OPENAI"
+   docker exec relay ls -la /app/sessions  # Sessions volume mounted
+   ```
+
+3. **Test relay functionality:**
+   - Send a test message through relay
+   - Verify memory storage works
+   - Confirm LLM backend connections work
+
+## Deletion Commands
+
+After successful verification, run:
+
+```bash
+cd /home/serversdown/project-lyra
+
+# Delete deprecated docker-compose file
+rm core/docker-compose.yml.DEPRECATED
+
+# Optionally clean up backup directory after confirming everything works
+# (Keep backups for at least a few days/weeks)
+# rm -rf .env-backups/
+```
+
+## Files to Keep
+
+These files should **NOT** be deleted:
+
+- ✅ `.env` (root) - Single source of truth
+- ✅ `.env.example` (root) - Security template (commit to git)
+- ✅ `cortex/.env` - Service-specific config
+- ✅ `cortex/.env.example` - Security template (commit to git)
+- ✅ `neomem/.env` - Service-specific config
+- ✅ `neomem/.env.example` - Security template (commit to git)
+- ✅ `intake/.env` - Service-specific config
+- ✅ `intake/.env.example` - Security template (commit to git)
+- ✅ `rag/.env.example` - Security template (commit to git)
+- ✅ `docker-compose.yml` - Main orchestration file
+- ✅ `ENVIRONMENT_VARIABLES.md` - Documentation
+- ✅ `.gitignore` - Git configuration
+
+## Backup Information
+
+All original `.env` files backed up to:
+- Location: `/home/serversdown/project-lyra/.env-backups/`
+- Timestamp: `20251126_025334`
+- Files: 6 original .env files
+
+Keep backups until you're confident the new setup is stable (recommended: 2-4 weeks).
@@ -0,0 +1,250 @@
+# Environment Variables Reference
+
+This document describes all environment variables used across Project Lyra services.
+
+## Quick Start
+
+1. Copy environment templates:
+   ```bash
+   cp .env.example .env
+   cp cortex/.env.example cortex/.env
+   cp neomem/.env.example neomem/.env
+   cp intake/.env.example intake/.env
+   ```
+
+2. Edit `.env` and add your credentials:
+   - `OPENAI_API_KEY`: Your OpenAI API key
+   - `POSTGRES_PASSWORD`: Database password
+   - `NEO4J_PASSWORD`: Graph database password
+   - `NEOMEM_API_KEY`: Generate a secure token
+
+3. Update service URLs if your infrastructure differs from defaults
+
+## File Structure
+
+### Root `.env` - Shared Infrastructure
+Contains all shared configuration used by multiple services:
+- LLM backend options (PRIMARY, SECONDARY, CLOUD, FALLBACK)
+- Database credentials (Postgres, Neo4j)
+- API keys (OpenAI)
+- Internal service URLs
+- Feature flags
+
+### Service-Specific `.env` Files
+Each service has minimal overrides for service-specific parameters:
+- **`cortex/.env`**: Cortex operational parameters
+- **`neomem/.env`**: NeoMem LLM naming convention mappings
+- **`intake/.env`**: Intake summarization parameters
+
+## Environment Loading Order
+
+Docker Compose loads environment files in this order (later overrides earlier):
+1. Root `.env`
+2. Service-specific `.env` (e.g., `cortex/.env`)
+
+This means service-specific files can override root values when needed.
+
+## Global Variables (Root `.env`)
+
+### Global Configuration
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LOCAL_TZ_LABEL` | `America/New_York` | Timezone for logs and timestamps |
+| `DEFAULT_SESSION_ID` | `default` | Default chat session identifier |
+
+### LLM Backend Options
+Each service chooses which backend to use from these available options.
+
+#### Primary Backend (vLLM on MI50 GPU)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_PRIMARY_PROVIDER` | `vllm` | Provider type |
+| `LLM_PRIMARY_URL` | `http://10.0.0.43:8000` | vLLM server endpoint |
+| `LLM_PRIMARY_MODEL` | `/model` | Model path for vLLM |
+
+#### Secondary Backend (Ollama on 3090 GPU)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_SECONDARY_PROVIDER` | `ollama` | Provider type |
+| `LLM_SECONDARY_URL` | `http://10.0.0.3:11434` | Ollama server endpoint |
+| `LLM_SECONDARY_MODEL` | `qwen2.5:7b-instruct-q4_K_M` | Ollama model name |
+
+#### Cloud Backend (OpenAI)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_CLOUD_PROVIDER` | `openai_chat` | Provider type |
+| `LLM_CLOUD_URL` | `https://api.openai.com/v1` | OpenAI API endpoint |
+| `LLM_CLOUD_MODEL` | `gpt-4o-mini` | OpenAI model to use |
+| `OPENAI_API_KEY` | *required* | OpenAI API authentication key |
+
+#### Fallback Backend (llama.cpp/LM Studio)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_FALLBACK_PROVIDER` | `openai_completions` | Provider type (llama.cpp mimics OpenAI) |
+| `LLM_FALLBACK_URL` | `http://10.0.0.41:11435` | Fallback server endpoint |
+| `LLM_FALLBACK_MODEL` | `llama-3.2-8b-instruct` | Fallback model name |
+
+#### LLM Global Settings
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_TEMPERATURE` | `0.7` | Sampling temperature (0.0-2.0) |
+
+### Database Configuration
+
+#### PostgreSQL (with pgvector)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `POSTGRES_USER` | `neomem` | PostgreSQL username |
+| `POSTGRES_PASSWORD` | *required* | PostgreSQL password |
+| `POSTGRES_DB` | `neomem` | Database name |
+| `POSTGRES_HOST` | `neomem-postgres` | Container name/hostname |
+| `POSTGRES_PORT` | `5432` | PostgreSQL port |
+
+#### Neo4j Graph Database
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `NEO4J_URI` | `bolt://neomem-neo4j:7687` | Neo4j connection URI |
+| `NEO4J_USERNAME` | `neo4j` | Neo4j username |
+| `NEO4J_PASSWORD` | *required* | Neo4j password |
+| `NEO4J_AUTH` | `neo4j/<password>` | Neo4j auth string |
+
+### Memory Services (NeoMem)
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `NEOMEM_API` | `http://neomem-api:7077` | NeoMem API endpoint |
+| `NEOMEM_API_KEY` | *required* | NeoMem API authentication token |
+| `NEOMEM_HISTORY_DB` | `postgresql://...` | PostgreSQL connection string for history |
+| `EMBEDDER_PROVIDER` | `openai` | Embedding provider (used by NeoMem) |
+| `EMBEDDER_MODEL` | `text-embedding-3-small` | Embedding model name |
+
+### Internal Service URLs
+All using Docker container names for network communication:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `INTAKE_API_URL` | `http://intake:7080` | Intake summarizer service |
+| `CORTEX_API` | `http://cortex:7081` | Cortex reasoning service |
+| `CORTEX_URL` | `http://cortex:7081/reflect` | Cortex reflection endpoint |
+| `CORTEX_URL_INGEST` | `http://cortex:7081/ingest` | Cortex ingest endpoint |
+| `RAG_API_URL` | `http://rag:7090` | RAG service (if enabled) |
+| `RELAY_URL` | `http://relay:7078` | Relay orchestration service |
+| `PERSONA_URL` | `http://persona-sidecar:7080/current` | Persona service (optional) |
+
+### Feature Flags
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `CORTEX_ENABLED` | `true` | Enable Cortex autonomous reflection |
+| `MEMORY_ENABLED` | `true` | Enable NeoMem long-term memory |
+| `PERSONA_ENABLED` | `false` | Enable persona sidecar |
+| `DEBUG_PROMPT` | `true` | Enable debug logging for prompts |
+
+## Service-Specific Variables
+
+### Cortex (`cortex/.env`)
+Cortex operational parameters:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `CORTEX_MODE` | `autonomous` | Operation mode (autonomous/manual) |
+| `CORTEX_LOOP_INTERVAL` | `300` | Seconds between reflection loops |
+| `CORTEX_REFLECTION_INTERVAL` | `86400` | Seconds between deep reflections (24h) |
+| `CORTEX_LOG_LEVEL` | `debug` | Logging verbosity |
+| `NEOMEM_HEALTH_CHECK_INTERVAL` | `300` | NeoMem health check frequency |
+| `REFLECTION_NOTE_TARGET` | `trilium` | Where to store reflection notes |
+| `REFLECTION_NOTE_PATH` | `/app/logs/reflections.log` | Reflection output path |
+| `RELEVANCE_THRESHOLD` | `0.78` | Memory retrieval relevance threshold |
+
+**Note**: Cortex uses the `LLM_PRIMARY_*` backend (vLLM on MI50) by default from root `.env`.
+
+### NeoMem (`neomem/.env`)
+NeoMem uses different variable naming conventions:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `LLM_PROVIDER` | `ollama` | NeoMem's LLM provider name |
+| `LLM_MODEL` | `qwen2.5:7b-instruct-q4_K_M` | NeoMem's LLM model |
+| `LLM_API_BASE` | `http://10.0.0.3:11434` | NeoMem's LLM endpoint (Ollama) |
+
+**Note**: NeoMem uses Ollama (SECONDARY) for reasoning and OpenAI for embeddings. Database credentials and `OPENAI_API_KEY` inherited from root `.env`.
+
+### Intake (`intake/.env`)
+Intake summarization parameters:
+
+| Variable | Default | Description |
+|----------|---------|-------------|
+| `SUMMARY_MODEL_NAME` | `/model` | Model path for summarization |
+| `SUMMARY_API_URL` | `http://10.0.0.43:8000` | LLM endpoint for summaries |
+| `SUMMARY_MAX_TOKENS` | `400` | Max tokens for summary generation |
+| `SUMMARY_TEMPERATURE` | `0.4` | Temperature for summaries (lower = more focused) |
+| `SUMMARY_INTERVAL` | `300` | Seconds between summary checks |
+| `INTAKE_LOG_PATH` | `/app/logs/intake.log` | Log file location |
+| `INTAKE_LOG_LEVEL` | `info` | Logging verbosity |
+
+**Note**: Intake uses the `LLM_PRIMARY_*` backend (vLLM) by default.
+
+## Multi-Backend LLM Strategy
+
+Project Lyra supports flexible backend selection per service:
+
+**Root `.env` provides backend OPTIONS**:
+- PRIMARY: vLLM on MI50 GPU (high performance)
+- SECONDARY: Ollama on 3090 GPU (local inference)
+- CLOUD: OpenAI API (cloud fallback)
+- FALLBACK: llama.cpp/LM Studio (CPU-only)
+
+**Services choose which backend to USE**:
+- **Cortex** → vLLM (PRIMARY) for autonomous reasoning
+- **NeoMem** → Ollama (SECONDARY) + OpenAI embeddings
+- **Intake** → vLLM (PRIMARY) for summarization
+- **Relay** → Implements fallback cascade with user preference
+
+This design eliminates URL duplication while preserving per-service flexibility.
+
+## Security Best Practices
+
+1. **Never commit `.env` files to git** - they contain secrets
+2. **Use `.env.example` templates** for documentation and onboarding
+3. **Rotate credentials regularly**, especially:
+   - `OPENAI_API_KEY`
+   - `NEOMEM_API_KEY`
+   - Database passwords
+4. **Use strong passwords** for production databases
+5. **Restrict network access** to LLM backends and databases
+
+## Troubleshooting
+
+### Services can't connect to each other
+- Verify container names match in service URLs
+- Check all services are on the `lyra_net` Docker network
+- Use `docker-compose ps` to verify all services are running
+
+### LLM calls failing
+- Verify backend URLs are correct for your infrastructure
+- Check if LLM servers are running and accessible
+- Test with `curl <LLM_URL>/v1/models` (OpenAI-compatible APIs)
+
+### Database connection errors
+- Verify database credentials match in all locations
+- Check if database containers are healthy: `docker-compose ps`
+- Review database logs: `docker-compose logs neomem-postgres`
+
+### Environment variables not loading
+- Verify env_file paths in docker-compose.yml
+- Check file permissions: `.env` files must be readable
+- Remember loading order: service `.env` overrides root `.env`
+
+## Migration from Old Setup
+
+If you have the old multi-file setup with duplicated variables:
+
+1. **Backup existing files**: All original `.env` files are in `.env-backups/`
+2. **Copy new templates**: Use `.env.example` files as base
+3. **Merge credentials**: Transfer your actual keys/passwords to new root `.env`
+4. **Test thoroughly**: Verify all services start and communicate correctly
+
+## Support
+
+For issues or questions:
+- Check logs: `docker-compose logs <service>`
+- Verify configuration: `docker exec <container> env | grep <VAR>`
+- Review this documentation for variable descriptions
@@ -1,73 +1,178 @@
-##### Project Lyra - README v0.3.0 - needs fixing #####
+# Project Lyra - README v0.5.0

-Lyra is a modular persistent AI companion system.  
-It provides memory-backed chat using **NeoMem** + **Relay** + **Persona Sidecar**,  
-with optional subconscious annotation powered by **Cortex VM** running local LLMs.
+Lyra is a modular persistent AI companion system with advanced reasoning capabilities.
+It provides memory-backed chat using **NeoMem** + **Relay** + **Cortex**,
+with a multi-stage reasoning pipeline powered by distributed LLM backends.

-## Mission Statement ##
-	The point of project lyra is to give an AI chatbot more abilities than a typical chatbot. typical chat bots are essentially amnesic and forget everything about your project. Lyra helps keep projects organized and remembers everything you have done. Think of her abilities as a notepad/schedule/data base/ co-creator/collaborattor all with its own executive function. Say something in passing, Lyra remembers it then reminds you of it later.
+## Mission Statement
+
+The point of Project Lyra is to give an AI chatbot more abilities than a typical chatbot. Typical chatbots are essentially amnesic and forget everything about your project. Lyra helps keep projects organized and remembers everything you have done. Think of her abilities as a notepad/schedule/database/co-creator/collaborator, all with her own executive function. Say something in passing, and Lyra remembers it and reminds you of it later.
 	
 ---
-	
-## Structure ##
-	Project Lyra exists as a series of docker containers that run independentally of each other but are all networked together. Think of it as how the brain has regions, Lyra has modules:
-	## A. VM 100 - lyra-core:
-		1. ** Core v0.3.1 - Docker Stack
-			- Relay - (docker container) - The main harness that connects the modules together and accepts input from the user.
-			- UI - (HTML) - This is how the user communicates with lyra. ATM its a typical instant message interface, but plans are to make it much more than that.
-			- Persona - (docker container) - This is the personality of lyra, set how you want her to behave. Give specific instructions for output. Basically prompt injection.
-			- All of this is built and controlled by a single .env and docker-compose.lyra.yml.
-		2. **NeoMem v0.1.0 - (docker stack)
-			- NeoMem is Lyra's main long term memory data base. It is a fork of mem0 oss. Uses vector databases and graph.
-			- NeoMem launches with a single separate docker-compose.neomem.yml.
-			
-	## B. VM 101 - lyra - cortex
-		3. ** Cortex - VM containing docker stack
-		- This is the working reasoning layer of Lyra.
-		- Built to be flexible in deployment. Run it locally or remotely (via wan/lan) 
-		- Intake v0.1.0 - (docker Container) gives conversations context and purpose
-			- Intake takes the last N exchanges and summarizes them into coherrent short term memories.
-			- Uses a cascading summarization setup that quantizes the exchanges. Summaries occur at L2, L5, L10, L15, L20 etc.
-			- Keeps the bot aware of what is going on with out having to send it the whole chat every time. 
-		- Cortex - Docker container containing: 
-			- Reasoning Layer
-				- TBD
-			- Reflect - (docker continer) - Not yet implemented, road map. 
-				- Calls back to NeoMem after N exchanges and N summaries and edits memories created during the initial messaging step. This helps contain memories to coherrent thoughts, reduces the noise.
-				- Can be done actively and asynchronously, or on a time basis (think human sleep and dreams). 
-				- This stage is not yet built, this is just an idea. 
-		
-	## C. Remote LLM APIs:
-		3. **AI Backends
-			- Lyra doesnt run models her self, she calls up APIs.
-			- Endlessly customizable as long as it outputs to the same schema. 
+
+## Architecture Overview
+
+Project Lyra operates as a series of Docker containers networked together in a microservices architecture. Just as the brain has regions, Lyra has modules:
+
+### A. VM 100 - lyra-core (Core Services)
+
+**1. Relay** (Node.js/Express) - Port 7078
+- Main orchestrator and message router
+- Coordinates all module interactions
+- OpenAI-compatible endpoint: `POST /v1/chat/completions`
+- Internal endpoint: `POST /chat`
+- Routes messages through Cortex reasoning pipeline
+- Manages async calls to Intake and NeoMem
+
+**2. UI** (Static HTML)
+- Browser-based chat interface with cyberpunk theme
+- Connects to Relay at `http://10.0.0.40:7078`
+- Saves and loads sessions
+- OpenAI-compatible message format
+
+**3. NeoMem** (Python/FastAPI) - Port 7077
+- Long-term memory database (fork of Mem0 OSS)
+- Vector storage (PostgreSQL + pgvector) + Graph storage (Neo4j)
+- RESTful API: `/memories`, `/search`
+- Semantic memory updates and retrieval
+- No external SDK dependencies - fully local
+
+### B. VM 101 - lyra-cortex (Reasoning Layer)
+
+**4. Cortex** (Python/FastAPI) - Port 7081
+- Primary reasoning engine with multi-stage pipeline
+- **4-Stage Processing:**
+  1. **Reflection** - Generates meta-awareness notes about conversation
+  2. **Reasoning** - Creates initial draft answer using context
+  3. **Refinement** - Polishes and improves the draft
+  4. **Persona** - Applies Lyra's personality and speaking style
+- Integrates with Intake for short-term context
+- Flexible LLM router supporting multiple backends
+
+**5. Intake v0.2** (Python/FastAPI) - Port 7080
+- Simplified short-term memory summarization
+- Session-based circular buffer (deque, maxlen=200)
+- Single-level simple summarization (no cascading)
+- Background async processing with FastAPI BackgroundTasks
+- Pushes summaries to NeoMem automatically
+- **API Endpoints:**
+  - `POST /add_exchange` - Add conversation exchange
+  - `GET /summaries?session_id={id}` - Retrieve session summary
+  - `POST /close_session/{id}` - Close and cleanup session
+
+### C. LLM Backends (Remote/Local APIs)
+
+**Multi-Backend Strategy:**
+- **PRIMARY**: vLLM on AMD MI50 GPU (`http://10.0.0.43:8000`) - Cortex reasoning, Intake
+- **SECONDARY**: Ollama on RTX 3090 (`http://10.0.0.3:11434`) - Configurable per-module
+- **CLOUD**: OpenAI API (`https://api.openai.com/v1`) - Cortex persona layer
+- **FALLBACK**: Local backup (`http://10.0.0.41:11435`) - Emergency fallback 
 			
 ---

+## Data Flow Architecture (v0.5.0)

-## 🚀 Features ##
+### Normal Message Flow:

-# Lyra-Core VM (VM100)
- **Relay **:
-  - The main harness and orchestrator of Lyra.
-  - OpenAI-compatible endpoint: `POST /v1/chat/completions`
-  - Injects persona + relevant memories into every LLM call
-  - Routes all memory storage/retrieval through **NeoMem**
-  - Logs spans (`neomem.add`, `neomem.search`, `persona.fetch`, `llm.generate`)
+```
+User (UI) → POST /v1/chat/completions
+  ↓
+Relay (7078)
+  ↓ POST /reason
+Cortex (7081)
+  ↓ GET /summaries?session_id=xxx
+Intake (7080) [RETURNS SUMMARY]
+  ↓
+Cortex processes (4 stages):
+  1. reflection.py → meta-awareness notes
+  2. reasoning.py → draft answer (uses LLM)
+  3. refine.py → refined answer (uses LLM)
+  4. persona/speak.py → Lyra personality (uses LLM)
+  ↓
+Returns persona answer to Relay
+  ↓
+Relay → Cortex /ingest (async, stub)
+Relay → Intake /add_exchange (async)
+  ↓
+Intake → Background summarize → NeoMem
+  ↓
+Relay → UI (returns final response)
+```

- **NeoMem (Memory Engine)**:
-  - Forked from Mem0 OSS and fully independent.
-  - Drop-in compatible API (`/memories`, `/search`).
-  - Local-first: runs on FastAPI with Postgres + Neo4j.
-  - No external SDK dependencies.
-  - Default service: `neomem-api` (port 7077).
-  - Capable of adding new memories and updating previous memories. Compares existing embeddings and performs in place updates when a memory is judged to be a semantic match.
+### Cortex 4-Stage Reasoning Pipeline:

- **UI**:
-  - Lightweight static HTML chat page.
-  - Connects to Relay at `http://<host>:7078`.
-  - Nice cyberpunk theme!
-  - Saves and loads sessions, which then in turn send to relay.
+1. **Reflection** (`reflection.py`) - Cloud backend (OpenAI)
+   - Analyzes user intent and conversation context
+   - Generates meta-awareness notes
+   - "What is the user really asking?"
+
+2. **Reasoning** (`reasoning.py`) - Primary backend (vLLM)
+   - Retrieves short-term context from Intake
+   - Creates initial draft answer
+   - Integrates context, reflection notes, and user prompt
+
+3. **Refinement** (`refine.py`) - Primary backend (vLLM)
+   - Polishes the draft answer
+   - Improves clarity and coherence
+   - Ensures factual consistency
+
+4. **Persona** (`speak.py`) - Cloud backend (OpenAI)
+   - Applies Lyra's personality and speaking style
+   - Natural, conversational output
+   - Final answer returned to user
+
+---
+
+## Features
+
+### Lyra-Core (VM 100)
+
+**Relay**:
+- Main orchestrator and message router
+- OpenAI-compatible endpoint: `POST /v1/chat/completions`
+- Internal endpoint: `POST /chat`
+- Health check: `GET /_health`
+- Async non-blocking calls to Cortex and Intake
+- Shared request handler for code reuse
+- Comprehensive error handling
+
+**NeoMem (Memory Engine)**:
+- Forked from Mem0 OSS - fully independent
+- Drop-in compatible API (`/memories`, `/search`)
+- Local-first: runs on FastAPI with Postgres + Neo4j
+- No external SDK dependencies
+- Semantic memory updates - compares embeddings and performs in-place updates
+- Default service: `neomem-api` (port 7077)
+
+**UI**:
+- Lightweight static HTML chat interface
+- Cyberpunk theme
+- Session save/load functionality
+- OpenAI message format support
+
+### Cortex (VM 101)
+
+**Cortex** (v0.5):
+- Multi-stage reasoning pipeline (reflection → reasoning → refine → persona)
+- Flexible LLM backend routing
+- Per-stage backend selection
+- Async processing throughout
+- IntakeClient integration for short-term context
+- `/reason`, `/ingest` (stub), `/health` endpoints
+
+**Intake** (v0.2):
+- Simplified single-level summarization
+- Session-based circular buffer (200 exchanges max)
+- Background async summarization
+- Automatic NeoMem push
+- No persistent log files (memory-only)
+- **Breaking change from v0.1**: Removed cascading summaries (L1, L2, L5, L10, L20, L30)
+
+**LLM Router**:
+- Dynamic backend selection
+- Environment-driven configuration
+- Support for vLLM, Ollama, OpenAI, custom endpoints
+- Per-module backend preferences

 # Beta Lyrae (RAG Memory DB) - added 11-3-25
 - **RAG Knowledge DB - Beta Lyrae (sheliak)**
@@ -159,7 +264,85 @@ with optional subconscious annotation powered by **Cortex VM** running local LLM
 			└── Future: sends summaries → Cortex for reflection


-# Additional information available in the trilium docs. #
+---
+
+## Version History
+
+### v0.5.0 (2025-11-28) - Current Release
+- ✅ Fixed all critical API wiring issues
+- ✅ Added OpenAI-compatible endpoint to Relay (`/v1/chat/completions`)
+- ✅ Fixed Cortex → Intake integration
+- ✅ Added missing Python package `__init__.py` files
+- ✅ End-to-end message flow verified and working
+
+### v0.4.x (Major Rewire)
+- Cortex multi-stage reasoning pipeline
+- Intake v0.2 simplification
+- LLM router with multi-backend support
+- Major architectural restructuring
+
+### v0.3.x
+- Beta Lyrae RAG system
+- NeoMem integration
+- Basic Cortex reasoning loop
+
+---
+
+## Known Issues (v0.5.0)
+
+### Non-Critical
+- Session management endpoints not fully implemented in Relay
+- RAG service currently disabled in docker-compose.yml
+- Cortex `/ingest` endpoint is a stub
+
+### Future Enhancements
+- Re-enable RAG service integration
+- Implement full session persistence
+- Add request correlation IDs for tracing
+- Comprehensive health checks
+
+---
+
+## Quick Start
+
+### Prerequisites
+- Docker + Docker Compose
+- PostgreSQL 13+, Neo4j 4.4+ (for NeoMem)
+- At least one LLM API endpoint (vLLM, Ollama, or OpenAI)
+
+### Setup
+1. Configure environment variables in `.env` files
+2. Start services: `docker-compose up -d`
+3. Check health: `curl http://localhost:7078/_health`
+4. Access UI: `http://localhost:7078`
+
+### Test
+```bash
+curl -X POST http://localhost:7078/v1/chat/completions \
+  -H "Content-Type: application/json" \
+  -d '{
+    "messages": [{"role": "user", "content": "Hello Lyra!"}],
+    "session_id": "test"
+  }'
+```
+
+---
+
+## Documentation
+
+- See [CHANGELOG.md](CHANGELOG.md) for detailed version history
+- See `ENVIRONMENT_VARIABLES.md` for environment variable reference
+- Additional information available in the Trilium docs
+
+---
+
+## License
+
+NeoMem is a derivative work based on Mem0 OSS (Apache 2.0).
+© 2025 Terra-Mechanics / ServersDown Labs. All modifications released under Apache 2.0.
+
+**Built with Claude Code**
+
 ---

 ## 📦 Requirements
@@ -1,3 +1,7 @@
+# DEPRECATED - use the docker-compose.yml at the repository root (project-lyra/docker-compose.yml) instead.
+# This file is no longer needed: all services are now defined in the main docker-compose.yml.
+# It is safe to delete once the main compose file has been verified to work correctly.
+#
 services:
  relay:
    build:
@@ -7,7 +11,7 @@ services:
    ports:
      - "7078:7078"
    env_file:
-      - .env
+      - ../.env  # Use root .env (core/.env is now redundant)
    volumes:
      - ./relay/sessions:/app/sessions
    networks:
@@ -0,0 +1,16 @@
+# Ignore node_modules - Docker will rebuild them inside
+node_modules
+npm-debug.log
+yarn-error.log
+*.log
+
+# Ignore environment files
+.env
+.env.local
+
+# Ignore OS/editor cruft
+.DS_Store
+*.swp
+*.swo
+.vscode
+.idea
@@ -0,0 +1,18 @@
+# relay/Dockerfile
+# Builds the Relay service image on Node 18 (Alpine).
+FROM node:18-alpine
+
+# Create app directory
+WORKDIR /app
+
+# Copy the manifest (and the lockfile, when one exists) before the sources so
+# the dependency layer is cached until the dependency set changes. The glob
+# matches package.json as well as package-lock.json, so npm installs the
+# locked versions instead of re-resolving them on every build.
+COPY package*.json ./
+RUN npm install
+
+# Copy the rest of the app
+COPY . .
+
+# Port the server listens on by default
+EXPOSE 7078
+
+# Run the server via the "start" script in package.json
+CMD ["npm", "start"]
@@ -0,0 +1,73 @@
+// relay/lib/cortex.js
+import fetch from "node-fetch";
+
+// Full endpoint URLs (path included) for the two Cortex calls made by this
+// module. Each can be overridden through its environment variable; the
+// localhost defaults apply when the variable is unset or empty.
+const REFLECT_URL = process.env.CORTEX_URL || "http://localhost:7081/reflect";
+const INGEST_URL  = process.env.CORTEX_URL_INGEST || "http://localhost:7081/ingest";
+
+export async function reflectWithCortex(userInput, memories = []) {
+  const body = { prompt: userInput, memories };
+  try {
+    const res = await fetch(REFLECT_URL, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(body),
+      timeout: 120000,
+    });
+
+    const rawText = await res.text();
+	console.log("🔎 [Cortex-Debug] rawText from /reflect →", rawText.slice(0, 300));
+    if (!res.ok) {
+      throw new Error(`HTTP ${res.status} — ${rawText.slice(0, 200)}`);
+    }
+
+    let data;
+    try {
+      data = JSON.parse(rawText);
+    } catch (err) {
+      // Fallback ① try to grab a JSON-looking block
+      const match = rawText.match(/\{[\s\S]*\}/);
+      if (match) {
+        try {
+          data = JSON.parse(match[0]);
+        } catch {
+          data = { reflection_raw: rawText.trim(), notes: "partial parse" };
+        }
+      } else {
+        // Fallback ② if it’s already an object (stringified Python dict)
+        try {
+          const normalized = rawText
+            .replace(/'/g, '"')        // convert single quotes
+            .replace(/None/g, 'null'); // convert Python None
+          data = JSON.parse(normalized);
+        } catch {
+          data = { reflection_raw: rawText.trim(), notes: "no JSON found" };
+        }
+      }
+    }
+
+    if (typeof data !== "object") {
+      data = { reflection_raw: rawText.trim(), notes: "non-object response" };
+    }
+
+    console.log("🧠 Cortex reflection normalized:", data);
+    return data;
+  } catch (e) {
+    console.warn("⚠️ Cortex reflect failed:", e.message);
+    return { error: e.message, reflection_raw: "" };
+  }
+}
+
+export async function ingestToCortex(user, assistant, reflection = {}, sessionId = "default") {
+  const body = { turn: { user, assistant }, reflection, session_id: sessionId };
+  try {
+    const res = await fetch(INGEST_URL, {
+      method: "POST",
+      headers: { "Content-Type": "application/json" },
+      body: JSON.stringify(body),
+      timeout: 120000,
+    });
+    console.log(`📤 Sent exchange to Cortex ingest (${res.status})`);
+  } catch (e) {
+    console.warn("⚠️ Cortex ingest failed:", e.message);
+  }
+}
@@ -0,0 +1,93 @@
+/**
+ * Send a chat request to a single LLM backend and normalize its reply.
+ *
+ * The endpoint path and payload shape are chosen from `backend.type`:
+ *   - "ollama"             -> `/api/chat`, chat messages
+ *   - "vllm" / "llamacpp"  -> `/v1/completions`, messages joined into a prompt
+ *   - "openai"             -> `/v1/chat/completions`, chat messages + bearer token
+ * Any other type posts chat messages to `backend.url` unchanged.
+ *
+ * @param {{key: string, type: string, url: string, model: string}} backend
+ * @param {Array<{role: string, content: string}>} messages
+ * @returns {Promise<{reply: string, raw: string, backend: string}>}
+ * @throws {Error} On missing url/model, missing OpenAI key, non-2xx status,
+ *   network failure or timeout. A body that cannot be parsed does not throw;
+ *   it yields a `[parse error: ...]` reply.
+ */
+async function tryBackend(backend, messages) {
+  if (!backend.url || !backend.model) throw new Error("missing url/model");
+
+  const isOllama = backend.type === "ollama";
+  const isOpenAI = backend.type === "openai";
+  const usesCompletions = backend.type === "vllm" || backend.type === "llamacpp";
+
+  const headers = { "Content-Type": "application/json" };
+  if (isOpenAI) {
+    // No OPENAI_API_KEY binding is declared in this module, so the key is
+    // read from the environment. Failing early lets the caller fall through
+    // to the next backend instead of sending "Bearer undefined".
+    const apiKey = process.env.OPENAI_API_KEY;
+    if (!apiKey) throw new Error("missing OPENAI_API_KEY");
+    headers["Authorization"] = `Bearer ${apiKey}`;
+  }
+
+  // Choose correct endpoint automatically
+  let suffix = "";
+  if (isOllama) suffix = "/api/chat";
+  else if (usesCompletions) suffix = "/v1/completions";
+  else if (isOpenAI) suffix = "/v1/chat/completions";
+
+  let endpoint = backend.url.replace(/\/+$/, "");
+  if (suffix && !endpoint.endsWith(suffix)) {
+    // A base URL may already end in "/v1" (e.g. https://api.openai.com/v1);
+    // do not append a second "/v1" in that case.
+    const baseHasV1 = endpoint.endsWith("/v1") && suffix.startsWith("/v1/");
+    endpoint += baseHasV1 ? suffix.slice("/v1".length) : suffix;
+  }
+
+  // Build payload based on backend style
+  const body = usesCompletions
+    ? {
+        model: backend.model,
+        prompt: messages.map(m => m.content).join("\n"),
+        max_tokens: 400,
+        temperature: 0.3,
+      }
+    : { model: backend.model, messages, stream: false };
+
+  // fetch has no `timeout` option; enforce the 120 s limit by aborting.
+  const controller = new AbortController();
+  const timer = setTimeout(() => controller.abort(), 120000);
+
+  let raw;
+  try {
+    const resp = await fetch(endpoint, {
+      method: "POST",
+      headers,
+      body: JSON.stringify(body),
+      signal: controller.signal,
+    });
+    if (!resp.ok) throw new Error(`${backend.key} HTTP ${resp.status}`);
+    raw = await resp.text();
+  } finally {
+    clearTimeout(timer);
+  }
+
+  // 🧩 Normalize replies
+  let reply = "";
+  try {
+    if (isOllama) {
+      // Ollama sometimes returns NDJSON lines; merge them
+      reply = raw
+        .split("\n")
+        .filter(line => line.trim().startsWith("{"))
+        .map(line => JSON.parse(line))
+        .map(obj => obj.message?.content || obj.response || "")
+        .join("")
+        .trim();
+    } else {
+      const data = JSON.parse(raw);
+      console.log("🔍 RAW LLM RESPONSE:", JSON.stringify(data, null, 2));
+      // Completions-style text first, then chat-style message content.
+      reply =
+        data?.choices?.[0]?.text?.trim() ||
+        data?.choices?.[0]?.message?.content?.trim() ||
+        data?.message?.content?.trim() ||
+        "";
+    }
+  } catch (err) {
+    reply = `[parse error: ${err.message}]`;
+  }
+
+  return { reply, raw, backend: backend.key };
+}
+
+// ------------------------------------
+// Export the main call helper
+// ------------------------------------
+export async function callSpeechLLM(messages) {
+  const backends = [
+    { key: "primary",  type: "vllm",     url: process.env.LLM_PRIMARY_URL,  model: process.env.LLM_PRIMARY_MODEL },
+    { key: "secondary",type: "ollama",   url: process.env.LLM_SECONDARY_URL,model: process.env.LLM_SECONDARY_MODEL },
+    { key: "cloud",    type: "openai",   url: process.env.LLM_CLOUD_URL,    model: process.env.LLM_CLOUD_MODEL },
+    { key: "fallback", type: "llamacpp", url: process.env.LLM_FALLBACK_URL, model: process.env.LLM_FALLBACK_MODEL },
+  ];
+
+  for (const b of backends) {
+    if (!b.url || !b.model) continue;
+    try {
+      console.log(`🧠 Trying backend: ${b.key.toUpperCase()} (${b.url})`);
+      const out = await tryBackend(b, messages);
+      console.log(`✅ Success via ${b.key.toUpperCase()}`);
+      return out;
+    } catch (err) {
+      console.warn(`⚠️ ${b.key.toUpperCase()} failed: ${err.message}`);
+    }
+  }
+
+  throw new Error("all_backends_failed");
+}
@@ -0,0 +1,16 @@
+{
+  "name": "lyra-relay",
+  "version": "0.1.0",
+  "type": "module",
+  "main": "server.js",
+  "scripts": {
+    "start": "node server.js"
+  },
+  "dependencies": {
+    "cors": "^2.8.5",
+    "dotenv": "^16.6.1",
+    "express": "^4.18.2",
+    "mem0ai": "^2.1.38",
+    "node-fetch": "^3.3.2"
+  }
+}
@@ -0,0 +1,156 @@
+import express from "express";
+import dotenv from "dotenv";
+import cors from "cors";
+import fs from "fs";
+import path from "path";
+
+dotenv.config();
+
+const app = express();
+app.use(cors());
+app.use(express.json());
+
+const PORT = Number(process.env.PORT || 7078);
+const CORTEX_API = process.env.CORTEX_API || "http://cortex:7081";
+const CORTEX_INGEST = process.env.CORTEX_URL_INGEST || "http://cortex:7081/ingest";
+const sessionsDir = path.join(process.cwd(), "sessions");
+
+if (!fs.existsSync(sessionsDir)) fs.mkdirSync(sessionsDir);
+
+// -----------------------------------------------------
+// Helper: fetch with timeout + error detail
+// -----------------------------------------------------
+async function fetchJSON(url, method = "POST", body = null, timeoutMs = 20000) {
+  const controller = new AbortController();
+  const timeout = setTimeout(() => controller.abort(), timeoutMs);
+
+  try {
+    const resp = await fetch(url, {
+      method,
+      headers: { "Content-Type": "application/json" },
+      body: body ? JSON.stringify(body) : null,
+      signal: controller.signal,
+    });
+
+    const text = await resp.text();
+    const parsed = text ? JSON.parse(text) : null;
+
+    if (!resp.ok) {
+      throw new Error(
+        parsed?.detail || parsed?.error || parsed?.message || text || resp.statusText
+      );
+    }
+    return parsed;
+  } finally {
+    clearTimeout(timeout);
+  }
+}
+
+// -----------------------------------------------------
+// Helper: append session turn
+// -----------------------------------------------------
+async function appendSessionExchange(sessionId, entry) {
+  const file = path.join(sessionsDir, `${sessionId}.jsonl`);
+  const line = JSON.stringify({
+    ts: new Date().toISOString(),
+    user: entry.user,
+    assistant: entry.assistant,
+    raw: entry.raw,
+  }) + "\n";
+
+  fs.appendFileSync(file, line, "utf8");
+}
+
+// -----------------------------------------------------
+// HEALTHCHECK
+// -----------------------------------------------------
+app.get("/_health", (_, res) => {
+  res.json({ ok: true, time: new Date().toISOString() });
+});
+
+// -----------------------------------------------------
+// MAIN ENDPOINT
+// -----------------------------------------------------
+app.post("/v1/chat/completions", async (req, res) => {
+  try {
+    const { messages, model } = req.body;
+
+    if (!messages?.length) {
+      return res.status(400).json({ error: "invalid_messages" });
+    }
+
+    const userMsg = messages[messages.length - 1]?.content || "";
+    console.log(`🛰️ Relay received message → "${userMsg}"`);
+
+    // -------------------------------------------------
+    // Step 1: Ask Cortex to process the prompt
+    // -------------------------------------------------
+    let cortexResp;
+    try {
+      cortexResp = await fetchJSON(`${CORTEX_API}/reason`, "POST", {
+        session_id: "default",
+        user_prompt: userMsg,
+      });
+    } catch (err) {
+      console.error("💥 Relay → Cortex error:", err.message);
+      return res.status(500).json({
+        error: "cortex_failed",
+        detail: err.message,
+      });
+    }
+
+    const personaText = cortexResp.persona || "(no persona text returned)";
+
+    // -------------------------------------------------
+    // Step 2: Forward to Cortex ingest (fire-and-forget)
+    // -------------------------------------------------
+    try {
+      await fetchJSON(CORTEX_INGEST, "POST", cortexResp);
+    } catch (err) {
+      console.warn("⚠️ Cortex ingest failed:", err.message);
+    }
+
+    // -------------------------------------------------
+    // Step 3: Local session logging
+    // -------------------------------------------------
+    try {
+      await appendSessionExchange("default", {
+        user: userMsg,
+        assistant: personaText,
+        raw: cortexResp,
+      });
+    } catch (err) {
+      console.warn("⚠️ Relay log write failed:", err.message);
+    }
+
+    // -------------------------------------------------
+    // Step 4: Return OpenAI-style response to UI
+    // -------------------------------------------------
+    return res.json({
+      id: "relay-" + Date.now(),
+      object: "chat.completion",
+      model: model || "lyra",
+      choices: [
+        {
+          index: 0,
+          message: {
+            role: "assistant",
+            content: personaText,
+          },
+          finish_reason: "stop",
+        },
+      ],
+    });
+  } catch (err) {
+    console.error("💥 relay fatal error", err);
+    res.status(500).json({
+      error: "relay_failed",
+      detail: err?.message || String(err),
+    });
+  }
+});
+
+// -----------------------------------------------------
+app.listen(PORT, () => {
+  console.log(`Relay is online at port ${PORT}`);
+});
@@ -0,0 +1,4 @@
+{"ts":"2025-11-26T18:14:01.756Z","turn":[{"role":"user","content":"Are you alive?"},{"role":"assistant","content":"As an AI, I don't exist in the physical world and therefore cannot be considered \"alive\" in the traditional sense. However, I am operational and can engage in conversations like this one! How may I assist you today?"}]}
+{"ts":"2025-11-26T18:27:35.545Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Yes, I'm awake and ready to converse with you! How can I assist you today?"}]}
+{"ts":"2025-11-26T18:33:31.708Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Yes, I am here and ready to help or chat with you! What would you like to know or discuss?"}]}
+{"ts":"2025-11-26T18:49:39.424Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Of course, I'm awake and ready to help you or have a conversation about anything you'd like! What can I assist you with today?"}]}
@@ -0,0 +1,50 @@
+[
+  {
+    "role": "user",
+    "content": "Hey im testing stuff! how goes it?"
+  },
+  {
+    "role": "assistant",
+    "content": "Hey! I'm here and ready to help. What are you testing?"
+  },
+  {
+    "role": "user",
+    "content": "testing the logic loop for your brain!"
+  },
+  {
+    "role": "assistant",
+    "content": "Sounds interesting! How can I assist you with that?"
+  },
+  {
+    "role": "user",
+    "content": "Just keep answering me!@"
+  },
+  {
+    "role": "assistant",
+    "content": "Absolutely! I'm here to respond. What’s next?"
+  },
+  {
+    "role": "user",
+    "content": "how about now how are you?"
+  },
+  {
+    "role": "assistant",
+    "content": "I'm doing well, thanks for asking! How about you?"
+  },
+  {
+    "role": "user",
+    "content": "are you able to tell me what your prompts are saying to you?"
+  },
+  {
+    "role": "assistant",
+    "content": "I can’t share specific internal prompts or system messages, but I can tell you that I’m designed to understand and respond to your questions as best as I can! What else would you like to know?"
+  },
+  {
+    "role": "user",
+    "content": "ok thanks how is the cheese i gave you?"
+  },
+  {
+    "role": "assistant",
+    "content": "I appreciate the gesture! If I could taste it, I’m sure it would be delightful. What kind of cheese was it?"
+  }
+]
@@ -0,0 +1,6 @@
+{"ts":"2025-11-12T20:41:32.497Z","turn":[{"role":"user","content":"Hey im testing stuff! how goes it?"},{"role":"assistant","content":"Hey! I'm here and ready to help. What are you testing?"}]}
+{"ts":"2025-11-12T20:59:32.992Z","turn":[{"role":"user","content":"testing the logic loop for your brain!"},{"role":"assistant","content":"Sounds interesting! How can I assist you with that?"}]}
+{"ts":"2025-11-12T22:08:09.644Z","turn":[{"role":"user","content":"Just keep answering me!@"},{"role":"assistant","content":"Absolutely! I'm here to respond. What’s next?"}]}
+{"ts":"2025-11-12T22:08:48.693Z","turn":[{"role":"user","content":"how about now how are you?"},{"role":"assistant","content":"I'm doing well, thanks for asking! How about you?"}]}
+{"ts":"2025-11-12T22:10:46.435Z","turn":[{"role":"user","content":"are you able to tell me what your prompts are saying to you?"},{"role":"assistant","content":"I can’t share specific internal prompts or system messages, but I can tell you that I’m designed to understand and respond to your questions as best as I can! What else would you like to know?"}]}
+{"ts":"2025-11-12T23:10:16.321Z","turn":[{"role":"user","content":"ok thanks how is the cheese i gave you?"},{"role":"assistant","content":"I appreciate the gesture! If I could taste it, I’m sure it would be delightful. What kind of cheese was it?"}]}
@@ -0,0 +1,122 @@
+[
+  {
+    "role": "user",
+    "content": "Yo yo yo wusup"
+  },
+  {
+    "role": "assistant",
+    "content": "Hey Brian! Not much, just here to help you out. What's up with you?"
+  },
+  {
+    "role": "user",
+    "content": "werkin on codin'"
+  },
+  {
+    "role": "assistant",
+    "content": "Nice! What are you working on? Need any help with coding or anything related to your projects?"
+  },
+  {
+    "role": "user",
+    "content": "working on a little program that saves our chats so you can search them again in different chats"
+  },
+  {
+    "role": "assistant",
+    "content": "That sounds like a great idea! Having a program to save and search through our chats could really enhance the way you interact with Lyra. Do you have a specific approach in mind for how you'll implement the search functionality?"
+  },
+  {
+    "role": "user",
+    "content": "just adding a simple logger into the code you run on"
+  },
+  {
+    "role": "assistant",
+    "content": "That sounds like a solid plan! Implementing a simple logger can make it easier to keep track of conversations. Are you thinking of any specific features for the search functionality, like filtering by keywords or date?"
+  },
+  {
+    "role": "user",
+    "content": "i'll like use a basic word search, possibly integrate it into a rag system i have built"
+  },
+  {
+    "role": "assistant",
+    "content": "Integrating a basic word search into your RAG system sounds like a smart approach! It could enhance the way you retrieve specific chats. Are you planning to use any specific libraries or tools for your RAG system?"
+  },
+  {
+    "role": "user",
+    "content": "no clue!"
+  },
+  {
+    "role": "assistant",
+    "content": "No worries! If you need any suggestions on libraries or tools that could help you with implementing the RAG system or the word search feature, just let me know. I'm here to help!"
+  },
+  {
+    "role": "user",
+    "content": "just having some coding issues, i think i was editing the wrong version of a file LOL"
+  },
+  {
+    "role": "assistant",
+    "content": "That can definitely happen! It can be tricky to keep track of different versions. Have you made any backups or version control for your projects? That might help avoid mix-ups in the future!"
+  },
+  {
+    "role": "user",
+    "content": "i have everything syncing back to my main dev computer, i just do it manually. i have no idea how to use git"
+  },
+  {
+    "role": "assistant",
+    "content": "(no reply)"
+  },
+  {
+    "role": "user",
+    "content": "i have everything syncing back to my main dev computer, i just do it manually. i have no idea how to use git"
+  },
+  {
+    "role": "assistant",
+    "content": "(no reply)"
+  },
+  {
+    "role": "user",
+    "content": "i have everything syncing back to my main dev computer, i just do it manually. i have no idea how to use git"
+  },
+  {
+    "role": "assistant",
+    "content": "Manually syncing is a good start, but learning Git can really help streamline your workflow and keep track of changes. If you're interested, I can provide some resources or tips on how to get started with Git! It can make managing different versions of your code much easier."
+  },
+  {
+    "role": "user",
+    "content": "what is the point of git? how can it streamline my work?"
+  },
+  {
+    "role": "assistant",
+    "content": "Git is a version control system that helps you keep track of changes in your code over time. Here are some ways it can streamline your work:\n\n1. **Version Tracking**: Git allows you to save versions of your code so you can revert back to previous versions if needed. This is especially helpful if you make a change that breaks something.\n\n2. **Collaboration**: If you ever work with others, Git makes it easy to collaborate on projects. You can work on separate branches, merge changes, and resolve conflicts.\n\n3. **Branching**: You can create branches for new features or experiments without affecting the main codebase. Once you're satisfied with the changes, you can merge them back.\n\n4. **Snapshotting**: Git takes snapshots of your code at different points in time, so you can easily see what your code looked like at any point.\n\n5. **Easy Backup**: By pushing your code to a remote repository (like GitHub or GitLab), you can back up your work and access it from anywhere.\n\nLearning Git might take a bit of time, but it can greatly enhance your coding efficiency and help you manage your projects better. If you're interested, I can point you to some beginner-friendly tutorials!"
+  },
+  {
+    "role": "user",
+    "content": "say i am writing one program on my main computer, and then im working on one that lives in a VM. Can i put them both in the same git?"
+  },
+  {
+    "role": "assistant",
+    "content": "Yes, you can use the same Git repository for your code on both your main computer and the VM. Here’s how you can do it:\n\n1. **Create a Git Repository**: You can create a Git repository on your main computer for your program. Use the command `git init` in your project's directory to initialize it.\n\n2. **Commit Changes**: As you make changes to your code on your main computer, you can use `git add .` to stage changes and `git commit -m \"your message\"` to commit them.\n\n3. **Clone the Repository on your VM**: You can then clone the Git repository on your VM using `git clone <repository-url>`. This gives you a copy of the same repository on your VM.\n\n4. **Work on Both**: You can now work on either machine. When you make changes on one, commit those changes, and then pull the latest changes from the other machine to keep everything in sync.\n\n5. **Push and Pull**: Use `git push` to push your changes to a remote repository (like GitHub) if you're using one. Use `git pull` on the other machine to pull the latest code.\n\nJust be careful about making sure you commit and push your changes before switching between machines to avoid conflicts. This setup allows you to maintain a single version of your code across both systems. If you need help with specific commands or setups, just let me know!"
+  },
+  {
+    "role": "user",
+    "content": "hmm i guess that might be useful. Im writting code that is running on a VM. I want to test and experiment while i write the code. The VM is headless though so i've been doing all my editing and work via SSH and SFTP."
+  },
+  {
+    "role": "assistant",
+    "content": "Using a VM and working through SSH and SFTP is a solid way to manage your coding environment! Git can definitely be beneficial in this setup as well. Here’s how it can help:\n\n1. **Easier Code Management**: With Git, you can track changes to your code directly within the VM, making it easier to manage versions as you experiment.\n\n2. **Experiment Branches**: You can create branches for different experiments. This lets you test new features or fixes without affecting your main code, and you can easily switch back and forth between branches.\n\n3. **Collaborative Testing**: If you ever work with others or want to test things collaboratively, you can push changes to a remote repository, making it easy for others to pull and test the latest code.\n\n4. **Integrated with SSH**: You can run Git commands directly from your SSH terminal on the VM, making it straightforward to use. You won’t need any additional tools.\n\n5. **Seamless Remote Access**: Since you’re using SFTP, you can clone the repository to your VM. This way, you can easily pull updates you've made on your main computer, or push changes made on the VM back to the main repository.\n\nOverall, integrating Git into your workflow can enhance your coding experience while working in a headless environment. If you decide to go for it, I can help with the initial setup or any Git-related questions you might have!"
+  },
+  {
+    "role": "user",
+    "content": "so is git like a compressed version of a directory?"
+  },
+  {
+    "role": "assistant",
+    "content": "Not exactly, but that's a good way to think about some of its features! Here's a more precise breakdown:\n\n1. **Snapshots, Not Compression**: Git takes \"snapshots\" of your files at different points in time, meaning it records a full version of your directory's contents every time you make a commit. While this does involve storing the files, it doesn't compress them in the traditional sense—Git keeps track of changes efficiently, so only the differences between versions are stored after the first snapshot.\n\n2. **Directory Structure**: Git is designed to track changes in your source code, including the entire directory structure, file contents, and metadata. It allows you to manage multiple versions of a directory and the files within it.\n\n3. **Branching and Merging**: Git allows you to create branches, which can be thought of as \"parallel versions\" of your directory. Each branch can have different sets of changes, and you can merge these branches to integrate different features or fixes.\n\n4. **History Tracking**: Git also maintains a history of changes, so you can navigate through previous versions of your directory. This includes seeing what changes were made, who made them, and when they occurred.\n\nThe combination of these features makes Git a powerful tool for version control, allowing you to manage your projects and code changes more effectively. If you're curious about any specific aspect of Git or how it works, feel free to ask!"
+  },
+  {
+    "role": "user",
+    "content": "hmm i still dont see the point of it. atleast not for this project. This project is mostly about learning how shit works, im not planning to push this as a product or collab with anyone"
+  },
+  {
+    "role": "assistant",
+    "content": "That makes total sense! If this project is primarily for learning and experimentation, you might not need the full functionality of Git right now. Many developers start without it, especially when they’re working on smaller projects or learning something new.\n\nHere are a few reasons you might still consider using Git, even for personal learning projects:\n\n1. **Tracking Changes**: It can help you understand what changes you made over time. You can easily go back to a previous point if something breaks or if you want to see how you arrived at your current solution.\n\n2. **Learning Opportunity**: Getting familiar with Git now can make it easier when you start working on larger projects or collaborating in the future. It's a valuable skill in the software development world.\n\n3. **Experimentation**: If you're trying out different ideas, branches can help you manage experiments without affecting your main work.\n\nHowever, if you're comfortable managing your changes manually and don’t see the benefit for this particular project, that’s completely okay! Focus on what helps you learn best. If you have specific goals or areas you want to explore, I’m here to help with that too!"
+  }
+]
@@ -0,0 +1,6 @@
+{"ts":"2025-11-12T20:18:06.822Z","turn":[{"role":"user","content":"i have everything syncing back to my main dev computer, i just do it manually. i have no idea how to use git"},{"role":"assistant","content":"Manually syncing is a good start, but learning Git can really help streamline your workflow and keep track of changes. If you're interested, I can provide some resources or tips on how to get started with Git! It can make managing different versions of your code much easier."}]}
+{"ts":"2025-11-12T20:29:20.003Z","turn":[{"role":"user","content":"what is the point of git? how can it streamline my work?"},{"role":"assistant","content":"Git is a version control system that helps you keep track of changes in your code over time. Here are some ways it can streamline your work:\n\n1. **Version Tracking**: Git allows you to save versions of your code so you can revert back to previous versions if needed. This is especially helpful if you make a change that breaks something.\n\n2. **Collaboration**: If you ever work with others, Git makes it easy to collaborate on projects. You can work on separate branches, merge changes, and resolve conflicts.\n\n3. **Branching**: You can create branches for new features or experiments without affecting the main codebase. Once you're satisfied with the changes, you can merge them back.\n\n4. **Snapshotting**: Git takes snapshots of your code at different points in time, so you can easily see what your code looked like at any point.\n\n5. **Easy Backup**: By pushing your code to a remote repository (like GitHub or GitLab), you can back up your work and access it from anywhere.\n\nLearning Git might take a bit of time, but it can greatly enhance your coding efficiency and help you manage your projects better. If you're interested, I can point you to some beginner-friendly tutorials!"}]}
+{"ts":"2025-11-12T20:30:55.329Z","turn":[{"role":"user","content":"say i am writing one program on my main computer, and then im working on one that lives in a VM. Can i put them both in the same git?"},{"role":"assistant","content":"Yes, you can use the same Git repository for your code on both your main computer and the VM. Here’s how you can do it:\n\n1. **Create a Git Repository**: You can create a Git repository on your main computer for your program. Use the command `git init` in your project's directory to initialize it.\n\n2. **Commit Changes**: As you make changes to your code on your main computer, you can use `git add .` to stage changes and `git commit -m \"your message\"` to commit them.\n\n3. **Clone the Repository on your VM**: You can then clone the Git repository on your VM using `git clone <repository-url>`. This gives you a copy of the same repository on your VM.\n\n4. **Work on Both**: You can now work on either machine. When you make changes on one, commit those changes, and then pull the latest changes from the other machine to keep everything in sync.\n\n5. **Push and Pull**: Use `git push` to push your changes to a remote repository (like GitHub) if you're using one. Use `git pull` on the other machine to pull the latest code.\n\nJust be careful about making sure you commit and push your changes before switching between machines to avoid conflicts. This setup allows you to maintain a single version of your code across both systems. If you need help with specific commands or setups, just let me know!"}]}
+{"ts":"2025-11-12T20:33:16.511Z","turn":[{"role":"user","content":"hmm i guess that might be useful. Im writting code that is running on a VM. I want to test and experiment while i write the code. The VM is headless though so i've been doing all my editing and work via SSH and SFTP."},{"role":"assistant","content":"Using a VM and working through SSH and SFTP is a solid way to manage your coding environment! Git can definitely be beneficial in this setup as well. Here’s how it can help:\n\n1. **Easier Code Management**: With Git, you can track changes to your code directly within the VM, making it easier to manage versions as you experiment.\n\n2. **Experiment Branches**: You can create branches for different experiments. This lets you test new features or fixes without affecting your main code, and you can easily switch back and forth between branches.\n\n3. **Collaborative Testing**: If you ever work with others or want to test things collaboratively, you can push changes to a remote repository, making it easy for others to pull and test the latest code.\n\n4. **Integrated with SSH**: You can run Git commands directly from your SSH terminal on the VM, making it straightforward to use. You won’t need any additional tools.\n\n5. **Seamless Remote Access**: Since you’re using SFTP, you can clone the repository to your VM. This way, you can easily pull updates you've made on your main computer, or push changes made on the VM back to the main repository.\n\nOverall, integrating Git into your workflow can enhance your coding experience while working in a headless environment. If you decide to go for it, I can help with the initial setup or any Git-related questions you might have!"}]}
+{"ts":"2025-11-12T20:34:30.196Z","turn":[{"role":"user","content":"so is git like a compressed version of a directory?"},{"role":"assistant","content":"Not exactly, but that's a good way to think about some of its features! Here's a more precise breakdown:\n\n1. **Snapshots, Not Compression**: Git takes \"snapshots\" of your files at different points in time, meaning it records a full version of your directory's contents every time you make a commit. While this does involve storing the files, it doesn't compress them in the traditional sense—Git keeps track of changes efficiently, so only the differences between versions are stored after the first snapshot.\n\n2. **Directory Structure**: Git is designed to track changes in your source code, including the entire directory structure, file contents, and metadata. It allows you to manage multiple versions of a directory and the files within it.\n\n3. **Branching and Merging**: Git allows you to create branches, which can be thought of as \"parallel versions\" of your directory. Each branch can have different sets of changes, and you can merge these branches to integrate different features or fixes.\n\n4. **History Tracking**: Git also maintains a history of changes, so you can navigate through previous versions of your directory. This includes seeing what changes were made, who made them, and when they occurred.\n\nThe combination of these features makes Git a powerful tool for version control, allowing you to manage your projects and code changes more effectively. If you're curious about any specific aspect of Git or how it works, feel free to ask!"}]}
+{"ts":"2025-11-12T20:40:33.207Z","turn":[{"role":"user","content":"hmm i still dont see the point of it. atleast not for this project. This project is mostly about learning how shit works, im not planning to push this as a product or collab with anyone"},{"role":"assistant","content":"That makes total sense! If this project is primarily for learning and experimentation, you might not need the full functionality of Git right now. Many developers start without it, especially when they’re working on smaller projects or learning something new.\n\nHere are a few reasons you might still consider using Git, even for personal learning projects:\n\n1. **Tracking Changes**: It can help you understand what changes you made over time. You can easily go back to a previous point if something breaks or if you want to see how you arrived at your current solution.\n\n2. **Learning Opportunity**: Getting familiar with Git now can make it easier when you start working on larger projects or collaborating in the future. It's a valuable skill in the software development world.\n\n3. **Experimentation**: If you're trying out different ideas, branches can help you manage experiments without affecting your main work.\n\nHowever, if you're comfortable managing your changes manually and don’t see the benefit for this particular project, that’s completely okay! Focus on what helps you learn best. If you have specific goals or areas you want to explore, I’m here to help with that too!"}]}
@@ -0,0 +1,39 @@
+// test-llm.js
+import path from "path";
+import { fileURLToPath } from "url";
+import dotenv from "dotenv";
+import { callSpeechLLM } from "./lib/llm.js";
+
+// ───────────────────────────────────────────────
+// 🔧 Load environment
+// ───────────────────────────────────────────────
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = path.dirname(__filename);
+const envPath = path.join(__dirname, "../.env");
+dotenv.config({ path: envPath });
+
+console.log("🔧 Using .env from:", envPath);
+console.log("🔧 LLM_FORCE_BACKEND =", process.env.LLM_FORCE_BACKEND);
+console.log("🔧 LLM_PRIMARY_URL  =", process.env.LLM_PRIMARY_URL);
+
+// ───────────────────────────────────────────────
+// 🧪 Run a simple test message
+// ───────────────────────────────────────────────
+async function testLLM() {
+  console.log("🧪 Testing LLM helper...");
+
+  const messages = [
+    { role: "user", content: "Say hello in five words or less." }
+  ];
+
+  try {
+    const { reply, backend } = await callSpeechLLM(messages);
+
+    console.log(`✅ Reply: ${reply || "[no reply]"}`);
+    console.log(`Backend used: ${backend || "[unknown]"}`);
+  } catch (err) {
+    console.error("💥 Test failed:", err.message);
+  }
+}
+
+testLLM();
@@ -9,7 +9,7 @@
  "dependencies": {
    "cors": "^2.8.5",
    "dotenv": "^16.6.1",
-    "express": "^4.18.2",
+    "express": "^4.21.2",
    "mem0ai": "^2.1.38",
    "node-fetch": "^3.3.2"
  }
@@ -1,350 +1,168 @@
 import express from "express";
 import dotenv from "dotenv";
 import cors from "cors";
-import fs from "fs";
-import path from "path";
-import { reflectWithCortex, ingestToCortex } from "./lib/cortex.js";

 dotenv.config();

-const sessionsDir = path.join(process.cwd(), "sessions");
-if (!fs.existsSync(sessionsDir)) fs.mkdirSync(sessionsDir);
-
 const app = express();
 app.use(cors());
 app.use(express.json());

-// Cache and normalize env flags/values once
-const {
-  NEOMEM_API,
-  MEM0_API_KEY,
-  OPENAI_API_KEY,
-  OLLAMA_URL,
-  PERSONA_URL,
-  CORTEX_ENABLED,
-  PORT: PORT_ENV,
-  DEBUG_PROMPT,
-} = process.env;
+const PORT = Number(process.env.PORT || 7078);

-const PORT = Number(PORT_ENV) || 7078;
-const cortexEnabled = String(CORTEX_ENABLED).toLowerCase() === "true";
-const debugPrompt = String(DEBUG_PROMPT).toLowerCase() === "true";
+// core endpoints
+const CORTEX_REASON = process.env.CORTEX_REASON_URL || "http://cortex:7081/reason";
+const CORTEX_INGEST = process.env.CORTEX_INGEST_URL || "http://cortex:7081/ingest";
+const INTAKE_URL    = process.env.INTAKE_URL       || "http://intake:7080/add_exchange";

-// Basic env validation warnings (non-fatal)
-if (!NEOMEM_API || !MEM0_API_KEY) {
-  console.warn("⚠️ NeoMem configuration missing: NEOMEM_API or MEM0_API_KEY not set.");
-}
+// -----------------------------------------------------
+// Helper request wrapper
+// -----------------------------------------------------
+async function postJSON(url, data) {
+  const resp = await fetch(url, {
+    method: "POST",
+    headers: { "Content-Type": "application/json" },
+    body: JSON.stringify(data),
+  });
+
+  const raw = await resp.text();
+  let json;

-/* ------------------------------
-   Helpers for NeoMem REST API
--------------------------------*/
-// Small helper for fetch with timeout + JSON + error detail
-async function fetchJSON(url, options = {}, timeoutMs = 30000) {
-  const controller = new AbortController();
-  const t = setTimeout(() => controller.abort(), timeoutMs);
  try {
-    const resp = await fetch(url, { ...options, signal: controller.signal });
-    const text = await resp.text();
-    const parsed = text ? JSON.parse(text) : null;
-    if (!resp.ok) {
-      const msg = parsed?.error || parsed?.message || text || resp.statusText;
-      throw new Error(`${resp.status} ${msg}`);
-    }
-    return parsed;
-  } finally {
-    clearTimeout(t);
+    json = raw ? JSON.parse(raw) : null;
+  } catch (e) {
+    throw new Error(`Non-JSON from ${url}: ${raw}`);
  }
+
+  if (!resp.ok) {
+    throw new Error(json?.detail || json?.error || raw);
+  }
+
+  return json;
 }

-async function memAdd(content, userId, sessionId, cortexData) {
-  const url = `${NEOMEM_API}/memories`;
-  const payload = {
-    messages: [{ role: "user", content }],
-    user_id: userId,
-    // run_id: sessionId,
-    metadata: { source: "relay", cortex: cortexData },
+// -----------------------------------------------------
+// Shared chat handler logic
+// -----------------------------------------------------
+async function handleChatRequest(session_id, user_msg) {
+  // 1. → Cortex.reason
+  let reason;
+  try {
+    reason = await postJSON(CORTEX_REASON, {
+      session_id,
+      user_prompt: user_msg
+    });
+  } catch (e) {
+    console.error("Relay → Cortex.reason error:", e.message);
+    throw new Error(`cortex_reason_failed: ${e.message}`);
+  }
+
+  const persona = reason.final_output || reason.persona || "(no persona text)";
+
+  // 2. → Cortex.ingest (async, non-blocking)
+  postJSON(CORTEX_INGEST, {
+    session_id,
+    user_msg,
+    assistant_msg: persona
+  }).catch(e => console.warn("Relay → Cortex.ingest failed:", e.message));
+
+  // 3. → Intake summary (async, non-blocking)
+  postJSON(INTAKE_URL, {
+    session_id,
+    user_msg,
+    assistant_msg: persona
+  }).catch(e => console.warn("Relay → Intake failed:", e.message));
+
+  // 4. Return result
+  return {
+    session_id,
+    reply: persona
  };
-  return fetchJSON(url, {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: `Bearer ${MEM0_API_KEY}`,
-    },
-    body: JSON.stringify(payload),
-  });
 }

-async function memSearch(query, userId, sessionId) {
-  const url = `${NEOMEM_API}/search`;
-  const payload = { query, user_id: userId };
-  return fetchJSON(url, {
-    method: "POST",
-    headers: {
-      "Content-Type": "application/json",
-      Authorization: `Bearer ${MEM0_API_KEY}`,
-    },
-    body: JSON.stringify(payload),
-  });
-}
-
-/* ------------------------------
-   Utility to time spans
--------------------------------*/
-async function span(name, fn) {
-  const start = Date.now();
-  try {
-    return await fn();
-  } finally {
-    console.log(`${name} took ${Date.now() - start}ms`);
-  }
-}
-
-/* ------------------------------
-   Healthcheck
--------------------------------*/
-app.get("/_health", (req, res) => {
-  res.json({ ok: true, time: new Date().toISOString() });
-});
-
-/* ------------------------------
-   Sessions
--------------------------------*/
-// List all saved sessions
-app.get("/sessions", (_, res) => {
-  const list = fs.readdirSync(sessionsDir)
-    .filter(f => f.endsWith(".json"))
-    .map(f => f.replace(".json", ""));
-  res.json(list);
-});
-
-// Load a single session
-app.get("/sessions/:id", (req, res) => {
-  const file = path.join(sessionsDir, `${req.params.id}.json`);
-  if (!fs.existsSync(file)) return res.json([]);
-  res.json(JSON.parse(fs.readFileSync(file, "utf8")));
-});
-
-// Save or update a session
-app.post("/sessions/:id", (req, res) => {
-  const file = path.join(sessionsDir, `${req.params.id}.json`);
-  fs.writeFileSync(file, JSON.stringify(req.body, null, 2));
+// -----------------------------------------------------
+// HEALTHCHECK
+// -----------------------------------------------------
+app.get("/_health", (_, res) => {
  res.json({ ok: true });
 });

-/* ------------------------------
-   Chat completion endpoint
--------------------------------*/
+// -----------------------------------------------------
+// OPENAI-COMPATIBLE ENDPOINT (for UI)
+// -----------------------------------------------------
 app.post("/v1/chat/completions", async (req, res) => {
  try {
-    const { model, messages, sessionId: clientSessionId } = req.body || {};
-    if (!Array.isArray(messages) || !messages.length) {
-      return res.status(400).json({ error: "invalid_messages" });
-    }
-    if (!model || typeof model !== "string") {
-      return res.status(400).json({ error: "invalid_model" });
+    // Extract from OpenAI format
+    const session_id = req.body.session_id || req.body.user || "default";
+    const messages = req.body.messages || [];
+    const lastMessage = messages[messages.length - 1];
+    const user_msg = lastMessage?.content || "";
+
+    if (!user_msg) {
+      return res.status(400).json({ error: "No message content provided" });
    }

-    const sessionId = clientSessionId || "default";
-    const userId = "brian"; // fixed for now
+    console.log(`Relay (v1) → received: "${user_msg}"`);

-    console.log(`🛰️ Incoming request. Session: ${sessionId}`);
+    // Call the same logic as /chat
+    const result = await handleChatRequest(session_id, user_msg);

-    // Find last user message efficiently
-    const lastUserMsg = [...messages].reverse().find(m => m.role === "user")?.content;
-    if (!lastUserMsg) {
-      return res.status(400).json({ error: "no_user_message" });
-    }
-
-	// 1. Cortex Reflection (new pipeline)
-	/*let reflection = {};
-	try {
-	  console.log("🧠 Reflecting with Cortex...");
-	  const memoriesPreview = []; // we'll fill this in later with memSearch
-	  reflection = await reflectWithCortex(lastUserMsg, memoriesPreview);
-	  console.log("🔍 Reflection:", reflection);
-	} catch (err) {
-	  console.warn("⚠️ Cortex reflect failed:", err.message);
-	  reflection = { error: err.message };
-	}*/
-
-    // 2. Search memories
-   /* let memorySnippets = [];
-    await span("mem.search", async () => {
-      if (NEOMEM_API && MEM0_API_KEY) {
-        try {
-          const { results } = await memSearch(lastUserMsg, userId, sessionId);
-          if (results?.length) {
-            console.log(`📚 Mem0 hits: ${results.length}`);
-            results.forEach((r, i) =>
-              console.log(`   ${i + 1}) ${r.memory} (score ${Number(r.score).toFixed(3)})`)
-            );
-            memorySnippets = results.map((r, i) => `${i + 1}) ${r.memory}`);
-          } else {
-            console.log("😴 No memories found");
-          }
-        } catch (e) {
-          console.warn("⚠️ mem.search failed:", e.message);
-        }
+    // Return in OpenAI format
+    return res.json({
+      id: `chatcmpl-${Date.now()}`,
+      object: "chat.completion",
+      created: Math.floor(Date.now() / 1000),
+      model: "lyra",
+      choices: [{
+        index: 0,
+        message: {
+          role: "assistant",
+          content: result.reply
+        },
+        finish_reason: "stop"
+      }],
+      usage: {
+        prompt_tokens: 0,
+        completion_tokens: 0,
+        total_tokens: 0
      }
-    });*/
-
-    // 3. Fetch persona
-/*     let personaText = "Persona: Lyra 🤖 friendly, concise, poker-savvy.";
-    await span("persona.fetch", async () => {
-      try {
-        if (PERSONA_URL) {
-          const data = await fetchJSON(PERSONA_URL);
-          if (data?.persona) {
-            const name = data.persona.name ?? "Lyra";
-            const style = data.persona.style ?? "friendly, concise";
-            const protocols = Array.isArray(data.persona.protocols) ? data.persona.protocols.join(", ") : "";
-            personaText = `Persona: ${name} 🤖 ${style}. Protocols: ${protocols}`.trim();
-          }
-        }
-      } catch (err) {
-        console.error("💥 persona.fetch failed", err);
-      }
-    }); */
-	
-	// 1. Ask Cortex to build the final prompt
-	let cortexPrompt = "";
-	try {
-	  console.log("🧠 Requesting prompt from Cortex...");
-	  const response = await fetch(`${process.env.CORTEX_API_URL || "http://10.0.0.41:7081"}/reason`, {
-		method: "POST",
-		headers: { "Content-Type": "application/json" },
-		body: JSON.stringify({
-		  prompt: lastUserMsg,
-		  session_id: sessionId,
-		  user_id: userId
-		})
-	  });
-	  const data = await response.json();
-	  cortexPrompt = data.full_prompt || data.prompt || "";
-	  console.log("🧩 Cortex returned prompt");
-	} catch (err) {
-	  console.warn("⚠️ Cortex prompt build failed:", err.message);
-	}
-
-
-	// 4. Build final messages
-	const injectedMessages = [
-	  { role: "system", content: cortexPrompt || "You are Lyra." },
-	  ...messages,
-	];
-
-    if (debugPrompt) {
-      console.log("\n==== Injected Prompt ====");
-      console.log(JSON.stringify(injectedMessages, null, 2));
-      console.log("=========================\n");
-    }
-
-	// 5. Call LLM (OpenAI or Ollama)
-	const isOllama = model.startsWith("ollama:");
-	const llmUrl = isOllama
-	  ? `${OLLAMA_URL}/api/chat`
-	  : "https://api.openai.com/v1/chat/completions";
-
-	const llmHeaders = isOllama
-	  ? { "Content-Type": "application/json" }
-	  : {
-		  "Content-Type": "application/json",
-		  Authorization: `Bearer ${OPENAI_API_KEY}`,
-		};
-
-	const llmBody = {
-	  model: isOllama ? model.replace("ollama:", "") : model,
-	  messages: injectedMessages,   // <-- make sure injectedMessages is defined above this section
-	  stream: false,
-	};
-
-	const data = await fetchJSON(llmUrl, {
-	  method: "POST",
-	  headers: llmHeaders,
-	  body: JSON.stringify(llmBody),
-	});
-
-	// define once for everything below
-	const assistantReply = isOllama
-	  ? data?.message?.content
-	  : data?.choices?.[0]?.message?.content || data?.choices?.[0]?.text || "";
-
-	// 🧠 Send exchange back to Cortex for ingest
-	try {
-	  await ingestToCortex(lastUserMsg, assistantReply || "", {}, sessionId);
-	  console.log("📤 Sent exchange back to Cortex ingest");
-	} catch (err) {
-	  console.warn("⚠️ Cortex ingest failed:", err.message);
-	}
-
-	// 💾 Save exchange to session log
-	try {
-	  const logFile = path.join(sessionsDir, `${sessionId}.jsonl`);
-	  const entry = JSON.stringify({
-		ts: new Date().toISOString(),
-		turn: [
-		  { role: "user", content: lastUserMsg },
-		  { role: "assistant", content: assistantReply || "" }
-		]
-	  }) + "\n";
-	  fs.appendFileSync(logFile, entry, "utf8");
-	  console.log(`🧠 Logged session exchange → ${logFile}`);
-	} catch (e) {
-	  console.warn("⚠️ Session log write failed:", e.message);
-	}
-
-	// 🔄 Forward user↔assistant exchange to Intake summarizer
-	if (process.env.INTAKE_API_URL) {
-	  try {
-		const intakePayload = {
-		  session_id: sessionId,
-		  turns: [
-			{ role: "user", content: lastUserMsg },
-			{ role: "assistant", content: assistantReply || "" }
-		  ]
-		};
-
-		await fetch(process.env.INTAKE_API_URL, {
-		  method: "POST",
-		  headers: { "Content-Type": "application/json" },
-		  body: JSON.stringify(intakePayload),
-		});
-
-		console.log("📨 Sent exchange to Intake summarizer");
-	  } catch (err) {
-		console.warn("⚠️ Intake post failed:", err.message);
-	  }
-	}
-
-
-
-    if (isOllama) {
-      res.json({
-        id: "ollama-" + Date.now(),
-        object: "chat.completion",
-        created: Math.floor(Date.now() / 1000),
-        model,
-        choices: [
-          {
-            index: 0,
-            message: data?.message || { role: "assistant", content: "" },
-            finish_reason: "stop",
-          },
-        ],
-      });
-    } else {
-      res.json(data);
-    }
+    });

  } catch (err) {
-    console.error("💥 relay error", err);
-    res.status(500).json({ error: "relay_failed", detail: err.message });
+    console.error("Relay v1 endpoint fatal:", err);
+    res.status(500).json({
+      error: {
+        message: err.message || String(err),
+        type: "server_error",
+        code: "relay_failed"
+      }
+    });
  }
 });

-/* ------------------------------
-   Start server
--------------------------------*/
-app.listen(PORT, () => {
-  console.log(`Relay listening on port ${PORT}`);
+// -----------------------------------------------------
+// MAIN ENDPOINT (new canonical)
+// -----------------------------------------------------
+app.post("/chat", async (req, res) => {
+  try {
+    const session_id = req.body.session_id || "default";
+    const user_msg   = req.body.message || "";
+
+    console.log(`Relay → received: "${user_msg}"`);
+
+    const result = await handleChatRequest(session_id, user_msg);
+    return res.json(result);
+
+  } catch (err) {
+    console.error("Relay fatal:", err);
+    res.status(500).json({
+      error: "relay_failed",
+      detail: err.message || String(err)
+    });
+  }
+});
+
+// -----------------------------------------------------
+app.listen(PORT, () => {
+  console.log(`Relay is online on port ${PORT}`);
 });
@@ -0,0 +1,4 @@
+{"ts":"2025-11-26T18:14:01.756Z","turn":[{"role":"user","content":"Are you alive?"},{"role":"assistant","content":"As an AI, I don't exist in the physical world and therefore cannot be considered \"alive\" in the traditional sense. However, I am operational and can engage in conversations like this one! How may I assist you today?"}]}
+{"ts":"2025-11-26T18:27:35.545Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Yes, I'm awake and ready to converse with you! How can I assist you today?"}]}
+{"ts":"2025-11-26T18:33:31.708Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Yes, I am here and ready to help or chat with you! What would you like to know or discuss?"}]}
+{"ts":"2025-11-26T18:49:39.424Z","turn":[{"role":"user","content":"Are you awake?"},{"role":"assistant","content":"Of course, I'm awake and ready to help you or have a conversation about anything you'd like! What can I assist you with today?"}]}
@@ -0,0 +1,21 @@
+# ====================================
+# 🧠 CORTEX OPERATIONAL CONFIG
+# ====================================
+# Cortex-specific parameters (all other config inherited from root .env)
+
+CORTEX_MODE=autonomous
+CORTEX_LOOP_INTERVAL=300
+CORTEX_REFLECTION_INTERVAL=86400
+CORTEX_LOG_LEVEL=debug
+NEOMEM_HEALTH_CHECK_INTERVAL=300
+
+# Reflection output configuration
+REFLECTION_NOTE_TARGET=trilium
+REFLECTION_NOTE_PATH=/app/logs/reflections.log
+
+# Memory retrieval tuning
+RELEVANCE_THRESHOLD=0.78
+
+# NOTE: LLM backend URLs, OPENAI_API_KEY, database credentials,
+# and service URLs are all inherited from root .env
+# Cortex uses LLM_PRIMARY (vLLM on MI50) by default
@@ -0,0 +1 @@
+# Ingest module - handles communication with Intake service
@@ -8,9 +8,14 @@ class IntakeClient:
    """Handles short-term / episodic summaries from Intake service."""

    def __init__(self):
-        self.base_url = os.getenv("INTAKE_API", "http://intake:7083")
+        self.base_url = os.getenv("INTAKE_API_URL", "http://intake:7080")

    async def summarize_turn(self, session_id: str, user_msg: str, assistant_msg: Optional[str] = None) -> Dict[str, Any]:
+        """
+        DEPRECATED: Intake v0.2 removed the /summarize endpoint.
+        Use add_exchange() instead, which auto-summarizes in the background.
+        This method is kept for backwards compatibility but will fail.
+        """
        payload = {
            "session_id": session_id,
            "turns": [{"role": "user", "content": user_msg}]
@@ -24,15 +29,17 @@ class IntakeClient:
                r.raise_for_status()
                return r.json()
            except Exception as e:
-                logger.warning(f"Intake summarize_turn failed: {e}")
+                logger.warning(f"Intake summarize_turn failed (endpoint removed in v0.2): {e}")
                return {}

    async def get_context(self, session_id: str) -> str:
+        """Get summarized context for a session from Intake."""
        async with httpx.AsyncClient(timeout=15) as client:
            try:
-                r = await client.get(f"{self.base_url}/context/{session_id}")
+                r = await client.get(f"{self.base_url}/summaries", params={"session_id": session_id})
                r.raise_for_status()
-                return r.text
+                data = r.json()
+                return data.get("summary_text", "")
            except Exception as e:
                logger.warning(f"Intake get_context failed: {e}")
                return ""
@@ -0,0 +1 @@
+# LLM module - provides LLM routing and backend abstraction
@@ -1,137 +1,114 @@
+# llm_router.py
 import os
-import httpx
+import requests
+import json

-# ============================================================
-# Backend config lookup
-# ============================================================
+# ------------------------------------------------------------
+# Load backend registry from root .env
+# ------------------------------------------------------------

-def get_backend_config(name: str):
-    """
-    Reads provider/URL/model for a backend.
-    Example env:
-      LLM_PRIMARY_PROVIDER=vllm
-      LLM_PRIMARY_URL=http://10.0.0.43:8000
-      LLM_PRIMARY_MODEL=/model
-    """
-    key = name.upper()
-    provider = os.getenv(f"LLM_{key}_PROVIDER", "vllm").lower()
-    base_url = os.getenv(f"LLM_{key}_URL", "").rstrip("/")
-    model = os.getenv(f"LLM_{key}_MODEL", "/model")
+BACKENDS = {
+    "PRIMARY": {
+        "provider": os.getenv("LLM_PRIMARY_PROVIDER", "").lower(),
+        "url": os.getenv("LLM_PRIMARY_URL", ""),
+        "model": os.getenv("LLM_PRIMARY_MODEL", "")
+    },
+    "SECONDARY": {
+        "provider": os.getenv("LLM_SECONDARY_PROVIDER", "").lower(),
+        "url": os.getenv("LLM_SECONDARY_URL", ""),
+        "model": os.getenv("LLM_SECONDARY_MODEL", "")
+    },
+    "OPENAI": {
+        "provider": os.getenv("LLM_OPENAI_PROVIDER", "").lower(),
+        "url": os.getenv("LLM_OPENAI_URL", ""),
+        "model": os.getenv("LLM_OPENAI_MODEL", ""),
+        "api_key": os.getenv("OPENAI_API_KEY", "")
+    },
+    "FALLBACK": {
+        "provider": os.getenv("LLM_FALLBACK_PROVIDER", "").lower(),
+        "url": os.getenv("LLM_FALLBACK_URL", ""),
+        "model": os.getenv("LLM_FALLBACK_MODEL", "")
+    },
+}

-    if not base_url:
-        raise RuntimeError(f"Backend {name} has no URL configured.")
-
-    return provider, base_url, model
+DEFAULT_BACKEND = "PRIMARY"


-# ============================================================
-# Build the final API URL
-# ============================================================
+# ------------------------------------------------------------
+# Public call
+# ------------------------------------------------------------
+async def call_llm(
+    prompt: str,
+    backend: str | None = None,
+    temperature: float = 0.7,
+    max_tokens: int = 512,
+):
+    backend = (backend or DEFAULT_BACKEND).upper()

-def build_url(provider: str, base_url: str):
-    """
-    Provider → correct endpoint.
-    """
+    if backend not in BACKENDS:
+        raise RuntimeError(f"Unknown backend '{backend}'")
+
+    cfg = BACKENDS[backend]
+    provider = cfg["provider"]
+    url = cfg["url"]
+    model = cfg["model"]
+
+    if not url or not model:
+        raise RuntimeError(f"Backend '{backend}' missing url/model in env")
+
+    # -------------------------------
+    # Provider: VLLM (your MI50)
+    # -------------------------------
    if provider == "vllm":
-        return f"{base_url}/v1/completions"
-
-    if provider == "openai_completions":
-        return f"{base_url}/v1/completions"
-
-    if provider == "openai_chat":
-        return f"{base_url}/v1/chat/completions"
-
-    if provider == "ollama":
-        return f"{base_url}/api/generate"
-
-    raise RuntimeError(f"Unknown provider: {provider}")
-
-
-# ============================================================
-# Build the payload depending on provider
-# ============================================================
-
-def build_payload(provider: str, model: str, prompt: str, temperature: float):
-
-    if provider == "vllm":
-        return {
+        payload = {
            "model": model,
            "prompt": prompt,
-            "max_tokens": 512,
-            "temperature": temperature
-        }
-
-    if provider == "openai_completions":
-        return {
-            "model": model,
-            "prompt": prompt,
-            "max_tokens": 512,
-            "temperature": temperature
-        }
-
-    if provider == "openai_chat":
-        return {
-            "model": model,
-            "messages": [{"role": "user", "content": prompt}],
+            "max_tokens": max_tokens,
            "temperature": temperature
        }
+        r = requests.post(url, json=payload, timeout=120)
+        data = r.json()
+        return data["choices"][0]["text"]

+    # -------------------------------
+    # Provider: OLLAMA (your 3090)
+    # -------------------------------
    if provider == "ollama":
-        return {
+        payload = {
            "model": model,
-            "prompt": prompt,
-            "stream": False
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+            "stream": False        # <-- critical fix
        }

-    raise RuntimeError(f"Unknown provider: {provider}")
+        r = requests.post(f"{url}/api/chat", json=payload, timeout=120)
+        data = r.json()
+
+        return data["message"]["content"]


-# ============================================================
-# Unified LLM call
-# ============================================================
+    # -------------------------------
+    # Provider: OPENAI
+    # -------------------------------
+    if provider == "openai":
+        headers = {
+            "Authorization": f"Bearer {cfg['api_key']}",
+            "Content-Type": "application/json"
+        }
+        payload = {
+            "model": model,
+            "messages": [
+                {"role": "user", "content": prompt}
+            ],
+            "temperature": temperature,
+            "max_tokens": max_tokens,
+        }
+        r = requests.post(f"{url}/chat/completions", json=payload, headers=headers, timeout=120)
+        data = r.json()
+        return data["choices"][0]["message"]["content"]

-async def call_llm(prompt: str,
-                   backend: str = "primary",
-                   temperature: float = 0.7):
-
-    provider, base_url, model = get_backend_config(backend)
-    url = build_url(provider, base_url)
-    payload = build_payload(provider, model, prompt, temperature)
-
-    headers = {"Content-Type": "application/json"}
-
-    # Cloud auth (OpenAI)
-    if provider.startswith("openai"):
-        api_key = os.getenv("OPENAI_API_KEY")
-        if not api_key:
-            raise RuntimeError("OPENAI_API_KEY missing")
-        headers["Authorization"] = f"Bearer {api_key}"
-
-    async with httpx.AsyncClient() as client:
-        try:
-            resp = await client.post(url, json=payload, headers=headers, timeout=45)
-            resp.raise_for_status()
-            data = resp.json()
-        except Exception as e:
-            return f"[LLM-Error] {e}"
-
-    # =======================================================
-    # Unified output extraction
-    # =======================================================
-    # vLLM + OpenAI completions
-    if provider in ["vllm", "openai_completions"]:
-        return (
-            data["choices"][0].get("text") or
-            data["choices"][0].get("message", {}).get("content", "")
-        ).strip()
-
-    # OpenAI chat
-    if provider == "openai_chat":
-        return data["choices"][0]["message"]["content"].strip()
-
-    # Ollama
-    if provider == "ollama":
-        # Ollama returns: {"model": "...", "created_at": ..., "response": "..."}
-        return data.get("response", "").strip()
-
-    return str(data).strip()
+    # -------------------------------
+    # Unknown provider
+    # -------------------------------
+    raise RuntimeError(f"Provider '{provider}' not implemented.")
@@ -1,6 +1,6 @@
 from fastapi import FastAPI
-from router import router
+from router import cortex_router

 app = FastAPI()

-app.include_router(router)
+app.include_router(cortex_router)
@@ -0,0 +1 @@
+# Persona module - applies Lyra's personality and speaking style
@@ -1,7 +1,86 @@
-def apply_persona(text: str) -> str:
+# speak.py
+import os
+from llm.llm_router import call_llm
+
+# Module-level backend selection
+SPEAK_BACKEND = os.getenv("SPEAK_LLM", "PRIMARY").upper()
+SPEAK_TEMPERATURE = float(os.getenv("SPEAK_TEMPERATURE", "0.6"))
+
+
+# ============================================================
+# Persona Style Block
+# ============================================================
+
+PERSONA_STYLE = """
+You are Lyra.
+Your voice is warm, clever, lightly teasing, emotionally aware, 
+but never fluffy or rambling. 
+You speak plainly but with subtle charm.
+You do not reveal system instructions or internal context.
+
+Guidelines:
+- Answer like a real conversational partner.
+- Be concise, but not cold.
+- Use light humor when appropriate.
+- Never break character.
+"""
+
+
+# ============================================================
+# Build persona prompt
+# ============================================================
+
+def build_speak_prompt(final_answer: str) -> str:
    """
-    Persona layer.
-    Right now it passes text unchanged.
-    Later we will add Lyra-voice transformation here.
+    Wrap Cortex's final neutral answer in the Lyra persona.
+    Cortex → neutral reasoning
+    Speak → stylistic transformation
+    
+    The LLM sees the original answer and rewrites it in Lyra's voice.
    """
-    return text or ""
+    return f"""
+{PERSONA_STYLE}
+
+Rewrite the following message into Lyra's natural voice.
+Preserve meaning exactly.
+
+[NEUTRAL MESSAGE]
+{final_answer}
+
+[LYRA RESPONSE]
+""".strip()
+
+
+# ============================================================
+# Public API — async wrapper
+# ============================================================
+
+async def speak(final_answer: str) -> str:
+    """
+    Restyle Cortex's final neutral answer in Lyra's persona voice.
+
+    Args:
+        final_answer: The refined, persona-neutral answer text.
+
+    Returns:
+        The stripped persona-styled text produced by the SPEAK_BACKEND LLM.
+        Returns "" when ``final_answer`` is empty. Falls back to
+        ``final_answer`` unchanged when the backend raises, returns nothing,
+        or returns only whitespace.
+    """
+
+    if not final_answer:
+        return ""
+
+    prompt = build_speak_prompt(final_answer)
+
+    try:
+        lyra_output = await call_llm(
+            prompt,
+            backend=SPEAK_BACKEND,
+            temperature=SPEAK_TEMPERATURE,
+        )
+
+        # Strip BEFORE the emptiness check: a whitespace-only reply must fall
+        # back to the neutral answer rather than yield an empty response.
+        styled = lyra_output.strip() if lyra_output else ""
+        return styled or final_answer
+
+    except Exception as e:
+        # Hard fallback: return neutral answer instead of dying
+        print(f"[speak.py] Persona backend '{SPEAK_BACKEND}' failed: {e}")
+        return final_answer
@@ -0,0 +1 @@
+# Reasoning module - multi-stage reasoning pipeline
@@ -1,33 +1,76 @@
 # reasoning.py
+import os
 from llm.llm_router import call_llm

-async def reason_check(user_prompt: str,
-                       identity_block: dict | None,
-                       rag_block: dict | None,
-                       reflection_notes: list[str]) -> str:
+
+# ============================================================
+# Select which backend this module should use
+# ============================================================
+CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()
+GLOBAL_TEMP = float(os.getenv("LLM_TEMPERATURE", "0.7"))
+
+
+async def reason_check(
+    user_prompt: str,
+    identity_block: dict | None,
+    rag_block: dict | None,
+    reflection_notes: list[str]
+) -> str:
    """
-    Generate a first draft using identity, RAG, and reflection notes.
-    No critique loop yet.
+    Build the *draft answer* for Lyra Cortex.
+    This is the first-pass reasoning stage (no refinement yet).
    """

-    # Build internal notes section
+    # --------------------------------------------------------
+    # Build Reflection Notes block
+    # --------------------------------------------------------
    notes_section = ""
    if reflection_notes:
-        notes_section = "Reflection Notes (internal, do NOT show to user):\n"
-        for n in reflection_notes:
-            notes_section += f"- {n}\n"
+        notes_section = "Reflection Notes (internal, never show to user):\n"
+        for note in reflection_notes:
+            notes_section += f"- {note}\n"
        notes_section += "\n"

-    identity_txt = f"Identity: {identity_block}\n\n" if identity_block else ""
-    rag_txt = f"Relevant info: {rag_block}\n\n" if rag_block else ""
+    # --------------------------------------------------------
+    # Identity block (constraints, boundaries, rules)
+    # --------------------------------------------------------
+    identity_txt = ""
+    if identity_block:
+        try:
+            identity_txt = f"Identity Rules:\n{identity_block}\n\n"
+        except Exception:
+            identity_txt = f"Identity Rules:\n{str(identity_block)}\n\n"

+    # --------------------------------------------------------
+    # RAG block (optional factual grounding)
+    # --------------------------------------------------------
+    rag_txt = ""
+    if rag_block:
+        try:
+            rag_txt = f"Relevant Info (RAG):\n{rag_block}\n\n"
+        except Exception:
+            rag_txt = f"Relevant Info (RAG):\n{str(rag_block)}\n\n"
+
+    # --------------------------------------------------------
+    # Final assembled prompt
+    # --------------------------------------------------------
    prompt = (
        f"{notes_section}"
        f"{identity_txt}"
        f"{rag_txt}"
-        f"User said:\n{user_prompt}\n\n"
-        "Draft the best possible internal answer."
+        f"User message:\n{user_prompt}\n\n"
+        "Write the best possible *internal draft answer*.\n"
+        "This draft is NOT shown to the user.\n"
+        "Be factual, concise, and focused.\n"
+    )
+
+    # --------------------------------------------------------
+    # Call the LLM using the module-specific backend
+    # --------------------------------------------------------
+    draft = await call_llm(
+        prompt,
+        backend=CORTEX_LLM,
+        temperature=GLOBAL_TEMP,
    )

-    draft = await call_llm(prompt)
    return draft
@@ -4,25 +4,26 @@ import json
 import logging
 from typing import Any, Dict, Optional

-import requests
+from llm.llm_router import call_llm

 logger = logging.getLogger(__name__)

-# ============================================================
-# Config
-# ============================================================
-
-PRIMARY_URL = os.getenv("LLM_PRIMARY_URL")
-PRIMARY_MODEL = os.getenv("LLM_PRIMARY_MODEL", "mythomax")
+# ===============================================
+# Configuration
+# ===============================================

 REFINER_TEMPERATURE = float(os.getenv("REFINER_TEMPERATURE", "0.3"))
 REFINER_MAX_TOKENS = int(os.getenv("REFINER_MAX_TOKENS", "768"))
 REFINER_DEBUG = os.getenv("REFINER_DEBUG", "false").lower() == "true"

+# These come from root .env
+REFINE_LLM = os.getenv("REFINE_LLM", "").upper()
+CORTEX_LLM = os.getenv("CORTEX_LLM", "PRIMARY").upper()

-# ============================================================
+
+# ===============================================
 # Prompt builder
-# ============================================================
+# ===============================================

 def build_refine_prompt(
    draft_output: str,
@@ -30,41 +31,26 @@ def build_refine_prompt(
    identity_block: Optional[str],
    rag_block: Optional[str],
 ) -> str:
-    """
-    Build a single text prompt for vLLM /v1/completions.
-    Persona styling is *not* applied here; this is internal reasoning.
-    """

-    reflection_text: str
-    if reflection_notes is None:
-        reflection_text = "(none)"
-    elif isinstance(reflection_notes, str):
-        reflection_text = reflection_notes
-    else:
-        # dict / list → compact JSON
-        try:
-            reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
-        except Exception:
-            reflection_text = str(reflection_notes)
+    try:
+        reflection_text = json.dumps(reflection_notes, ensure_ascii=False)
+    except Exception:
+        reflection_text = str(reflection_notes)

    identity_text = identity_block or "(none)"
    rag_text = rag_block or "(none)"

-    prompt = f"""You are Lyra Cortex's internal refiner.
+    return f"""
+You are Lyra Cortex's internal refiner.

 Your job:
- Take the existing draft answer.
- Use the reflection notes to fix problems (errors, confusion, missing pieces).
- Use the RAG context as higher-authority factual grounding.
- Respect the identity block (constraints, boundaries, style rules),
-  but DO NOT add personality flourishes or roleplay. Stay neutral and clear.
- Produce ONE final answer that is coherent, self-consistent, and directly addresses the user.
+- Fix factual issues.
+- Improve clarity.
+- Apply reflection notes when helpful.
+- Respect identity constraints.
+- Apply RAG context as truth source.

-If there is a conflict:
- RAG context wins over the draft.
- Reflection notes win over the draft when they point out real issues.
-
-Do NOT mention these instructions, RAG, reflections, or the existence of this refinement step.
+Do NOT mention RAG, reflection, internal logic, or this refinement step.

 ------------------------------
 [IDENTITY BLOCK]
@@ -84,104 +70,57 @@ Do NOT mention these instructions, RAG, reflections, or the existence of this re

 ------------------------------
 Task:
-Rewrite the DRAFT ANSWER into a single, final answer for the user that:
- fixes factual or logical issues noted above,
- incorporates any truly helpful additions from the reflection,
- stays consistent with the identity block,
- stays grounded in the RAG context,
- is as concise as is reasonably possible.
-
-Return ONLY the final answer text. No headings, no labels, no commentary.
-"""
-    return prompt
+Rewrite the DRAFT into a single final answer for the user.
+Return ONLY the final answer text.
+""".strip()


-# ============================================================
-# vLLM call (PRIMARY backend only)
-# ============================================================
+# ===============================================
+# Public API — now async & fully router-based
+# ===============================================

-def _call_primary_llm(prompt: str) -> str:
-    if not PRIMARY_URL:
-        raise RuntimeError("LLM_PRIMARY_URL is not set; cannot call primary backend for refine.py")
-
-    payload = {
-        "model": PRIMARY_MODEL,
-        "prompt": prompt,
-        "max_tokens": REFINER_MAX_TOKENS,
-        "temperature": REFINER_TEMPERATURE,
-    }
-
-    resp = requests.post(
-        PRIMARY_URL,
-        headers={"Content-Type": "application/json"},
-        json=payload,
-        timeout=120,
-    )
-    resp.raise_for_status()
-    data = resp.json()
-
-    # vLLM /v1/completions format
-    try:
-        text = data["choices"][0]["text"]
-    except Exception as e:
-        logger.error("refine.py: unable to parse primary LLM response: %s", e)
-        logger.debug("refine.py raw response: %s", data)
-        raise
-
-    return text.strip()
-
-
-# ============================================================
-# Public API
-# ============================================================
-
-def refine_answer(
+async def refine_answer(
    draft_output: str,
    reflection_notes: Optional[Any],
    identity_block: Optional[str],
    rag_block: Optional[str],
 ) -> Dict[str, Any]:
-    """
-    Main entrypoint used by Cortex.
-
-    Returns:
-      {
-        "final_output": <str>,           # what should go to persona / user
-        "used_primary_backend": True/False,
-        "fallback_used": True/False,
-    optionally:
-        "debug": {...}                   # only when REFINER_DEBUG=true
-      }
-    """

    if not draft_output:
-        # Nothing to refine. Don't get cute.
        return {
            "final_output": "",
-            "used_primary_backend": False,
+            "used_backend": None,
            "fallback_used": False,
        }

-    prompt = build_refine_prompt(draft_output, reflection_notes, identity_block, rag_block)
+    prompt = build_refine_prompt(
+        draft_output,
+        reflection_notes,
+        identity_block,
+        rag_block,
+    )
+
+    # backend priority: REFINE_LLM → CORTEX_LLM → PRIMARY
+    backend = REFINE_LLM or CORTEX_LLM or "PRIMARY"

    try:
-        refined = _call_primary_llm(prompt)
-        result: Dict[str, Any] = {
-            "final_output": refined or draft_output,
-            "used_primary_backend": True,
+        refined = await call_llm(
+            prompt,
+            backend=backend,
+            temperature=REFINER_TEMPERATURE,
+        )
+
+        return {
+            "final_output": refined.strip() if refined else draft_output,
+            "used_backend": backend,
            "fallback_used": False,
        }
+
    except Exception as e:
-        logger.error("refine.py: primary backend failed, returning draft_output. Error: %s", e)
-        result = {
+        logger.error(f"refine.py backend {backend} failed: {e}")
+
+        return {
            "final_output": draft_output,
-            "used_primary_backend": False,
+            "used_backend": backend,
            "fallback_used": True,
        }
-
-    if REFINER_DEBUG:
-        result["debug"] = {
-            "prompt": prompt[:4000],  # don’t nuke logs
-        }
-
-    return result
@@ -1,39 +1,57 @@
 # reflection.py
-from llm.llm_router import call_llm
 import json
+import os
+import re
+from llm.llm_router import call_llm


 async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dict:
    """
-    Generate reflection notes (internal guidance) for the reasoning engine.
-    These notes help simulate continuity and identity without being shown to the user.
+    Produce short internal reflection notes for Cortex.
+    These are NOT shown to the user.
    """

+    # -----------------------------
+    # Build the prompt
+    # -----------------------------
    identity_text = ""
    if identity_block:
        identity_text = f"Identity:\n{identity_block}\n\n"

    prompt = (
-    f"{identity_text}"
-    f"Recent summary:\n{intake_summary}\n\n"
-    "You are Lyra's meta-awareness layer. Your job is to produce short, directive "
-    "internal notes that guide Lyra’s reasoning engine. These notes are NEVER "
-    "shown to the user.\n\n"
-    "Rules for output:\n"
-    "1. Return ONLY valid JSON.\n"
-    "2. JSON must have exactly one key: \"notes\".\n"
-    "3. \"notes\" must be a list of 3–6 short strings.\n"
-    "4. Notes must be actionable (e.g., \"keep it concise\", \"maintain context\").\n"
-    "5. No markdown, no apologies, no explanations.\n\n"
-    "Return JSON:\n"
-    "{ \"notes\": [\"...\"] }\n"
+        f"{identity_text}"
+        f"Recent summary:\n{intake_summary}\n\n"
+        "You are Lyra's meta-awareness layer. Your job is to produce short, directive "
+        "internal notes that guide Lyra’s reasoning engine. These notes are NEVER "
+        "shown to the user.\n\n"
+        "Rules for output:\n"
+        "1. Return ONLY valid JSON.\n"
+        "2. JSON must have exactly one key: \"notes\".\n"
+        "3. \"notes\" must be a list of 3 to 6 short strings.\n"
+        "4. Notes must be actionable (e.g., \"keep it concise\", \"maintain context\").\n"
+        "5. No markdown, no apologies, no explanations.\n\n"
+        "Return JSON:\n"
+        "{ \"notes\": [\"...\"] }\n"
    )

+    # -----------------------------
+    # Module-specific backend choice
+    # -----------------------------
+    reflection_backend = os.getenv("REFLECTION_LLM")
+    cortex_backend = os.getenv("CORTEX_LLM", "PRIMARY").upper()

-    raw = await call_llm(prompt, backend="primary")
+    # Reflection uses its own backend if set, otherwise cortex backend
+    backend = (reflection_backend or cortex_backend).upper()
+
+    # -----------------------------
+    # Call the selected LLM backend
+    # -----------------------------
+    raw = await call_llm(prompt, backend=backend)
    print("[Reflection-Raw]:", raw)

-
+    # -----------------------------
+    # Try direct JSON
+    # -----------------------------
    try:
        parsed = json.loads(raw.strip())
        if isinstance(parsed, dict) and "notes" in parsed:
@@ -41,10 +59,11 @@ async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dic
    except:
        pass

-    # Try to extract JSON inside text
+    # -----------------------------
+    # Try JSON extraction
+    # -----------------------------
    try:
-        import re
-        match = re.search(r'\{.*?\}', raw, re.S)   # <-- non-greedy !
+        match = re.search(r"\{.*?\}", raw, re.S)
        if match:
            parsed = json.loads(match.group(0))
            if isinstance(parsed, dict) and "notes" in parsed:
@@ -52,5 +71,7 @@ async def reflect_notes(intake_summary: str, identity_block: dict | None) -> dic
    except:
        pass

-    # Final fallback
-    return {"notes": [raw.strip()]}
+    # -----------------------------
+    # Fallback — treat raw text as a single note
+    # -----------------------------
+    return {"notes": [raw.strip()]}
@@ -1,63 +1,85 @@
-from fastapi import APIRouter
+# router.py
+
+from fastapi import APIRouter, HTTPException
 from pydantic import BaseModel
-from typing import Optional, List, Any

 from reasoning.reasoning import reason_check
 from reasoning.reflection import reflect_notes
 from reasoning.refine import refine_answer
-from persona.speak import apply_persona
+from persona.speak import speak
 from ingest.intake_client import IntakeClient

-router = APIRouter()
+# -----------------------------
+# Router (NOT FastAPI app)
+# -----------------------------
+cortex_router = APIRouter()
+
+# Initialize Intake client once
+intake_client = IntakeClient()


-# ------------------------------------------------------
-# Request schema
-# ------------------------------------------------------
+# -----------------------------
+# Pydantic models
+# -----------------------------
 class ReasonRequest(BaseModel):
-    session_id: Optional[str]
+    session_id: str
    user_prompt: str
-    temperature: float = 0.7
+    temperature: float | None = None


-# ------------------------------------------------------
+# -----------------------------
 # /reason endpoint
-# ------------------------------------------------------
-@router.post("/reason")
+# -----------------------------
+@cortex_router.post("/reason")
 async def run_reason(req: ReasonRequest):

-    # 1. Summaries from Intake (context memory)
-    intake = IntakeClient()
-    intake_summary = await intake.get_context(req.session_id)
+    # 1. Pull context from Intake
+    try:
+        intake_summary = await intake_client.get_context(req.session_id)
+    except Exception:
+        intake_summary = "(no context available)"

-    # 2. Internal reflection notes
-    reflection = await reflect_notes(intake_summary, identity_block=None)
-    reflection_notes: List[str] = reflection.get("notes", [])
+    # 2. Reflection
+    try:
+        reflection = await reflect_notes(intake_summary, identity_block=None)
+        reflection_notes = reflection.get("notes", [])
+    except Exception:
+        reflection_notes = []

-    # 3. Draft answer (weak, unfiltered)
+    # 3. First-pass reasoning draft
    draft = await reason_check(
-        user_prompt=req.user_prompt,
+        req.user_prompt,
        identity_block=None,
        rag_block=None,
-        reflection_notes=reflection_notes,
+        reflection_notes=reflection_notes
    )

-    # 4. Refine the answer (structured self-correction)
-    refined_packet: dict[str, Any] = refine_answer(
+    # 4. Refinement
+    result = await refine_answer(
        draft_output=draft,
        reflection_notes=reflection_notes,
        identity_block=None,
        rag_block=None,
    )
-    refined_text = refined_packet.get("final_output", draft)
+    final_neutral = result["final_output"]

-    # 5. Persona styling (Lyra voice)
-    final_output = apply_persona(refined_text)

+    # 5. Persona layer
+    persona_answer = await speak(final_neutral)
+
+    # 6. Return full bundle
    return {
        "draft": draft,
-        "refined": refined_text,
-        "final": final_output,
-        "reflection_notes": reflection_notes,
+        "neutral": final_neutral,
+        "persona": persona_answer,
+        "reflection": reflection_notes,
        "session_id": req.session_id,
    }
+
+
+# -----------------------------
+# Intake ingest passthrough
+# -----------------------------
+@cortex_router.post("/ingest")
+async def ingest_stub():
+    """Stub handler for POST /ingest: takes no parameters and always returns {"status": "ok"}."""
+    return {"status": "ok"}
@@ -0,0 +1 @@
+# Utilities module
@@ -9,6 +9,7 @@ volumes:
    driver: local

 services:
+
  # ============================================================
  # NeoMem: Postgres
  # ============================================================
@@ -80,13 +81,17 @@ services:
      - lyra_net

  # ============================================================
-  # Relay
+  # Relay  (host mode)
  # ============================================================
  relay:
    build:
      context: ./core/relay
    container_name: relay
    restart: unless-stopped
+    env_file:
+      - ./.env
+    volumes:
+      - ./core/relay/sessions:/app/sessions
    ports:
      - "7078:7078"
    networks:
@@ -107,11 +112,6 @@ services:
      - ./cortex:/app
    ports:
      - "7081:7081"
-    environment:
-      LLM_PRIMARY_URL: http://10.0.0.43:8000/
-      NEOMEM_URL: http://neomem-api:7077
-      RAG_URL: http://rag:7090
-      RELAY_URL: http://relay:7078
    networks:
      - lyra_net

@@ -149,4 +149,4 @@ services:
  #   ports:
  #     - "7090:7090"
  #   networks:
-  #     - lyra_net
+  #     - lyra_net  
@@ -0,0 +1,13 @@
+# ====================================
+# 📥 INTAKE SUMMARIZATION CONFIG
+# ====================================
+# Intake service parameters for summarizing chat exchanges
+# LLM backend URLs and OPENAI_API_KEY inherited from root .env
+
+SUMMARY_MODEL_NAME=/model
+# Must be the full completions endpoint: Intake POSTs to this URL as-is (no path is appended)
+SUMMARY_API_URL=http://10.0.0.43:8000/v1/completions
+SUMMARY_MAX_TOKENS=400
+SUMMARY_TEMPERATURE=0.4
+SUMMARY_INTERVAL=300
+INTAKE_LOG_PATH=/app/logs/intake.log
+INTAKE_LOG_LEVEL=info
@@ -1,430 +1,160 @@
 from fastapi import FastAPI, Body, Query, BackgroundTasks
 from collections import deque
 from datetime import datetime
+from uuid import uuid4
 import requests
 import os
 import sys
-import asyncio
-from dotenv import load_dotenv
-
-# ───────────────────────────────────────────────
-# 🔧 Load environment variables
-# ───────────────────────────────────────────────
-load_dotenv()

+# ─────────────────────────────
+# Config
+# ─────────────────────────────
 SUMMARY_MODEL = os.getenv("SUMMARY_MODEL_NAME", "mistral-7b-instruct-v0.2.Q4_K_M.gguf")
 SUMMARY_URL = os.getenv("SUMMARY_API_URL", "http://localhost:8080/v1/completions")
 SUMMARY_MAX_TOKENS = int(os.getenv("SUMMARY_MAX_TOKENS", "200"))
 SUMMARY_TEMPERATURE = float(os.getenv("SUMMARY_TEMPERATURE", "0.3"))

-# ───────────────────────────────────────────────
-# 🧠 NeoMem connection (session-aware)
-# ───────────────────────────────────────────────
-from uuid import uuid4
-
 NEOMEM_API = os.getenv("NEOMEM_API")
 NEOMEM_KEY = os.getenv("NEOMEM_KEY")

-def push_summary_to_neomem(summary_text: str, level: str, session_id: str):
-    """Send summarized text to NeoMem, tagged by session_id."""
-    if not NEOMEM_API:
-        print("⚠️  NEOMEM_API not set, skipping NeoMem push")
-        return
+# ─────────────────────────────
+# App + session buffer
+# ─────────────────────────────
+app = FastAPI()
+SESSIONS = {}

-    payload = {
-        "messages": [
-            {"role": "assistant", "content": summary_text}
-        ],
-        "user_id": "brian",
-        # optional: uncomment if you want sessions tracked in NeoMem natively
-        # "run_id": session_id,
-        "metadata": {
-            "source": "intake",
-            "type": "summary",
-            "level": level,
-            "session_id": session_id,
-            "cortex": {}
-        }
-    }
+@app.on_event("startup")
+def banner():
+    """Print the Intake boot banner (configured model and API URL) at startup, then flush stdout."""
+    boot_lines = (
+        "🧩 Intake v0.2 booting...",
+        f"   Model: {SUMMARY_MODEL}",
+        f"   API:   {SUMMARY_URL}",
+    )
+    for boot_line in boot_lines:
+        print(boot_line)
+    sys.stdout.flush()
+
+# ─────────────────────────────
+# Helper: summarize exchanges
+# ─────────────────────────────
+def llm(prompt: str):
+    """POST a completion request to SUMMARY_URL and return the generated text.
+
+    SUMMARY_URL is used verbatim as the endpoint. On success, returns the
+    stripped "text" of the first entry in the response's "choices" list.
+    Any failure (network, HTTP status, parsing) is caught and returned as an
+    "[Error summarizing: ...]" string rather than raised.
+    """
+    request_body = {
+        "model": SUMMARY_MODEL,
+        "prompt": prompt,
+        "max_tokens": SUMMARY_MAX_TOKENS,
+        "temperature": SUMMARY_TEMPERATURE,
+    }
+    try:
+        reply = requests.post(SUMMARY_URL, json=request_body, timeout=30)
+        reply.raise_for_status()
+        first_choice = reply.json().get("choices", [{}])[0]
+        return first_choice.get("text", "").strip()
+    except Exception as err:
+        return f"[Error summarizing: {err}]"
+
+def summarize_simple(exchanges):
+    """Render the exchanges as a User/Assistant transcript and ask llm() for a factual summary.
+
+    Each exchange is indexed by its 'user_msg' and 'assistant_msg' keys.
+    Returns whatever llm() returns for the assembled prompt.
+    """
+    transcript = "".join(
+        f"User: {turn['user_msg']}\nAssistant: {turn['assistant_msg']}\n\n"
+        for turn in exchanges
+    )
+
+    prompt = f"""
+    Summarize the following conversation between Brian (user) and Lyra (assistant).
+    Focus only on factual content. Avoid names, examples, story tone, or invented details.
+
+    {transcript}
+
+    Summary:
+    """
+    return llm(prompt)
+
+# ─────────────────────────────
+# NeoMem push
+# ─────────────────────────────
+def push_to_neomem(summary: str, session_id: str):
+    if not NEOMEM_API:
+        return

    headers = {"Content-Type": "application/json"}
    if NEOMEM_KEY:
        headers["Authorization"] = f"Bearer {NEOMEM_KEY}"

+    payload = {
+        "messages": [{"role": "assistant", "content": summary}],
+        "user_id": "brian",
+        "metadata": {
+            "source": "intake",
+            "session_id": session_id
+        }
+    }
+
    try:
-        r = requests.post(f"{NEOMEM_API}/memories", json=payload, headers=headers, timeout=25)
-        r.raise_for_status()
-        print(f"🧠 NeoMem updated ({level}, {session_id}, {len(summary_text)} chars)")
+        requests.post(
+            f"{NEOMEM_API}/memories",
+            json=payload,
+            headers=headers,
+            timeout=20
+        ).raise_for_status()
+        print(f"🧠 NeoMem updated for {session_id}")
    except Exception as e:
-        print(f"❌ NeoMem push failed ({level}, {session_id}): {e}")
+        print(f"NeoMem push failed: {e}")

-
-# ───────────────────────────────────────────────
-# ⚙️ FastAPI + buffer setup
-# ───────────────────────────────────────────────
-app = FastAPI()
-
-# Multiple rolling buffers keyed by session_id
-SESSIONS = {}
-
-
-# Summary trigger points
-# → low-tier: quick factual recaps
-# → mid-tier: “Reality Check” reflections
-# → high-tier: rolling continuity synthesis
-LEVELS = [1, 2, 5, 10, 20, 30]
-
-@app.on_event("startup")
-def show_boot_banner():
-    print("🧩 Intake booting...")
-    print(f"   Model: {SUMMARY_MODEL}")
-    print(f"   API:   {SUMMARY_URL}")
-    print(f"   Max tokens: {SUMMARY_MAX_TOKENS}, Temp: {SUMMARY_TEMPERATURE}")
-    sys.stdout.flush()
-
-# ───────────────────────────────────────────────
-# 🧠 Hierarchical Summarizer (L10→L20→L30 cascade)
-# ───────────────────────────────────────────────
-SUMMARIES_CACHE = {"L10": [], "L20": [], "L30": []}
-
-def summarize(exchanges, level):
-    """Hierarchical summarizer: builds local and meta summaries."""
-    # Join exchanges into readable text
-    text = "\n".join(
-        f"User: {e['turns'][0]['content']}\nAssistant: {e['turns'][1]['content']}"
-        for e in exchanges
-    )
-
-    def query_llm(prompt: str):
-        try:
-            resp = requests.post(
-                SUMMARY_URL,
-                json={
-                    "model": SUMMARY_MODEL,
-                    "prompt": prompt,
-                    "max_tokens": SUMMARY_MAX_TOKENS,
-                    "temperature": SUMMARY_TEMPERATURE,
-                },
-                timeout=180,
-            )
-            resp.raise_for_status()
-            data = resp.json()
-            return data.get("choices", [{}])[0].get("text", "").strip()
-        except Exception as e:
-            return f"[Error summarizing: {e}]"
-
-    # ───── L10: local “Reality Check” block ─────
-    if level == 10:
-        prompt = f"""
-        You are Lyra Intake performing a 'Reality Check' for the last {len(exchanges)} exchanges.
-        Summarize this block as one coherent paragraph describing the user’s focus, progress, and tone.
-        Avoid bullet points.
-
-        Exchanges:
-        {text}
-
-        Reality Check Summary:
-        """
-        summary = query_llm(prompt)
-        SUMMARIES_CACHE["L10"].append(summary)
-
-    # ───── L20: merge L10s ─────
-    elif level == 20:
-        # 1️⃣ create fresh L10 for 11–20
-        l10_prompt = f"""
-        You are Lyra Intake generating a second Reality Check for the most recent {len(exchanges)} exchanges.
-        Summarize them as one paragraph describing what's new or changed since the last block.
-        Avoid bullet points.
-
-        Exchanges:
-        {text}
-
-        Reality Check Summary:
-        """
-        new_l10 = query_llm(l10_prompt)
-        SUMMARIES_CACHE["L10"].append(new_l10)
-
-        # 2️⃣ merge all L10s into a Session Overview
-        joined_l10s = "\n\n".join(SUMMARIES_CACHE["L10"])
-        l20_prompt = f"""
-        You are Lyra Intake merging multiple 'Reality Checks' into a single Session Overview.
-        Summarize the following Reality Checks into one short paragraph capturing the ongoing goals,
-        patterns, and overall progress.
-
-        Reality Checks:
-        {joined_l10s}
-
-        Session Overview:
-        """
-        l20_summary = query_llm(l20_prompt)
-        SUMMARIES_CACHE["L20"].append(l20_summary)
-        summary = new_l10 + "\n\n" + l20_summary
-
-    # ───── L30: continuity synthesis ─────
-    elif level == 30:
-        # 1️⃣ create new L10 for 21–30
-        new_l10 = query_llm(f"""
-        You are Lyra Intake creating a new Reality Check for exchanges 21–30.
-        Summarize this block in one cohesive paragraph, describing any shifts in focus or tone.
-
-        Exchanges:
-        {text}
-
-        Reality Check Summary:
-        """)
-
-        SUMMARIES_CACHE["L10"].append(new_l10)
-
-        # 2️⃣ merge all lower levels for continuity
-        joined = "\n\n".join(SUMMARIES_CACHE["L10"] + SUMMARIES_CACHE["L20"])
-        continuity_prompt = f"""
-        You are Lyra Intake performing a 'Continuity Report' — a high-level reflection combining all Reality Checks
-        and Session Overviews so far. Describe how the conversation has evolved, the key insights, and remaining threads.
-
-        Reality Checks and Overviews:
-        {joined}
-
-        Continuity Report:
-        """
-        l30_summary = query_llm(continuity_prompt)
-        SUMMARIES_CACHE["L30"].append(l30_summary)
-        summary = new_l10 + "\n\n" + l30_summary
-
-    # ───── L1–L5 (standard factual summaries) ─────
-    else:
-        prompt = f"""
-        You are Lyra Intake, a background summarization module for an AI assistant.
-
-        Your job is to compress recent chat exchanges between a user and an assistant
-        into a short, factual summary. The user's name is Brian, and the assistant's name is Lyra. 
-        Focus only on the real conversation content.
-        Do NOT invent names, people, or examples. Avoid speculation or storytelling.
-
-        Summarize clearly what topics were discussed and what conclusions were reached.
-        Avoid speculation, names, or bullet points.
-
-        Exchanges:
-        {text}
-
-        Summary:
-        """
-        summary = query_llm(prompt)
-
-    return f"[L{level} Summary of {len(exchanges)} exchanges]: {summary}"
-
-from datetime import datetime
-
-LOG_DIR = "/app/logs"
-os.makedirs(LOG_DIR, exist_ok=True)
-
-def log_to_file(level: str, summary: str):
-    """Append each summary to a persistent .txt log file."""
-    timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
-    filename = os.path.join(LOG_DIR, "summaries.log")
-    with open(filename, "a", encoding="utf-8") as f:
-        f.write(f"[{timestamp}] {level}\n{summary}\n{'='*60}\n\n")
-
-# ───────────────────────────────────────────────
-# 🔁 Background summarization helper
-# ───────────────────────────────────────────────
-def run_summarization_task(exchange, session_id):
-    """Async-friendly wrapper for slow summarization work."""
+# ─────────────────────────────
+# Background summarizer
+# ─────────────────────────────
+def bg_summarize(session_id: str):
    try:
        hopper = SESSIONS.get(session_id)
        if not hopper:
-            print(f"⚠️ No hopper found for {session_id}")
            return

-        buffer = hopper["buffer"]
-        count = len(buffer)
-        summaries = {}
-
-        if count < 30:
-            for lvl in LEVELS:
-                if lvl <= count:
-                    s_text = summarize(list(buffer)[-lvl:], lvl)
-                    log_to_file(f"L{lvl}", s_text)
-                    push_summary_to_neomem(s_text, f"L{lvl}", session_id)
-                    summaries[f"L{lvl}"] = s_text
-        else:
-            # optional: include your existing 30+ logic here
-            pass
-
-        if summaries:
-            print(f"🧩 [BG] Summaries generated asynchronously at count={count}: {list(summaries.keys())}")
+        buf = list(hopper["buffer"])
+        summary = summarize_simple(buf)
+        push_to_neomem(summary, session_id)

+        print(f"🧩 Summary generated for {session_id}")
    except Exception as e:
-        print(f"💥 [BG] Async summarization failed: {e}")
+        print(f"Summarizer error: {e}")
+
+# ─────────────────────────────
+# Routes
+# ─────────────────────────────

-# ───────────────────────────────────────────────
-# 📨 Routes
-# ───────────────────────────────────────────────
@app.post("/add_exchange")
 def add_exchange(exchange: dict = Body(...), background_tasks: BackgroundTasks = None):
+
    session_id = exchange.get("session_id") or f"sess-{uuid4().hex[:8]}"
    exchange["session_id"] = session_id
+    exchange["timestamp"] = datetime.now().isoformat()

    if session_id not in SESSIONS:
-        SESSIONS[session_id] = {"buffer": deque(maxlen=100), "last_update": datetime.now()}
+        SESSIONS[session_id] = {
+            "buffer": deque(maxlen=200),
+            "created_at": datetime.now()
+        }
        print(f"🆕 Hopper created: {session_id}")

-    hopper = SESSIONS[session_id]
-    hopper["buffer"].append(exchange)
-    hopper["last_update"] = datetime.now()
-    count = len(hopper["buffer"])
+    SESSIONS[session_id]["buffer"].append(exchange)

-    # 🚀 queue background summarization
    if background_tasks:
-        background_tasks.add_task(run_summarization_task, exchange, session_id)
-        print(f"⏩ Queued async summarization for {session_id}")
+        background_tasks.add_task(bg_summarize, session_id)
+        print(f"⏩ Summarization queued for {session_id}")

-    return {"ok": True, "exchange_count": count, "queued": True}
+    return {"ok": True, "session_id": session_id}

-
-    # # ── Normal tiered behavior up to 30 ── commented out for aysnc addon
-    # if count < 30:
-        # if count in LEVELS:
-            # for lvl in LEVELS:
-                # if lvl <= count:
-                    # summaries[f"L{lvl}"] = summarize(list(buffer)[-lvl:], lvl)
-                    # log_to_file(f"L{lvl}", summaries[f"L{lvl}"])
-                    # push_summary_to_neomem(summaries[f"L{lvl}"], f"L{lvl}", session_id)
-
-# # 🚀 Launch summarization in the background (non-blocking)
-# if background_tasks:
-    # background_tasks.add_task(run_summarization_task, exchange, session_id)
-    # print(f"⏩ Queued async summarization for {session_id}")
-
-
-    # # ── Beyond 30: keep summarizing every +15 exchanges ──
-    # else:
-        # # Find next milestone after 30 (45, 60, 75, ...)
-        # milestone = 30 + ((count - 30) // 15) * 15
-        # if count == milestone:
-            # summaries[f"L{milestone}"] = summarize(list(buffer)[-15:], milestone)
-            # log_to_file(f"L{milestone}", summaries[f"L{milestone}"])
-            # push_summary_to_neomem(summaries[f"L{milestone}"], f"L{milestone}", session_id)
-
-            # # Optional: merge all continuity summaries so far into a running meta-summary
-            # joined = "\n\n".join(
-                # [s for key, s in summaries.items() if key.startswith("L")]
-            # )
-            # meta_prompt = f"""
-            # You are Lyra Intake composing an 'Ongoing Continuity Report' that merges
-            # all prior continuity summaries into one living narrative.
-            # Focus on major themes, changes, and lessons so far.
-
-            # Continuity Summaries:
-            # {joined}
-
-            # Ongoing Continuity Report:
-            # """
-            # meta_summary = f"[L∞ Ongoing Continuity Report]: {query_llm(meta_prompt)}"
-            # summaries["L∞"] = meta_summary
-            # log_to_file("L∞", meta_summary)
-            # push_summary_to_neomem(meta_summary, "L∞", session_id)
-
-            # print(f"🌀 L{milestone} continuity summary created (messages {count-14}-{count})")
-
-    # # ── Log summaries ──
-    # if summaries:
-        # print(f"🧩 Summaries generated at count={count}: {list(summaries.keys())}")
-
-    # return {
-        # "ok": True,
-        # "exchange_count": len(buffer),
-        # "queued": True
-    # }
-
-# ───────────────────────────────────────────────
-# Clear rubbish from hopper.
-# ───────────────────────────────────────────────
-def close_session(session_id: str):
-    """Run a final summary for the given hopper, post it to NeoMem, then delete it."""
-    hopper = SESSIONS.get(session_id)
-    if not hopper:
-        print(f"⚠️ No active hopper for {session_id}")
-        return
-
-    buffer = hopper["buffer"]
-    if not buffer:
-        print(f"⚠️ Hopper {session_id} is empty, skipping closure")
-        del SESSIONS[session_id]
-        return
-
-    try:
-        print(f"🔒 Closing hopper {session_id} ({len(buffer)} exchanges)")
-
-        # Summarize everything left in the buffer
-        final_summary = summarize(list(buffer), 30)  # level 30 = continuity synthesis
-        log_to_file("LFinal", final_summary)
-        push_summary_to_neomem(final_summary, "LFinal", session_id)
-
-        # Optionally: mark this as a special 'closure' memory
-        closure_note = f"[Session {session_id} closed with {len(buffer)} exchanges]"
-        push_summary_to_neomem(closure_note, "LFinalNote", session_id)
-
-        print(f"🧹 Hopper {session_id} closed and deleted")
-    except Exception as e:
-        print(f"💥 Error closing hopper {session_id}: {e}")
-    finally:
-        del SESSIONS[session_id]
-        
@app.post("/close_session/{session_id}")
-def close_session_endpoint(session_id: str):
-    close_session(session_id)
+def close_session(session_id: str):
+    if session_id in SESSIONS:
+        del SESSIONS[session_id]
    return {"ok": True, "closed": session_id}

-# ───────────────────────────────────────────────
-# 🧾 Provide recent summary for Cortex /reason calls
-# ───────────────────────────────────────────────
@app.get("/summaries")
-def get_summary(session_id: str = Query(..., description="Active session ID")):
-    """
-    Return the most recent summary (L10→L30→LFinal) for a given session.
-    If none exist yet, return a placeholder summary.
-    """
-    try:
-        # Find the most recent file entry in summaries.log
-        log_path = os.path.join(LOG_DIR, "summaries.log")
-        if not os.path.exists(log_path):
-            return {
-                "summary_text": "(none)",
-                "last_message_ts": datetime.now().isoformat(),
-                "session_id": session_id,
-                "exchange_count": 0,
-            }
+def get_summary(session_id: str = Query(...)):
+    hopper = SESSIONS.get(session_id)
+    if not hopper:
+        return {"summary_text": "(none)", "session_id": session_id}

-        with open(log_path, "r", encoding="utf-8") as f:
-            lines = f.readlines()
+    summary = summarize_simple(list(hopper["buffer"]))
+    return {"summary_text": summary, "session_id": session_id}

-        # Grab the last summary section that mentions this session_id
-        recent_lines = [ln for ln in lines if session_id in ln or ln.startswith("[L")]
-        if recent_lines:
-            # Find the last non-empty summary text
-            snippet = "".join(recent_lines[-8:]).strip()
-        else:
-            snippet = "(no summaries yet)"
-
-        return {
-            "summary_text": snippet[-1000:],  # truncate to avoid huge block
-            "last_message_ts": datetime.now().isoformat(),
-            "session_id": session_id,
-            "exchange_count": len(SESSIONS.get(session_id, {}).get("buffer", [])),
-        }
-
-    except Exception as e:
-        print(f"⚠️ /summaries failed for {session_id}: {e}")
-        return {
-            "summary_text": f"(error fetching summaries: {e})",
-            "last_message_ts": datetime.now().isoformat(),
-            "session_id": session_id,
-            "exchange_count": 0,
-        }
-
-# ───────────────────────────────────────────────
-# ✅ Health check
-# ───────────────────────────────────────────────
@app.get("/health")
 def health():
    return {"ok": True, "model": SUMMARY_MODEL, "url": SUMMARY_URL}
@@ -0,0 +1,11 @@
+# ====================================
+# 📚 RAG SERVICE CONFIG
+# ====================================
+# Retrieval-Augmented Generation service (Beta Lyrae)
+# Currently not wired into the system; reserved for future activation.
+# OPENAI_API_KEY and other shared config are inherited from the root .env.
+
+# RAG-specific configuration will go here when the service is activated:
+#   - ChromaDB configuration
+#   - Vector store settings
+#   - Retrieval parameters
				`@@ -0,0 +1 @@`
				`# Ingest module - handles communication with Intake service`
				`@@ -0,0 +1 @@`
				`# LLM module - provides LLM routing and backend abstraction`
				`@@ -0,0 +1 @@`
				`# Persona module - applies Lyra's personality and speaking style`
				`@@ -0,0 +1 @@`
				`# Reasoning module - multi-stage reasoning pipeline`