Update to v0.9.1 #1

CHANGELOG.md (1304 lines changed)
File diff suppressed because it is too large.

@@ -4,4 +4,6 @@ COPY requirements.txt .
 RUN pip install -r requirements.txt
 COPY . .
 EXPOSE 7081
+# NOTE: Running with single worker to maintain SESSIONS global state in Intake.
+# If scaling to multiple workers, migrate SESSIONS to Redis or shared storage.
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7081"]
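The single-worker constraint exists because `SESSIONS` lives in process memory; with multiple Uvicorn workers each process would hold its own copy. Below is a minimal sketch of the Redis-backed alternative the note suggests. It is hypothetical and not part of this PR: the Redis hostname, key layout, and helper names are invented, and it assumes a reachable Redis instance plus the `redis` Python package.

```python
# Hypothetical sketch only (not part of this PR): a Redis-backed rolling buffer
# that multiple Uvicorn workers could share instead of the in-process SESSIONS dict.
import json
import redis

r = redis.Redis(host="redis", port=6379, decode_responses=True)  # assumed Redis service name

def add_exchange_shared(session_id: str, exchange: dict, maxlen: int = 200) -> None:
    key = f"intake:{session_id}:buffer"
    r.rpush(key, json.dumps(exchange))   # append the newest exchange
    r.ltrim(key, -maxlen, -1)            # keep only the last `maxlen` items (deque-like)

def get_buffer_shared(session_id: str) -> list[dict]:
    key = f"intake:{session_id}:buffer"
    return [json.loads(item) for item in r.lrange(key, 0, -1)]
```
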
@@ -84,6 +84,7 @@ def _init_session(session_id: str) -> Dict[str, Any]:
         "mood": "neutral",          # Future: mood tracking
         "active_project": None,     # Future: project context
         "message_count": 0,
+        "message_history": [],
     }


@@ -275,6 +276,13 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
     state["last_user_message"] = user_prompt
     state["last_timestamp"] = now
     state["message_count"] += 1
+
+    # Save user turn to history
+    state["message_history"].append({
+        "user": user_prompt,
+        "assistant": ""  # assistant reply filled later by update_last_assistant_message()
+    })
+

     # F. Assemble unified context
     context_state = {
@@ -311,20 +319,27 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
 # -----------------------------
 def update_last_assistant_message(session_id: str, message: str) -> None:
     """
-    Update session state with assistant's response.
-
-    Called by router.py after persona layer completes.
-
-    Args:
-        session_id: Session identifier
-        message: Assistant's final response text
+    Update session state with assistant's response and complete
+    the last turn inside message_history.
     """
-    if session_id in SESSION_STATE:
-        SESSION_STATE[session_id]["last_assistant_message"] = message
-        SESSION_STATE[session_id]["last_timestamp"] = datetime.now()
-        logger.debug(f"Updated assistant message for session {session_id}")
-    else:
-        logger.warning(f"Attempted to update non-existent session: {session_id}")
+    session = SESSION_STATE.get(session_id)
+    if not session:
+        logger.warning(f"Attempted to update non-existent session: {session_id}")
+        return
+
+    # Update last assistant message + timestamp
+    session["last_assistant_message"] = message
+    session["last_timestamp"] = datetime.now()
+
+    # Fill in assistant reply for the most recent turn
+    history = session.get("message_history", [])
+    if history:
+        # history entry already contains {"user": "...", "assistant": "...?"}
+        history[-1]["assistant"] = message
+
+    if VERBOSE_DEBUG:
+        logger.debug(f"Updated assistant message for session {session_id}")


 def get_session_state(session_id: str) -> Optional[Dict[str, Any]]:
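For reference, a turn in `message_history` is written in two phases: `collect_context()` appends the user half with an empty `assistant` field, and `update_last_assistant_message()` fills it in once the persona layer has replied. A minimal illustration (plain Python, with invented values and a reduced state dict):

```python
# Illustration only: mirrors the two-phase turn write shown in the hunks above.
state = {"message_history": [], "message_count": 0}

# Phase 1: collect_context() records the user turn with a placeholder reply
state["message_history"].append({"user": "ping", "assistant": ""})
state["message_count"] += 1

# Phase 2: update_last_assistant_message() completes the same turn later
state["message_history"][-1]["assistant"] = "pong"

assert state["message_history"] == [{"user": "ping", "assistant": "pong"}]
```
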

cortex/intake/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+"""
+Intake module - short-term memory summarization.
+
+Runs inside the Cortex container as a pure Python module.
+No standalone API server - called internally by Cortex.
+"""
+
+from .intake import (
+    SESSIONS,
+    add_exchange_internal,
+    summarize_context,
+)
+
+__all__ = [
+    "SESSIONS",
+    "add_exchange_internal",
+    "summarize_context",
+]
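Since Intake no longer runs its own API server, other Cortex modules import it directly, as `router.py` does further down. A sketch of that internal call path (the exchange keys match what `add_exchange_internal()` reads; the session id is a placeholder):

```python
# Sketch of the internal (no HTTP) call path into the Intake module.
from intake import SESSIONS, add_exchange_internal, summarize_context

add_exchange_internal({
    "session_id": "dev",
    "user_msg": "ping",
    "assistant_msg": "pong",
})

print(len(SESSIONS["dev"]["buffer"]))  # rolling buffer of recent exchanges

# summarize_context is async, so callers await it (e.g. inside a FastAPI handler):
# summaries = await summarize_context("dev", list(SESSIONS["dev"]["buffer"]))
```
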
@@ -1,18 +1,29 @@
 import os
+import json
 from datetime import datetime
 from typing import List, Dict, Any, TYPE_CHECKING
 from collections import deque
+from llm.llm_router import call_llm
+
+# -------------------------------------------------------------------
+# Global Short-Term Memory (new Intake)
+# -------------------------------------------------------------------
+SESSIONS: dict[str, dict] = {}  # session_id → { buffer: deque, created_at: timestamp }
+
+# Diagnostic: Verify module loads only once
+print(f"[Intake Module Init] SESSIONS object id: {id(SESSIONS)}, module: {__name__}")
+
+# L10 / L20 history lives here too
+L10_HISTORY: Dict[str, list[str]] = {}
+L20_HISTORY: Dict[str, list[str]] = {}
+
+from llm.llm_router import call_llm  # Use Cortex's shared LLM router

 if TYPE_CHECKING:
+    # Only for type hints — do NOT redefine SESSIONS here
     from collections import deque as _deque
-    SESSIONS: dict
-    L10_HISTORY: dict
-    L20_HISTORY: dict
     def bg_summarize(session_id: str) -> None: ...

-from llm.llm_router import call_llm  # use Cortex's shared router
-
 # ─────────────────────────────
 # Config
 # ─────────────────────────────
@@ -220,20 +231,24 @@ def push_to_neomem(summary: str, session_id: str, level: str) -> None:
 # ─────────────────────────────
 # Main entrypoint for Cortex
 # ─────────────────────────────
-async def summarize_context(
-    session_id: str,
-    exchanges: List[Dict[str, Any]],
-) -> Dict[str, Any]:
+async def summarize_context(session_id: str, exchanges: list[dict]):
     """
-    Main API used by Cortex:
-
-        summaries = await summarize_context(session_id, exchanges)
-
-    `exchanges` should be the recent conversation buffer for that session.
+    Internal summarizer that uses Cortex's LLM router.
+    Produces L1 / L5 / L10 / L20 / L30 summaries.
+
+    Args:
+        session_id: The conversation/session ID
+        exchanges: A list of {"user_msg": ..., "assistant_msg": ..., "timestamp": ...}
     """
-    buf = list(exchanges)
-    if not buf:
+    # Build raw conversation text
+    convo_lines = []
+    for ex in exchanges:
+        convo_lines.append(f"User: {ex.get('user_msg','')}")
+        convo_lines.append(f"Assistant: {ex.get('assistant_msg','')}")
+    convo_text = "\n".join(convo_lines)
+
+    if not convo_text.strip():
         return {
             "session_id": session_id,
             "exchange_count": 0,
@@ -242,31 +257,72 @@ async def summarize_context(
             "L10": "",
             "L20": "",
             "L30": "",
-            "last_updated": None,
+            "last_updated": datetime.now().isoformat()
         }

-    # Base levels
-    L1 = await summarize_L1(buf)
-    L5 = await summarize_L5(buf)
-    L10 = await summarize_L10(session_id, buf)
-    L20 = await summarize_L20(session_id)
-    L30 = await summarize_L30(session_id)
-
-    # Push the "interesting" tiers into NeoMem
-    push_to_neomem(L10, session_id, "L10")
-    push_to_neomem(L20, session_id, "L20")
-    push_to_neomem(L30, session_id, "L30")
-
-    return {
-        "session_id": session_id,
-        "exchange_count": len(buf),
-        "L1": L1,
-        "L5": L5,
-        "L10": L10,
-        "L20": L20,
-        "L30": L30,
-        "last_updated": datetime.now().isoformat(),
-    }
+    # Prompt the LLM (internal — no HTTP)
+    prompt = f"""
+Summarize the conversation below into multiple compression levels.
+
+Conversation:
+----------------
+{convo_text}
+----------------
+
+Output strictly in JSON with keys:
+L1 → ultra short summary (1–2 sentences max)
+L5 → short summary
+L10 → medium summary
+L20 → detailed overview
+L30 → full detailed summary
+
+JSON only. No text outside JSON.
+"""
+
+    try:
+        llm_response = await call_llm(
+            prompt,
+            temperature=0.2
+        )
+
+        # LLM should return JSON, parse it
+        summary = json.loads(llm_response)
+
+        return {
+            "session_id": session_id,
+            "exchange_count": len(exchanges),
+            "L1": summary.get("L1", ""),
+            "L5": summary.get("L5", ""),
+            "L10": summary.get("L10", ""),
+            "L20": summary.get("L20", ""),
+            "L30": summary.get("L30", ""),
+            "last_updated": datetime.now().isoformat()
+        }
+
+    except Exception as e:
+        return {
+            "session_id": session_id,
+            "exchange_count": len(exchanges),
+            "L1": f"[Error summarizing: {str(e)}]",
+            "L5": "",
+            "L10": "",
+            "L20": "",
+            "L30": "",
+            "last_updated": datetime.now().isoformat()
+        }
+
+# ─────────────────────────────────
+# Background summarization stub
+# ─────────────────────────────────
+def bg_summarize(session_id: str):
+    """
+    Placeholder for background summarization.
+    Actual summarization happens during /reason via summarize_context().
+
+    This function exists to prevent NameError when called from add_exchange_internal().
+    """
+    print(f"[Intake] Exchange added for {session_id}. Will summarize on next /reason call.")
+
 # ─────────────────────────────
 # Internal entrypoint for Cortex
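A usage sketch of the new summarizer, showing the exchange shape its docstring describes and the dict it returns. The session id and messages are placeholders, and actually running it requires the shared LLM router's backend to be reachable.

```python
# Usage sketch: exchange keys follow the docstring above; values are placeholders.
import asyncio
from intake import summarize_context

exchanges = [
    {"user_msg": "What did we decide about the MI50 box?", "assistant_msg": "We run vLLM in CT 201."},
    {"user_msg": "And the port?", "assistant_msg": "8000, OpenAI-compatible."},
]

summaries = asyncio.run(summarize_context("dev", exchanges))
print(summaries["exchange_count"])  # 2
print(summaries["L1"])              # ultra-short summary, or an "[Error summarizing: ...]" string
```
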
@@ -283,15 +339,23 @@ def add_exchange_internal(exchange: dict):

     exchange["timestamp"] = datetime.now().isoformat()

+    # DEBUG: Verify we're using the module-level SESSIONS
+    print(f"[add_exchange_internal] SESSIONS object id: {id(SESSIONS)}, current sessions: {list(SESSIONS.keys())}")
+
     # Ensure session exists
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {
             "buffer": deque(maxlen=200),
             "created_at": datetime.now()
         }
+        print(f"[add_exchange_internal] Created new session: {session_id}")
+    else:
+        print(f"[add_exchange_internal] Using existing session: {session_id}")

     # Append exchange into the rolling buffer
     SESSIONS[session_id]["buffer"].append(exchange)
+    buffer_len = len(SESSIONS[session_id]["buffer"])
+    print(f"[add_exchange_internal] Added exchange to {session_id}, buffer now has {buffer_len} items")

     # Trigger summarization immediately
     try:

cortex/router.py (106 lines changed)
@@ -197,26 +197,110 @@ class IngestPayload(BaseModel):
     user_msg: str
     assistant_msg: str


 @cortex_router.post("/ingest")
-async def ingest_stub():
-    # Intake is internal now — this endpoint is only for compatibility.
-    return {"status": "ok", "note": "intake is internal now"}
-
-    # 1. Update Cortex session state
-    update_last_assistant_message(payload.session_id, payload.assistant_msg)
-
-    # 2. Feed Intake internally (no HTTP)
+async def ingest(payload: IngestPayload):
+    """
+    Receives (session_id, user_msg, assistant_msg) from Relay
+    and pushes directly into Intake's in-memory buffer.
+
+    Uses lenient error handling - always returns success to avoid
+    breaking the chat pipeline.
+    """
+    try:
+        # 1. Update Cortex session state
+        update_last_assistant_message(payload.session_id, payload.assistant_msg)
+    except Exception as e:
+        logger.warning(f"[INGEST] Failed to update session state: {e}")
+        # Continue anyway (lenient mode)
+
     try:
+        # 2. Feed Intake internally (no HTTP)
         add_exchange_internal({
             "session_id": payload.session_id,
             "user_msg": payload.user_msg,
             "assistant_msg": payload.assistant_msg,
         })

         logger.debug(f"[INGEST] Added exchange to Intake for {payload.session_id}")
     except Exception as e:
-        logger.warning(f"[INGEST] Failed to add exchange to Intake: {e}")
-
-    return {"ok": True, "session_id": payload.session_id}
+        logger.warning(f"[INGEST] Failed to add to Intake: {e}")
+        # Continue anyway (lenient mode)
+
+    # Always return success (user requirement: never fail chat pipeline)
+    return {
+        "status": "ok",
+        "session_id": payload.session_id
+    }
+
+
+# -----------------------------
+# Debug endpoint: summarized context
+# -----------------------------
+@cortex_router.get("/debug/summary")
+async def debug_summary(session_id: str):
+    """
+    Diagnostic endpoint that runs Intake's summarize_context() for a session.
+
+    Shows exactly what L1/L5/L10/L20/L30 summaries would look like
+    inside the actual Uvicorn worker, using the real SESSIONS buffer.
+    """
+    from intake.intake import SESSIONS, summarize_context
+
+    # Validate session
+    session = SESSIONS.get(session_id)
+    if not session:
+        return {"error": "session not found", "session_id": session_id}
+
+    # Convert deque into the structure summarize_context expects
+    buffer = session["buffer"]
+    exchanges = [
+        {
+            "user_msg": ex.get("user_msg", ""),
+            "assistant_msg": ex.get("assistant_msg", ""),
+        }
+        for ex in buffer
+    ]
+
+    # 🔥 CRITICAL FIX — summarize_context is async
+    summary = await summarize_context(session_id, exchanges)
+
+    return {
+        "session_id": session_id,
+        "buffer_size": len(buffer),
+        "exchanges_preview": exchanges[-5:],  # last 5 items
+        "summary": summary
+    }
+
+
+# -----------------------------
+# Debug endpoint for SESSIONS
+# -----------------------------
+@cortex_router.get("/debug/sessions")
+async def debug_sessions():
+    """
+    Diagnostic endpoint to inspect SESSIONS from within the running Uvicorn worker.
+    This shows the actual state of the in-memory SESSIONS dict.
+    """
+    from intake.intake import SESSIONS
+
+    sessions_data = {}
+    for session_id, session_info in SESSIONS.items():
+        buffer = session_info["buffer"]
+        sessions_data[session_id] = {
+            "created_at": session_info["created_at"].isoformat(),
+            "buffer_size": len(buffer),
+            "buffer_maxlen": buffer.maxlen,
+            "recent_exchanges": [
+                {
+                    "user_msg": ex.get("user_msg", "")[:100],
+                    "assistant_msg": ex.get("assistant_msg", "")[:100],
+                    "timestamp": ex.get("timestamp", "")
+                }
+                for ex in list(buffer)[-5:]  # Last 5 exchanges
+            ]
+        }
+
+    return {
+        "sessions_object_id": id(SESSIONS),
+        "total_sessions": len(SESSIONS),
+        "sessions": sessions_data
+    }
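For manual testing, the new endpoints can be exercised over HTTP once Cortex is up. This is a sketch only: it assumes the router is mounted at the service root on the Dockerfile's port 7081 and that the Cortex host is reachable at the address used elsewhere in this repo's docs; adjust both to your deployment.

```python
# Manual smoke test for the new endpoints (hypothetical host/port and empty router prefix).
import requests

BASE = "http://10.0.0.41:7081"  # assumed Cortex host:port

# Feed one exchange into Intake via /ingest (mirrors what Relay sends)
r = requests.post(f"{BASE}/ingest", json={
    "session_id": "dev",
    "user_msg": "ping",
    "assistant_msg": "pong",
})
print(r.json())  # expected: {"status": "ok", "session_id": "dev"}

# Inspect the in-memory buffer and run the summarizer
print(requests.get(f"{BASE}/debug/sessions").json())
print(requests.get(f"{BASE}/debug/summary", params={"session_id": "dev"}).json())
```
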

vllm-mi50.md (deleted, 416 lines)
@@ -1,416 +0,0 @@
Here you go — a **clean, polished, ready-to-drop-into-Trilium or GitHub** Markdown file.

If you want, I can also auto-generate a matching `/docs/vllm-mi50/` folder structure and a mini-ToC.

---

# **MI50 + vLLM + Proxmox LXC Setup Guide**

### *End-to-End Field Manual for gfx906 LLM Serving*

**Version:** 1.0
**Last updated:** 2025-11-17

---

## **📌 Overview**

This guide documents how to run a **vLLM OpenAI-compatible server** on an
**AMD Instinct MI50 (gfx906)** inside a **Proxmox LXC container**, expose it over LAN,
and wire it into **Project Lyra's Cortex reasoning layer**.

This file is long, specific, and intentionally leaves *nothing* out so you never have to rediscover ROCm pain rituals again.

---

## **1. What This Stack Looks Like**

```
Proxmox Host
 ├─ AMD Instinct MI50 (gfx906)
 ├─ AMDGPU + ROCm stack
 └─ LXC Container (CT 201: cortex-gpu)
     ├─ Ubuntu 24.04
     ├─ Docker + docker compose
     ├─ vLLM inside Docker (nalanzeyu/vllm-gfx906)
     ├─ GPU passthrough via /dev/kfd + /dev/dri + PCI bind
     └─ vLLM API exposed on :8000

Lyra Cortex (VM/Server)
 └─ LLM_PRIMARY_URL=http://10.0.0.43:8000
```

---

## **2. Proxmox Host — GPU Setup**

### **2.1 Confirm MI50 exists**

```bash
lspci -nn | grep -i 'vega\|instinct\|radeon'
```

You should see something like:

```
0a:00.0 Display controller: AMD Instinct MI50 (gfx906)
```

### **2.2 Load AMDGPU driver**

The main pitfall after **any host reboot**.

```bash
modprobe amdgpu
```

If you skip this, the LXC container won't see the GPU.

---

## **3. LXC Container Configuration (CT 201)**

The container ID is **201**.
Config file is at:

```
/etc/pve/lxc/201.conf
```

### **3.1 Working 201.conf**

Paste this *exact* version:

```ini
arch: amd64
cores: 4
hostname: cortex-gpu
memory: 16384
swap: 512
ostype: ubuntu
onboot: 1
startup: order=2,up=10,down=10
net0: name=eth0,bridge=vmbr0,hwaddr=BC:24:11:C6:3E:88,ip=dhcp,type=veth
rootfs: local-lvm:vm-201-disk-0,size=200G
unprivileged: 0

# Docker in LXC requires this
features: keyctl=1,nesting=1
lxc.apparmor.profile: unconfined
lxc.cap.drop:

# --- GPU passthrough for ROCm (MI50) ---
lxc.mount.entry: /dev/kfd dev/kfd none bind,optional,create=file,mode=0666
lxc.mount.entry: /dev/dri dev/dri none bind,optional,create=dir
lxc.mount.entry: /sys/class/drm sys/class/drm none bind,ro,optional,create=dir
lxc.mount.entry: /opt/rocm /opt/rocm none bind,ro,optional,create=dir

# Bind the MI50 PCI device
lxc.mount.entry: /dev/bus/pci/0000:0a:00.0 dev/bus/pci/0000:0a:00.0 none bind,optional,create=file

# Allow GPU-related character devices
lxc.cgroup2.devices.allow: c 226:* rwm
lxc.cgroup2.devices.allow: c 29:* rwm
lxc.cgroup2.devices.allow: c 189:* rwm
lxc.cgroup2.devices.allow: c 238:* rwm
lxc.cgroup2.devices.allow: c 241:* rwm
lxc.cgroup2.devices.allow: c 242:* rwm
lxc.cgroup2.devices.allow: c 243:* rwm
lxc.cgroup2.devices.allow: c 244:* rwm
lxc.cgroup2.devices.allow: c 245:* rwm
lxc.cgroup2.devices.allow: c 246:* rwm
lxc.cgroup2.devices.allow: c 247:* rwm
lxc.cgroup2.devices.allow: c 248:* rwm
lxc.cgroup2.devices.allow: c 249:* rwm
lxc.cgroup2.devices.allow: c 250:* rwm
lxc.cgroup2.devices.allow: c 510:0 rwm
```

### **3.2 Restart sequence**

```bash
pct stop 201
modprobe amdgpu
pct start 201
pct enter 201
```

---

## **4. Inside CT 201 — Verifying ROCm + GPU Visibility**

### **4.1 Check device nodes**

```bash
ls -l /dev/kfd
ls -l /dev/dri
ls -l /opt/rocm
```

All must exist.

### **4.2 Validate GPU via rocminfo**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
```

You need to see:

```
gfx906
```

If you see **nothing**, the GPU isn't passed through — restart and re-check the host steps.

---

## **5. Install Docker in the LXC (Ubuntu 24.04)**

This container runs Docker inside LXC (nesting enabled).

```bash
apt update
apt install -y ca-certificates curl gnupg

install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
  | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
chmod a+r /etc/apt/keyrings/docker.gpg

echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
  https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo $VERSION_CODENAME) stable" \
  > /etc/apt/sources.list.d/docker.list

apt update
apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```

Check:

```bash
docker --version
docker compose version
```

---

## **6. Running vLLM Inside CT 201 via Docker**

### **6.1 Create directory**

```bash
mkdir -p /root/vllm
cd /root/vllm
```

### **6.2 docker-compose.yml**

Save this exact file as `/root/vllm/docker-compose.yml`:

```yaml
version: "3.9"

services:
  vllm-mi50:
    image: nalanzeyu/vllm-gfx906:latest
    container_name: vllm-mi50
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      VLLM_ROLE: "APIServer"
      VLLM_MODEL: "/model"
      VLLM_LOGGING_LEVEL: "INFO"
    command: >
      vllm serve /model
      --host 0.0.0.0
      --port 8000
      --dtype float16
      --max-model-len 4096
      --api-type openai
    devices:
      - "/dev/kfd:/dev/kfd"
      - "/dev/dri:/dev/dri"
    volumes:
      - /opt/rocm:/opt/rocm:ro
```

### **6.3 Start vLLM**

```bash
docker compose up -d
docker compose logs -f
```

When healthy, you'll see:

```
(APIServer) Application startup complete.
```

and periodic throughput logs.

---

## **7. Test vLLM API**

### **7.1 From Proxmox host**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

Should respond like:

```json
{"choices":[{"text":"-pong"}]}
```

### **7.2 From Cortex machine**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping from cortex","max_tokens":5}'
```

---

## **8. Wiring into Lyra Cortex**

In `cortex` container's `docker-compose.yml`:

```yaml
environment:
  LLM_PRIMARY_URL: http://10.0.0.43:8000
```

Not `/v1/completions` because the router appends that automatically.

In `cortex/.env`:

```env
LLM_FORCE_BACKEND=primary
LLM_MODEL=/model
```

Test:

```bash
curl -X POST http://10.0.0.41:7081/reason \
  -H "Content-Type: application/json" \
  -d '{"prompt":"test vllm","session_id":"dev"}'
```

If you get a meaningful response: **Cortex → vLLM is online**.

---

## **9. Common Failure Modes (And Fixes)**

### **9.1 “Failed to infer device type”**

vLLM cannot see any ROCm devices.

Fix:

```bash
# On host
modprobe amdgpu
pct stop 201
pct start 201
# In container
/opt/rocm/bin/rocminfo | grep -i gfx
docker compose up -d
```

### **9.2 GPU disappears after reboot**

Same fix:

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **9.3 Invalid image name**

If you see pull errors:

```
pull access denied for nalanzeuy...
```

Use:

```
image: nalanzeyu/vllm-gfx906
```

### **9.4 Double `/v1` in URL**

Ensure:

```
LLM_PRIMARY_URL=http://10.0.0.43:8000
```

Router appends `/v1/completions`.

---

## **10. Daily / Reboot Ritual**

### **On Proxmox host**

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **Inside CT 201**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
cd /root/vllm
docker compose up -d
docker compose logs -f
```

### **Test API**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

---

## **11. Summary**

You now have:

* **MI50 (gfx906)** correctly passed into LXC
* **ROCm** inside the container via bind mounts
* **vLLM** running inside Docker in the LXC
* **OpenAI-compatible API** on port 8000
* **Lyra Cortex** using it automatically as primary backend

This is a complete, reproducible setup that survives reboots (with the modprobe ritual) and allows you to upgrade/replace models anytime.

---

If you want, I can generate:

* A `/docs/vllm-mi50/README.md`
* A "vLLM Gotchas" document
* A quick-reference cheat sheet
* A troubleshooting decision tree

Just say the word.