cortex rework continued.
CHANGELOG.md (1302 lines changed)
File diff suppressed because it is too large.
@@ -4,4 +4,6 @@ COPY requirements.txt .
RUN pip install -r requirements.txt
COPY . .
EXPOSE 7081
# NOTE: Running with single worker to maintain SESSIONS global state in Intake.
# If scaling to multiple workers, migrate SESSIONS to Redis or shared storage.
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7081"]
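The single-worker constraint exists because SESSIONS lives in process memory, so a second Uvicorn worker would see its own empty copy. A minimal sketch of the kind of shared-storage replacement the NOTE suggests, assuming the `redis` Python client and a `REDIS_URL` environment variable (names are hypothetical, not part of this commit):

```python
# Hypothetical sketch only: per-session rolling buffers in Redis so multiple
# workers can share Intake state. Assumes the `redis` package is installed
# and REDIS_URL points at a reachable instance.
import json
import os

import redis

r = redis.Redis.from_url(os.getenv("REDIS_URL", "redis://localhost:6379/0"))

def add_exchange_shared(session_id: str, exchange: dict, maxlen: int = 200) -> None:
    """Append one exchange to the session's list, trimming to the newest maxlen items."""
    key = f"intake:{session_id}:buffer"
    r.rpush(key, json.dumps(exchange))
    r.ltrim(key, -maxlen, -1)

def get_buffer_shared(session_id: str) -> list[dict]:
    """Return the session's buffer as a list of exchange dicts."""
    key = f"intake:{session_id}:buffer"
    return [json.loads(item) for item in r.lrange(key, 0, -1)]
```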
@@ -84,6 +84,7 @@ def _init_session(session_id: str) -> Dict[str, Any]:
        "mood": "neutral",          # Future: mood tracking
        "active_project": None,     # Future: project context
        "message_count": 0,
        "message_history": [],
    }
@@ -275,6 +276,13 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
    state["last_user_message"] = user_prompt
    state["last_timestamp"] = now
    state["message_count"] += 1

    # Save user turn to history
    state["message_history"].append({
        "user": user_prompt,
        "assistant": ""  # assistant reply filled later by update_last_assistant_message()
    })

    # F. Assemble unified context
    context_state = {
@@ -311,20 +319,27 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
# -----------------------------
def update_last_assistant_message(session_id: str, message: str) -> None:
    """
    Update session state with assistant's response.

    Called by router.py after persona layer completes.

    Args:
        session_id: Session identifier
        message: Assistant's final response text
    Update session state with assistant's response and complete
    the last turn inside message_history.
    """
    if session_id in SESSION_STATE:
        SESSION_STATE[session_id]["last_assistant_message"] = message
        SESSION_STATE[session_id]["last_timestamp"] = datetime.now()
        logger.debug(f"Updated assistant message for session {session_id}")
    else:
    session = SESSION_STATE.get(session_id)
    if not session:
        logger.warning(f"Attempted to update non-existent session: {session_id}")
        return

    # Update last assistant message + timestamp
    session["last_assistant_message"] = message
    session["last_timestamp"] = datetime.now()

    # Fill in assistant reply for the most recent turn
    history = session.get("message_history", [])
    if history:
        # history entry already contains {"user": "...", "assistant": "...?"}
        history[-1]["assistant"] = message

    if VERBOSE_DEBUG:
        logger.debug(f"Updated assistant message for session {session_id}")


def get_session_state(session_id: str) -> Optional[Dict[str, Any]]:
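Taken together with collect_context() above, the intended per-turn call order looks roughly like the sketch below (hypothetical driver code, not part of this commit; it assumes it lives alongside the session-state helpers shown above):

```python
# Hypothetical driver code (illustration only) showing the intended
# per-turn call order for the session-state helpers defined above.
import asyncio

async def handle_turn(session_id: str, user_prompt: str, persona_reply: str) -> None:
    # 1. Inbound: collect_context() records the user turn; the history entry's
    #    "assistant" field is left empty for now.
    await collect_context(session_id, user_prompt)

    # 2. After the persona layer replies: complete the most recent turn.
    update_last_assistant_message(session_id, persona_reply)

    # 3. The last history entry now holds both sides of the exchange.
    session = get_session_state(session_id)
    if session:
        print(session["message_history"][-1])

asyncio.run(handle_turn("dev", "hello", "hi there"))
```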
cortex/intake/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
"""
Intake module - short-term memory summarization.

Runs inside the Cortex container as a pure Python module.
No standalone API server - called internally by Cortex.
"""

from .intake import (
    SESSIONS,
    add_exchange_internal,
    summarize_context,
)

__all__ = [
    "SESSIONS",
    "add_exchange_internal",
    "summarize_context",
]
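Because Intake is now a plain package rather than a service, Cortex-side code can use it with a normal import. A minimal sketch of that usage (illustrative only, mirroring the names exported above):

```python
# Illustrative only: Cortex calling Intake in-process (no HTTP hop),
# using the names re-exported by cortex/intake/__init__.py.
from intake import SESSIONS, add_exchange_internal, summarize_context

async def after_reply(session_id: str, user_msg: str, assistant_msg: str) -> dict:
    # Buffer the finished exchange in Intake's in-memory SESSIONS store.
    add_exchange_internal({
        "session_id": session_id,
        "user_msg": user_msg,
        "assistant_msg": assistant_msg,
    })
    # Ask Intake for tiered summaries of everything buffered so far.
    exchanges = list(SESSIONS[session_id]["buffer"])
    return await summarize_context(session_id, exchanges)
```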
cortex/intake/intake.py
@@ -1,18 +1,29 @@
import os
import json
from datetime import datetime
from typing import List, Dict, Any, TYPE_CHECKING
from collections import deque
from llm.llm_router import call_llm

# -------------------------------------------------------------------
# Global Short-Term Memory (new Intake)
# -------------------------------------------------------------------
SESSIONS: dict[str, dict] = {}   # session_id → { buffer: deque, created_at: timestamp }

# Diagnostic: Verify module loads only once
print(f"[Intake Module Init] SESSIONS object id: {id(SESSIONS)}, module: {__name__}")

# L10 / L20 history lives here too
L10_HISTORY: Dict[str, list[str]] = {}
L20_HISTORY: Dict[str, list[str]] = {}

from llm.llm_router import call_llm  # Use Cortex's shared LLM router

if TYPE_CHECKING:
    # Only for type hints — do NOT redefine SESSIONS here
    from collections import deque as _deque
    SESSIONS: dict
    L10_HISTORY: dict
    L20_HISTORY: dict
    def bg_summarize(session_id: str) -> None: ...

from llm.llm_router import call_llm  # use Cortex's shared router

# ─────────────────────────────
# Config
# ─────────────────────────────
@@ -220,20 +231,24 @@ def push_to_neomem(summary: str, session_id: str, level: str) -> None:
# ─────────────────────────────
# Main entrypoint for Cortex
# ─────────────────────────────

async def summarize_context(
    session_id: str,
    exchanges: List[Dict[str, Any]],
) -> Dict[str, Any]:
async def summarize_context(session_id: str, exchanges: list[dict]):
    """
    Main API used by Cortex:

        summaries = await summarize_context(session_id, exchanges)

    `exchanges` should be the recent conversation buffer for that session.

    Internal summarizer that uses Cortex's LLM router.
    Produces L1 / L5 / L10 / L20 / L30 summaries.

    Args:
        session_id: The conversation/session ID
        exchanges: A list of {"user_msg": ..., "assistant_msg": ..., "timestamp": ...}
    """
    buf = list(exchanges)
    if not buf:

    # Build raw conversation text
    convo_lines = []
    for ex in exchanges:
        convo_lines.append(f"User: {ex.get('user_msg','')}")
        convo_lines.append(f"Assistant: {ex.get('assistant_msg','')}")
    convo_text = "\n".join(convo_lines)

    if not convo_text.strip():
        return {
            "session_id": session_id,
            "exchange_count": 0,
@@ -242,31 +257,72 @@ async def summarize_context(
            "L10": "",
            "L20": "",
            "L30": "",
            "last_updated": None,
            "last_updated": datetime.now().isoformat()
        }

    # Base levels
    L1 = await summarize_L1(buf)
    L5 = await summarize_L5(buf)
    L10 = await summarize_L10(session_id, buf)
    L20 = await summarize_L20(session_id)
    L30 = await summarize_L30(session_id)

    # Push the "interesting" tiers into NeoMem
    push_to_neomem(L10, session_id, "L10")
    push_to_neomem(L20, session_id, "L20")
    push_to_neomem(L30, session_id, "L30")

    return {
        "session_id": session_id,
        "exchange_count": len(buf),
        "L1": L1,
        "L5": L5,
        "L10": L10,
        "L20": L20,
        "L30": L30,
        "last_updated": datetime.now().isoformat(),
    }

    # Prompt the LLM (internal — no HTTP)
    prompt = f"""
Summarize the conversation below into multiple compression levels.

Conversation:
----------------
{convo_text}
----------------

Output strictly in JSON with keys:
L1 → ultra short summary (1–2 sentences max)
L5 → short summary
L10 → medium summary
L20 → detailed overview
L30 → full detailed summary

JSON only. No text outside JSON.
"""

    try:
        llm_response = await call_llm(
            prompt,
            temperature=0.2
        )

        # LLM should return JSON, parse it
        summary = json.loads(llm_response)

        return {
            "session_id": session_id,
            "exchange_count": len(exchanges),
            "L1": summary.get("L1", ""),
            "L5": summary.get("L5", ""),
            "L10": summary.get("L10", ""),
            "L20": summary.get("L20", ""),
            "L30": summary.get("L30", ""),
            "last_updated": datetime.now().isoformat()
        }

    except Exception as e:
        return {
            "session_id": session_id,
            "exchange_count": len(exchanges),
            "L1": f"[Error summarizing: {str(e)}]",
            "L5": "",
            "L10": "",
            "L20": "",
            "L30": "",
            "last_updated": datetime.now().isoformat()
        }
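A quick sketch of exercising the new entrypoint on its own (illustrative only; it still requires Cortex's LLM router to reach a backend, and the exchange fields mirror the docstring above):

```python
# Illustrative usage of summarize_context() outside the FastAPI app.
# Assumes call_llm() can reach a configured backend.
import asyncio

from intake.intake import summarize_context

exchanges = [
    {"user_msg": "Where does vLLM run?", "assistant_msg": "In CT 201 on the MI50."},
    {"user_msg": "Which port?", "assistant_msg": "8000, OpenAI-compatible."},
]

summaries = asyncio.run(summarize_context("dev", exchanges))
print(summaries["L1"])   # ultra-short summary
print(summaries["L30"])  # full detailed summary
```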
# ─────────────────────────────────
# Background summarization stub
# ─────────────────────────────────
def bg_summarize(session_id: str):
    """
    Placeholder for background summarization.
    Actual summarization happens during /reason via summarize_context().

    This function exists to prevent NameError when called from add_exchange_internal().
    """
    print(f"[Intake] Exchange added for {session_id}. Will summarize on next /reason call.")

# ─────────────────────────────
# Internal entrypoint for Cortex
@@ -283,15 +339,23 @@ def add_exchange_internal(exchange: dict):

    exchange["timestamp"] = datetime.now().isoformat()

    # DEBUG: Verify we're using the module-level SESSIONS
    print(f"[add_exchange_internal] SESSIONS object id: {id(SESSIONS)}, current sessions: {list(SESSIONS.keys())}")

    # Ensure session exists
    if session_id not in SESSIONS:
        SESSIONS[session_id] = {
            "buffer": deque(maxlen=200),
            "created_at": datetime.now()
        }
        print(f"[add_exchange_internal] Created new session: {session_id}")
    else:
        print(f"[add_exchange_internal] Using existing session: {session_id}")

    # Append exchange into the rolling buffer
    SESSIONS[session_id]["buffer"].append(exchange)
    buffer_len = len(SESSIONS[session_id]["buffer"])
    print(f"[add_exchange_internal] Added exchange to {session_id}, buffer now has {buffer_len} items")

    # Trigger summarization immediately
    try:
cortex/router.py (106 lines changed)
@@ -197,26 +197,110 @@ class IngestPayload(BaseModel):
    user_msg: str
    assistant_msg: str


@cortex_router.post("/ingest")
async def ingest_stub():
    # Intake is internal now — this endpoint is only for compatibility.
    return {"status": "ok", "note": "intake is internal now"}
async def ingest(payload: IngestPayload):
    """
    Receives (session_id, user_msg, assistant_msg) from Relay
    and pushes directly into Intake's in-memory buffer.


    # 1. Update Cortex session state
    update_last_assistant_message(payload.session_id, payload.assistant_msg)

    # 2. Feed Intake internally (no HTTP)
    Uses lenient error handling - always returns success to avoid
    breaking the chat pipeline.
    """
    try:
        # 1. Update Cortex session state
        update_last_assistant_message(payload.session_id, payload.assistant_msg)
    except Exception as e:
        logger.warning(f"[INGEST] Failed to update session state: {e}")
        # Continue anyway (lenient mode)

    try:
        # 2. Feed Intake internally (no HTTP)
        add_exchange_internal({
            "session_id": payload.session_id,
            "user_msg": payload.user_msg,
            "assistant_msg": payload.assistant_msg,
        })

        logger.debug(f"[INGEST] Added exchange to Intake for {payload.session_id}")
    except Exception as e:
        logger.warning(f"[INGEST] Failed to add exchange to Intake: {e}")
        logger.warning(f"[INGEST] Failed to add to Intake: {e}")
        # Continue anyway (lenient mode)

    return {"ok": True, "session_id": payload.session_id}
    # Always return success (user requirement: never fail chat pipeline)
    return {
        "status": "ok",
        "session_id": payload.session_id
    }
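For context, a sketch of what the Relay-side call to this endpoint could look like (assumes the `httpx` client, that the router is mounted at the root, and the Cortex address used elsewhere in this repo's docs; Relay's actual implementation is not part of this diff):

```python
# Sketch of a Relay-side call to Cortex's /ingest. Assumes httpx is installed
# and Cortex listens on 10.0.0.41:7081 as in the docs elsewhere in this repo.
import httpx

def push_exchange_to_cortex(session_id: str, user_msg: str, assistant_msg: str) -> None:
    resp = httpx.post(
        "http://10.0.0.41:7081/ingest",
        json={
            "session_id": session_id,
            "user_msg": user_msg,
            "assistant_msg": assistant_msg,
        },
        timeout=5.0,
    )
    # The endpoint is lenient and returns {"status": "ok", ...} even if Intake fails.
    resp.raise_for_status()
```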
# -----------------------------
# Debug endpoint: summarized context
# -----------------------------
@cortex_router.get("/debug/summary")
async def debug_summary(session_id: str):
    """
    Diagnostic endpoint that runs Intake's summarize_context() for a session.

    Shows exactly what L1/L5/L10/L20/L30 summaries would look like
    inside the actual Uvicorn worker, using the real SESSIONS buffer.
    """
    from intake.intake import SESSIONS, summarize_context

    # Validate session
    session = SESSIONS.get(session_id)
    if not session:
        return {"error": "session not found", "session_id": session_id}

    # Convert deque into the structure summarize_context expects
    buffer = session["buffer"]
    exchanges = [
        {
            "user_msg": ex.get("user_msg", ""),
            "assistant_msg": ex.get("assistant_msg", ""),
        }
        for ex in buffer
    ]

    # 🔥 CRITICAL FIX — summarize_context is async
    summary = await summarize_context(session_id, exchanges)

    return {
        "session_id": session_id,
        "buffer_size": len(buffer),
        "exchanges_preview": exchanges[-5:],  # last 5 items
        "summary": summary
    }

# -----------------------------
# Debug endpoint for SESSIONS
# -----------------------------
@cortex_router.get("/debug/sessions")
async def debug_sessions():
    """
    Diagnostic endpoint to inspect SESSIONS from within the running Uvicorn worker.
    This shows the actual state of the in-memory SESSIONS dict.
    """
    from intake.intake import SESSIONS

    sessions_data = {}
    for session_id, session_info in SESSIONS.items():
        buffer = session_info["buffer"]
        sessions_data[session_id] = {
            "created_at": session_info["created_at"].isoformat(),
            "buffer_size": len(buffer),
            "buffer_maxlen": buffer.maxlen,
            "recent_exchanges": [
                {
                    "user_msg": ex.get("user_msg", "")[:100],
                    "assistant_msg": ex.get("assistant_msg", "")[:100],
                    "timestamp": ex.get("timestamp", "")
                }
                for ex in list(buffer)[-5:]  # Last 5 exchanges
            ]
        }

    return {
        "sessions_object_id": id(SESSIONS),
        "total_sessions": len(SESSIONS),
        "sessions": sessions_data
    }
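A small sketch of poking both debug endpoints from a dev box (assumes `httpx`, the Cortex address used elsewhere in this repo, and the router mounted at the root; adjust host and prefix to your deployment):

```python
# Illustrative manual check of the two debug endpoints. Assumes httpx and
# that cortex_router is reachable at http://10.0.0.41:7081 with no prefix.
import httpx

BASE = "http://10.0.0.41:7081"

sessions = httpx.get(f"{BASE}/debug/sessions", timeout=10.0).json()
print(sessions["total_sessions"], list(sessions["sessions"].keys()))

summary = httpx.get(f"{BASE}/debug/summary", params={"session_id": "dev"}, timeout=60.0).json()
print(summary.get("summary", {}).get("L1", ""))
```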
vllm-mi50.md (deleted, 416 lines)
@@ -1,416 +0,0 @@
# **MI50 + vLLM + Proxmox LXC Setup Guide**

### *End-to-End Field Manual for gfx906 LLM Serving*

**Version:** 1.0
**Last updated:** 2025-11-17

---

## **📌 Overview**

This guide documents how to run a **vLLM OpenAI-compatible server** on an
**AMD Instinct MI50 (gfx906)** inside a **Proxmox LXC container**, expose it over LAN,
and wire it into **Project Lyra's Cortex reasoning layer**.

This file is long, specific, and intentionally leaves *nothing* out so you never have to rediscover ROCm pain rituals again.

---

## **1. What This Stack Looks Like**

```
Proxmox Host
├─ AMD Instinct MI50 (gfx906)
├─ AMDGPU + ROCm stack
└─ LXC Container (CT 201: cortex-gpu)
   ├─ Ubuntu 24.04
   ├─ Docker + docker compose
   ├─ vLLM inside Docker (nalanzeyu/vllm-gfx906)
   ├─ GPU passthrough via /dev/kfd + /dev/dri + PCI bind
   └─ vLLM API exposed on :8000

Lyra Cortex (VM/Server)
└─ LLM_PRIMARY_URL=http://10.0.0.43:8000
```

---
## **2. Proxmox Host — GPU Setup**

### **2.1 Confirm MI50 exists**

```bash
lspci -nn | grep -i 'vega\|instinct\|radeon'
```

You should see something like:

```
0a:00.0 Display controller: AMD Instinct MI50 (gfx906)
```

### **2.2 Load AMDGPU driver**

This is the main pitfall after **any host reboot**:

```bash
modprobe amdgpu
```

If you skip this, the LXC container won't see the GPU.

---

## **3. LXC Container Configuration (CT 201)**

The container ID is **201**.
Config file is at:

```
/etc/pve/lxc/201.conf
```

### **3.1 Working 201.conf**

Paste this *exact* version:

```ini
arch: amd64
cores: 4
hostname: cortex-gpu
memory: 16384
swap: 512
ostype: ubuntu
onboot: 1
startup: order=2,up=10,down=10
net0: name=eth0,bridge=vmbr0,hwaddr=BC:24:11:C6:3E:88,ip=dhcp,type=veth
rootfs: local-lvm:vm-201-disk-0,size=200G
unprivileged: 0

# Docker in LXC requires this
features: keyctl=1,nesting=1
lxc.apparmor.profile: unconfined
lxc.cap.drop:

# --- GPU passthrough for ROCm (MI50) ---
lxc.mount.entry: /dev/kfd dev/kfd none bind,optional,create=file,mode=0666
lxc.mount.entry: /dev/dri dev/dri none bind,optional,create=dir
lxc.mount.entry: /sys/class/drm sys/class/drm none bind,ro,optional,create=dir
lxc.mount.entry: /opt/rocm /opt/rocm none bind,ro,optional,create=dir

# Bind the MI50 PCI device
lxc.mount.entry: /dev/bus/pci/0000:0a:00.0 dev/bus/pci/0000:0a:00.0 none bind,optional,create=file

# Allow GPU-related character devices
lxc.cgroup2.devices.allow: c 226:* rwm
lxc.cgroup2.devices.allow: c 29:* rwm
lxc.cgroup2.devices.allow: c 189:* rwm
lxc.cgroup2.devices.allow: c 238:* rwm
lxc.cgroup2.devices.allow: c 241:* rwm
lxc.cgroup2.devices.allow: c 242:* rwm
lxc.cgroup2.devices.allow: c 243:* rwm
lxc.cgroup2.devices.allow: c 244:* rwm
lxc.cgroup2.devices.allow: c 245:* rwm
lxc.cgroup2.devices.allow: c 246:* rwm
lxc.cgroup2.devices.allow: c 247:* rwm
lxc.cgroup2.devices.allow: c 248:* rwm
lxc.cgroup2.devices.allow: c 249:* rwm
lxc.cgroup2.devices.allow: c 250:* rwm
lxc.cgroup2.devices.allow: c 510:0 rwm
```

### **3.2 Restart sequence**

```bash
pct stop 201
modprobe amdgpu
pct start 201
pct enter 201
```

---

## **4. Inside CT 201 — Verifying ROCm + GPU Visibility**

### **4.1 Check device nodes**

```bash
ls -l /dev/kfd
ls -l /dev/dri
ls -l /opt/rocm
```

All must exist.

### **4.2 Validate GPU via rocminfo**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
```

You need to see:

```
gfx906
```

If you see **nothing**, the GPU isn’t passed through — restart and re-check the host steps.

---
## **5. Install Docker in the LXC (Ubuntu 24.04)**

This container runs Docker inside LXC (nesting enabled).

```bash
apt update
apt install -y ca-certificates curl gnupg

install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
  | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
chmod a+r /etc/apt/keyrings/docker.gpg

echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
  https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo $VERSION_CODENAME) stable" \
  > /etc/apt/sources.list.d/docker.list

apt update
apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```

Check:

```bash
docker --version
docker compose version
```

---

## **6. Running vLLM Inside CT 201 via Docker**

### **6.1 Create directory**

```bash
mkdir -p /root/vllm
cd /root/vllm
```

### **6.2 docker-compose.yml**

Save this exact file as `/root/vllm/docker-compose.yml`:

```yaml
version: "3.9"

services:
  vllm-mi50:
    image: nalanzeyu/vllm-gfx906:latest
    container_name: vllm-mi50
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      VLLM_ROLE: "APIServer"
      VLLM_MODEL: "/model"
      VLLM_LOGGING_LEVEL: "INFO"
    command: >
      vllm serve /model
      --host 0.0.0.0
      --port 8000
      --dtype float16
      --max-model-len 4096
      --api-type openai
    devices:
      - "/dev/kfd:/dev/kfd"
      - "/dev/dri:/dev/dri"
    volumes:
      - /opt/rocm:/opt/rocm:ro
```

### **6.3 Start vLLM**

```bash
docker compose up -d
docker compose logs -f
```

When healthy, you’ll see:

```
(APIServer) Application startup complete.
```

and periodic throughput logs.

---

## **7. Test vLLM API**

### **7.1 From Proxmox host**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

Should respond like:

```json
{"choices":[{"text":"-pong"}]}
```

### **7.2 From Cortex machine**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping from cortex","max_tokens":5}'
```
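Because the server speaks the OpenAI completions API, the same check also works from Python. A small sketch assuming the `openai` client package is installed (the API key is a placeholder; this vLLM setup does not check it):

```python
# Optional Python-side check of the vLLM endpoint, assuming the `openai`
# package is installed. Equivalent to the curl calls above.
from openai import OpenAI

client = OpenAI(base_url="http://10.0.0.43:8000/v1", api_key="not-needed")

resp = client.completions.create(model="/model", prompt="ping from cortex", max_tokens=5)
print(resp.choices[0].text)
```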
---

## **8. Wiring into Lyra Cortex**

In the `cortex` container's `docker-compose.yml`:

```yaml
environment:
  LLM_PRIMARY_URL: http://10.0.0.43:8000
```

Not `.../v1/completions`, because the router appends that automatically.
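That is why only the base URL goes into the env var. A sketch of the assumed behaviour (illustrative only; the real `llm_router` code is not shown here):

```python
# Illustrative only: a base URL plus a fixed path avoids the double-/v1
# problem described in section 9.4. Not the actual llm_router implementation.
import os

def completions_url() -> str:
    base = os.getenv("LLM_PRIMARY_URL", "http://10.0.0.43:8000").rstrip("/")
    return f"{base}/v1/completions"

print(completions_url())  # -> http://10.0.0.43:8000/v1/completions
```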
In `cortex/.env`:

```env
LLM_FORCE_BACKEND=primary
LLM_MODEL=/model
```

Test:

```bash
curl -X POST http://10.0.0.41:7081/reason \
  -H "Content-Type: application/json" \
  -d '{"prompt":"test vllm","session_id":"dev"}'
```

If you get a meaningful response: **Cortex → vLLM is online**.

---

## **9. Common Failure Modes (And Fixes)**

### **9.1 “Failed to infer device type”**

vLLM cannot see any ROCm devices.

Fix:

```bash
# On host
modprobe amdgpu
pct stop 201
pct start 201

# In container
/opt/rocm/bin/rocminfo | grep -i gfx
docker compose up -d
```

### **9.2 GPU disappears after reboot**

Same fix:

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **9.3 Invalid image name**

If you see pull errors like:

```
pull access denied for nalanzeuy...
```

Use:

```
image: nalanzeyu/vllm-gfx906
```

### **9.4 Double `/v1` in URL**

Ensure:

```
LLM_PRIMARY_URL=http://10.0.0.43:8000
```

The router appends `/v1/completions`.

---

## **10. Daily / Reboot Ritual**

### **On Proxmox host**

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **Inside CT 201**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
cd /root/vllm
docker compose up -d
docker compose logs -f
```

### **Test API**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

---

## **11. Summary**

You now have:

* **MI50 (gfx906)** correctly passed into the LXC
* **ROCm** inside the container via bind mounts
* **vLLM** running inside Docker in the LXC
* An **OpenAI-compatible API** on port 8000
* **Lyra Cortex** using it automatically as the primary backend

This is a complete, reproducible setup that survives reboots (with the modprobe ritual) and allows you to upgrade or replace models anytime.

---