Update to v0.9.1 #1

CHANGELOG.md (1304 lines changed)
File diff suppressed because it is too large.

@@ -4,4 +4,6 @@ COPY requirements.txt .
 RUN pip install -r requirements.txt
 COPY . .
 EXPOSE 7081
+# NOTE: Running with single worker to maintain SESSIONS global state in Intake.
+# If scaling to multiple workers, migrate SESSIONS to Redis or shared storage.
 CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7081"]
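The single-worker constraint exists because `SESSIONS` lives in process memory; with multiple Uvicorn workers each process would hold its own copy. Below is a minimal sketch of the Redis-backed alternative the note suggests. It is hypothetical and not part of this PR: the Redis hostname, key layout, and helper names are invented, and it assumes a reachable Redis instance plus the `redis` Python package.

```python
# Hypothetical sketch only (not part of this PR): a Redis-backed rolling buffer
# that multiple Uvicorn workers could share instead of the in-process SESSIONS dict.
import json
import redis

r = redis.Redis(host="redis", port=6379, decode_responses=True)  # assumed Redis service name

def add_exchange_shared(session_id: str, exchange: dict, maxlen: int = 200) -> None:
    key = f"intake:{session_id}:buffer"
    r.rpush(key, json.dumps(exchange))   # append the newest exchange
    r.ltrim(key, -maxlen, -1)            # keep only the last `maxlen` items (deque-like)

def get_buffer_shared(session_id: str) -> list[dict]:
    key = f"intake:{session_id}:buffer"
    return [json.loads(item) for item in r.lrange(key, 0, -1)]
```
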
@@ -84,6 +84,7 @@ def _init_session(session_id: str) -> Dict[str, Any]:
         "mood": "neutral",          # Future: mood tracking
         "active_project": None,     # Future: project context
         "message_count": 0,
+        "message_history": [],
     }


@@ -275,6 +276,13 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
     state["last_user_message"] = user_prompt
     state["last_timestamp"] = now
     state["message_count"] += 1
+
+    # Save user turn to history
+    state["message_history"].append({
+        "user": user_prompt,
+        "assistant": ""  # assistant reply filled later by update_last_assistant_message()
+    })
+

     # F. Assemble unified context
     context_state = {
@@ -311,20 +319,27 @@ async def collect_context(session_id: str, user_prompt: str) -> Dict[str, Any]:
 # -----------------------------
 def update_last_assistant_message(session_id: str, message: str) -> None:
     """
-    Update session state with assistant's response.
-
-    Called by router.py after persona layer completes.
-
-    Args:
-        session_id: Session identifier
-        message: Assistant's final response text
+    Update session state with assistant's response and complete
+    the last turn inside message_history.
     """
-    if session_id in SESSION_STATE:
-        SESSION_STATE[session_id]["last_assistant_message"] = message
-        SESSION_STATE[session_id]["last_timestamp"] = datetime.now()
-        logger.debug(f"Updated assistant message for session {session_id}")
-    else:
-        logger.warning(f"Attempted to update non-existent session: {session_id}")
+    session = SESSION_STATE.get(session_id)
+    if not session:
+        logger.warning(f"Attempted to update non-existent session: {session_id}")
+        return
+
+    # Update last assistant message + timestamp
+    session["last_assistant_message"] = message
+    session["last_timestamp"] = datetime.now()
+
+    # Fill in assistant reply for the most recent turn
+    history = session.get("message_history", [])
+    if history:
+        # history entry already contains {"user": "...", "assistant": "...?"}
+        history[-1]["assistant"] = message
+
+    if VERBOSE_DEBUG:
+        logger.debug(f"Updated assistant message for session {session_id}")


 def get_session_state(session_id: str) -> Optional[Dict[str, Any]]:
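For reference, a turn in `message_history` is written in two phases: `collect_context()` appends the user half with an empty `assistant` field, and `update_last_assistant_message()` fills it in once the persona layer has replied. A minimal illustration (plain Python, with invented values and a reduced state dict):

```python
# Illustration only: mirrors the two-phase turn write shown in the hunks above.
state = {"message_history": [], "message_count": 0}

# Phase 1: collect_context() records the user turn with a placeholder reply
state["message_history"].append({"user": "ping", "assistant": ""})
state["message_count"] += 1

# Phase 2: update_last_assistant_message() completes the same turn later
state["message_history"][-1]["assistant"] = "pong"

assert state["message_history"] == [{"user": "ping", "assistant": "pong"}]
```
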

cortex/intake/__init__.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+"""
+Intake module - short-term memory summarization.
+
+Runs inside the Cortex container as a pure Python module.
+No standalone API server - called internally by Cortex.
+"""
+
+from .intake import (
+    SESSIONS,
+    add_exchange_internal,
+    summarize_context,
+)
+
+__all__ = [
+    "SESSIONS",
+    "add_exchange_internal",
+    "summarize_context",
+]
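Since Intake no longer runs its own API server, other Cortex modules import it directly, as `router.py` does further down. A sketch of that internal call path (the exchange keys match what `add_exchange_internal()` reads; the session id is a placeholder):

```python
# Sketch of the internal (no HTTP) call path into the Intake module.
from intake import SESSIONS, add_exchange_internal, summarize_context

add_exchange_internal({
    "session_id": "dev",
    "user_msg": "ping",
    "assistant_msg": "pong",
})

print(len(SESSIONS["dev"]["buffer"]))  # rolling buffer of recent exchanges

# summarize_context is async, so callers await it (e.g. inside a FastAPI handler):
# summaries = await summarize_context("dev", list(SESSIONS["dev"]["buffer"]))
```
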
@@ -1,18 +1,29 @@
 import os
+import json
 from datetime import datetime
 from typing import List, Dict, Any, TYPE_CHECKING
 from collections import deque
+from llm.llm_router import call_llm
+
+# -------------------------------------------------------------------
+# Global Short-Term Memory (new Intake)
+# -------------------------------------------------------------------
+SESSIONS: dict[str, dict] = {}  # session_id → { buffer: deque, created_at: timestamp }
+
+# Diagnostic: Verify module loads only once
+print(f"[Intake Module Init] SESSIONS object id: {id(SESSIONS)}, module: {__name__}")
+
+# L10 / L20 history lives here too
+L10_HISTORY: Dict[str, list[str]] = {}
+L20_HISTORY: Dict[str, list[str]] = {}
+
+from llm.llm_router import call_llm  # Use Cortex's shared LLM router

 if TYPE_CHECKING:
+    # Only for type hints — do NOT redefine SESSIONS here
     from collections import deque as _deque
-    SESSIONS: dict
-    L10_HISTORY: dict
-    L20_HISTORY: dict
     def bg_summarize(session_id: str) -> None: ...

-from llm.llm_router import call_llm  # use Cortex's shared router
-
 # ─────────────────────────────
 # Config
 # ─────────────────────────────
@@ -220,20 +231,24 @@ def push_to_neomem(summary: str, session_id: str, level: str) -> None:
 # ─────────────────────────────
 # Main entrypoint for Cortex
 # ─────────────────────────────
-async def summarize_context(
-    session_id: str,
-    exchanges: List[Dict[str, Any]],
-) -> Dict[str, Any]:
+async def summarize_context(session_id: str, exchanges: list[dict]):
     """
-    Main API used by Cortex:
-
-        summaries = await summarize_context(session_id, exchanges)
-
-    `exchanges` should be the recent conversation buffer for that session.
+    Internal summarizer that uses Cortex's LLM router.
+    Produces L1 / L5 / L10 / L20 / L30 summaries.
+
+    Args:
+        session_id: The conversation/session ID
+        exchanges: A list of {"user_msg": ..., "assistant_msg": ..., "timestamp": ...}
     """
-    buf = list(exchanges)
-    if not buf:
+    # Build raw conversation text
+    convo_lines = []
+    for ex in exchanges:
+        convo_lines.append(f"User: {ex.get('user_msg','')}")
+        convo_lines.append(f"Assistant: {ex.get('assistant_msg','')}")
+    convo_text = "\n".join(convo_lines)
+
+    if not convo_text.strip():
         return {
             "session_id": session_id,
             "exchange_count": 0,
@@ -242,31 +257,72 @@ async def summarize_context(
             "L10": "",
             "L20": "",
             "L30": "",
-            "last_updated": None,
+            "last_updated": datetime.now().isoformat()
         }

-    # Base levels
-    L1 = await summarize_L1(buf)
-    L5 = await summarize_L5(buf)
-    L10 = await summarize_L10(session_id, buf)
-    L20 = await summarize_L20(session_id)
-    L30 = await summarize_L30(session_id)
-
-    # Push the "interesting" tiers into NeoMem
-    push_to_neomem(L10, session_id, "L10")
-    push_to_neomem(L20, session_id, "L20")
-    push_to_neomem(L30, session_id, "L30")
-
-    return {
-        "session_id": session_id,
-        "exchange_count": len(buf),
-        "L1": L1,
-        "L5": L5,
-        "L10": L10,
-        "L20": L20,
-        "L30": L30,
-        "last_updated": datetime.now().isoformat(),
-    }
+    # Prompt the LLM (internal — no HTTP)
+    prompt = f"""
+Summarize the conversation below into multiple compression levels.
+
+Conversation:
+----------------
+{convo_text}
+----------------
+
+Output strictly in JSON with keys:
+L1 → ultra short summary (1–2 sentences max)
+L5 → short summary
+L10 → medium summary
+L20 → detailed overview
+L30 → full detailed summary
+
+JSON only. No text outside JSON.
+"""
+
+    try:
+        llm_response = await call_llm(
+            prompt,
+            temperature=0.2
+        )
+
+        # LLM should return JSON, parse it
+        summary = json.loads(llm_response)
+
+        return {
+            "session_id": session_id,
+            "exchange_count": len(exchanges),
+            "L1": summary.get("L1", ""),
+            "L5": summary.get("L5", ""),
+            "L10": summary.get("L10", ""),
+            "L20": summary.get("L20", ""),
+            "L30": summary.get("L30", ""),
+            "last_updated": datetime.now().isoformat()
+        }
+
+    except Exception as e:
+        return {
+            "session_id": session_id,
+            "exchange_count": len(exchanges),
+            "L1": f"[Error summarizing: {str(e)}]",
+            "L5": "",
+            "L10": "",
+            "L20": "",
+            "L30": "",
+            "last_updated": datetime.now().isoformat()
+        }
+
+# ─────────────────────────────────
+# Background summarization stub
+# ─────────────────────────────────
+def bg_summarize(session_id: str):
+    """
+    Placeholder for background summarization.
+    Actual summarization happens during /reason via summarize_context().
+
+    This function exists to prevent NameError when called from add_exchange_internal().
+    """
+    print(f"[Intake] Exchange added for {session_id}. Will summarize on next /reason call.")
+
 # ─────────────────────────────
 # Internal entrypoint for Cortex
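A usage sketch of the new summarizer, showing the exchange shape its docstring describes and the dict it returns. The session id and messages are placeholders, and actually running it requires the shared LLM router's backend to be reachable.

```python
# Usage sketch: exchange keys follow the docstring above; values are placeholders.
import asyncio
from intake import summarize_context

exchanges = [
    {"user_msg": "What did we decide about the MI50 box?", "assistant_msg": "We run vLLM in CT 201."},
    {"user_msg": "And the port?", "assistant_msg": "8000, OpenAI-compatible."},
]

summaries = asyncio.run(summarize_context("dev", exchanges))
print(summaries["exchange_count"])  # 2
print(summaries["L1"])              # ultra-short summary, or an "[Error summarizing: ...]" string
```
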
@@ -283,15 +339,23 @@ def add_exchange_internal(exchange: dict):

     exchange["timestamp"] = datetime.now().isoformat()

+    # DEBUG: Verify we're using the module-level SESSIONS
+    print(f"[add_exchange_internal] SESSIONS object id: {id(SESSIONS)}, current sessions: {list(SESSIONS.keys())}")
+
     # Ensure session exists
     if session_id not in SESSIONS:
         SESSIONS[session_id] = {
             "buffer": deque(maxlen=200),
             "created_at": datetime.now()
         }
+        print(f"[add_exchange_internal] Created new session: {session_id}")
+    else:
+        print(f"[add_exchange_internal] Using existing session: {session_id}")

     # Append exchange into the rolling buffer
     SESSIONS[session_id]["buffer"].append(exchange)
+    buffer_len = len(SESSIONS[session_id]["buffer"])
+    print(f"[add_exchange_internal] Added exchange to {session_id}, buffer now has {buffer_len} items")

     # Trigger summarization immediately
     try:

cortex/router.py (106 lines changed)
@@ -197,26 +197,110 @@ class IngestPayload(BaseModel):
     user_msg: str
     assistant_msg: str


 @cortex_router.post("/ingest")
-async def ingest_stub():
-    # Intake is internal now — this endpoint is only for compatibility.
-    return {"status": "ok", "note": "intake is internal now"}
-
-    # 1. Update Cortex session state
-    update_last_assistant_message(payload.session_id, payload.assistant_msg)
-
-    # 2. Feed Intake internally (no HTTP)
+async def ingest(payload: IngestPayload):
+    """
+    Receives (session_id, user_msg, assistant_msg) from Relay
+    and pushes directly into Intake's in-memory buffer.
+
+    Uses lenient error handling - always returns success to avoid
+    breaking the chat pipeline.
+    """
+    try:
+        # 1. Update Cortex session state
+        update_last_assistant_message(payload.session_id, payload.assistant_msg)
+    except Exception as e:
+        logger.warning(f"[INGEST] Failed to update session state: {e}")
+        # Continue anyway (lenient mode)
+
     try:
+        # 2. Feed Intake internally (no HTTP)
         add_exchange_internal({
             "session_id": payload.session_id,
             "user_msg": payload.user_msg,
             "assistant_msg": payload.assistant_msg,
         })

         logger.debug(f"[INGEST] Added exchange to Intake for {payload.session_id}")
     except Exception as e:
-        logger.warning(f"[INGEST] Failed to add exchange to Intake: {e}")
-
-    return {"ok": True, "session_id": payload.session_id}
+        logger.warning(f"[INGEST] Failed to add to Intake: {e}")
+        # Continue anyway (lenient mode)
+
+    # Always return success (user requirement: never fail chat pipeline)
+    return {
+        "status": "ok",
+        "session_id": payload.session_id
+    }
+
+
+# -----------------------------
+# Debug endpoint: summarized context
+# -----------------------------
+@cortex_router.get("/debug/summary")
+async def debug_summary(session_id: str):
+    """
+    Diagnostic endpoint that runs Intake's summarize_context() for a session.
+
+    Shows exactly what L1/L5/L10/L20/L30 summaries would look like
+    inside the actual Uvicorn worker, using the real SESSIONS buffer.
+    """
+    from intake.intake import SESSIONS, summarize_context
+
+    # Validate session
+    session = SESSIONS.get(session_id)
+    if not session:
+        return {"error": "session not found", "session_id": session_id}
+
+    # Convert deque into the structure summarize_context expects
+    buffer = session["buffer"]
+    exchanges = [
+        {
+            "user_msg": ex.get("user_msg", ""),
+            "assistant_msg": ex.get("assistant_msg", ""),
+        }
+        for ex in buffer
+    ]
+
+    # 🔥 CRITICAL FIX — summarize_context is async
+    summary = await summarize_context(session_id, exchanges)
+
+    return {
+        "session_id": session_id,
+        "buffer_size": len(buffer),
+        "exchanges_preview": exchanges[-5:],  # last 5 items
+        "summary": summary
+    }
+
+
+# -----------------------------
+# Debug endpoint for SESSIONS
+# -----------------------------
+@cortex_router.get("/debug/sessions")
+async def debug_sessions():
+    """
+    Diagnostic endpoint to inspect SESSIONS from within the running Uvicorn worker.
+    This shows the actual state of the in-memory SESSIONS dict.
+    """
+    from intake.intake import SESSIONS
+
+    sessions_data = {}
+    for session_id, session_info in SESSIONS.items():
+        buffer = session_info["buffer"]
+        sessions_data[session_id] = {
+            "created_at": session_info["created_at"].isoformat(),
+            "buffer_size": len(buffer),
+            "buffer_maxlen": buffer.maxlen,
+            "recent_exchanges": [
+                {
+                    "user_msg": ex.get("user_msg", "")[:100],
+                    "assistant_msg": ex.get("assistant_msg", "")[:100],
+                    "timestamp": ex.get("timestamp", "")
+                }
+                for ex in list(buffer)[-5:]  # Last 5 exchanges
+            ]
+        }
+
+    return {
+        "sessions_object_id": id(SESSIONS),
+        "total_sessions": len(SESSIONS),
+        "sessions": sessions_data
+    }
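For manual testing, the new endpoints can be exercised over HTTP once Cortex is up. This is a sketch only: it assumes the router is mounted at the service root on the Dockerfile's port 7081 and that the Cortex host is reachable at the address used elsewhere in this repo's docs; adjust both to your deployment.

```python
# Manual smoke test for the new endpoints (hypothetical host/port and empty router prefix).
import requests

BASE = "http://10.0.0.41:7081"  # assumed Cortex host:port

# Feed one exchange into Intake via /ingest (mirrors what Relay sends)
r = requests.post(f"{BASE}/ingest", json={
    "session_id": "dev",
    "user_msg": "ping",
    "assistant_msg": "pong",
})
print(r.json())  # expected: {"status": "ok", "session_id": "dev"}

# Inspect the in-memory buffer and run the summarizer
print(requests.get(f"{BASE}/debug/sessions").json())
print(requests.get(f"{BASE}/debug/summary", params={"session_id": "dev"}).json())
```
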

vllm-mi50.md (deleted, 416 lines)
@@ -1,416 +0,0 @@
Here you go — a **clean, polished, ready-to-drop-into-Trilium or GitHub** Markdown file.

If you want, I can also auto-generate a matching `/docs/vllm-mi50/` folder structure and a mini-ToC.

---

# **MI50 + vLLM + Proxmox LXC Setup Guide**

### *End-to-End Field Manual for gfx906 LLM Serving*

**Version:** 1.0
**Last updated:** 2025-11-17

---

## **📌 Overview**

This guide documents how to run a **vLLM OpenAI-compatible server** on an
**AMD Instinct MI50 (gfx906)** inside a **Proxmox LXC container**, expose it over LAN,
and wire it into **Project Lyra's Cortex reasoning layer**.

This file is long, specific, and intentionally leaves *nothing* out so you never have to rediscover ROCm pain rituals again.

---

## **1. What This Stack Looks Like**

```
Proxmox Host
 ├─ AMD Instinct MI50 (gfx906)
 ├─ AMDGPU + ROCm stack
 └─ LXC Container (CT 201: cortex-gpu)
     ├─ Ubuntu 24.04
     ├─ Docker + docker compose
     ├─ vLLM inside Docker (nalanzeyu/vllm-gfx906)
     ├─ GPU passthrough via /dev/kfd + /dev/dri + PCI bind
     └─ vLLM API exposed on :8000

Lyra Cortex (VM/Server)
 └─ LLM_PRIMARY_URL=http://10.0.0.43:8000
```

---

## **2. Proxmox Host — GPU Setup**

### **2.1 Confirm MI50 exists**

```bash
lspci -nn | grep -i 'vega\|instinct\|radeon'
```

You should see something like:

```
0a:00.0 Display controller: AMD Instinct MI50 (gfx906)
```

### **2.2 Load AMDGPU driver**

The main pitfall after **any host reboot**.

```bash
modprobe amdgpu
```

If you skip this, the LXC container won't see the GPU.

---

## **3. LXC Container Configuration (CT 201)**

The container ID is **201**.
Config file is at:

```
/etc/pve/lxc/201.conf
```

### **3.1 Working 201.conf**

Paste this *exact* version:

```ini
arch: amd64
cores: 4
hostname: cortex-gpu
memory: 16384
swap: 512
ostype: ubuntu
onboot: 1
startup: order=2,up=10,down=10
net0: name=eth0,bridge=vmbr0,hwaddr=BC:24:11:C6:3E:88,ip=dhcp,type=veth
rootfs: local-lvm:vm-201-disk-0,size=200G
unprivileged: 0

# Docker in LXC requires this
features: keyctl=1,nesting=1
lxc.apparmor.profile: unconfined
lxc.cap.drop:

# --- GPU passthrough for ROCm (MI50) ---
lxc.mount.entry: /dev/kfd dev/kfd none bind,optional,create=file,mode=0666
lxc.mount.entry: /dev/dri dev/dri none bind,optional,create=dir
lxc.mount.entry: /sys/class/drm sys/class/drm none bind,ro,optional,create=dir
lxc.mount.entry: /opt/rocm /opt/rocm none bind,ro,optional,create=dir

# Bind the MI50 PCI device
lxc.mount.entry: /dev/bus/pci/0000:0a:00.0 dev/bus/pci/0000:0a:00.0 none bind,optional,create=file

# Allow GPU-related character devices
lxc.cgroup2.devices.allow: c 226:* rwm
lxc.cgroup2.devices.allow: c 29:* rwm
lxc.cgroup2.devices.allow: c 189:* rwm
lxc.cgroup2.devices.allow: c 238:* rwm
lxc.cgroup2.devices.allow: c 241:* rwm
lxc.cgroup2.devices.allow: c 242:* rwm
lxc.cgroup2.devices.allow: c 243:* rwm
lxc.cgroup2.devices.allow: c 244:* rwm
lxc.cgroup2.devices.allow: c 245:* rwm
lxc.cgroup2.devices.allow: c 246:* rwm
lxc.cgroup2.devices.allow: c 247:* rwm
lxc.cgroup2.devices.allow: c 248:* rwm
lxc.cgroup2.devices.allow: c 249:* rwm
lxc.cgroup2.devices.allow: c 250:* rwm
lxc.cgroup2.devices.allow: c 510:0 rwm
```

### **3.2 Restart sequence**

```bash
pct stop 201
modprobe amdgpu
pct start 201
pct enter 201
```

---

## **4. Inside CT 201 — Verifying ROCm + GPU Visibility**

### **4.1 Check device nodes**

```bash
ls -l /dev/kfd
ls -l /dev/dri
ls -l /opt/rocm
```

All must exist.

### **4.2 Validate GPU via rocminfo**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
```

You need to see:

```
gfx906
```

If you see **nothing**, the GPU isn't passed through — restart and re-check the host steps.

---

## **5. Install Docker in the LXC (Ubuntu 24.04)**

This container runs Docker inside LXC (nesting enabled).

```bash
apt update
apt install -y ca-certificates curl gnupg

install -m 0755 -d /etc/apt/keyrings
curl -fsSL https://download.docker.com/linux/ubuntu/gpg \
  | gpg --dearmor -o /etc/apt/keyrings/docker.gpg
chmod a+r /etc/apt/keyrings/docker.gpg

echo \
  "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.gpg] \
  https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo $VERSION_CODENAME) stable" \
  > /etc/apt/sources.list.d/docker.list

apt update
apt install -y docker-ce docker-ce-cli containerd.io docker-buildx-plugin docker-compose-plugin
```

Check:

```bash
docker --version
docker compose version
```

---

## **6. Running vLLM Inside CT 201 via Docker**

### **6.1 Create directory**

```bash
mkdir -p /root/vllm
cd /root/vllm
```

### **6.2 docker-compose.yml**

Save this exact file as `/root/vllm/docker-compose.yml`:

```yaml
version: "3.9"

services:
  vllm-mi50:
    image: nalanzeyu/vllm-gfx906:latest
    container_name: vllm-mi50
    restart: unless-stopped
    ports:
      - "8000:8000"
    environment:
      VLLM_ROLE: "APIServer"
      VLLM_MODEL: "/model"
      VLLM_LOGGING_LEVEL: "INFO"
    command: >
      vllm serve /model
      --host 0.0.0.0
      --port 8000
      --dtype float16
      --max-model-len 4096
      --api-type openai
    devices:
      - "/dev/kfd:/dev/kfd"
      - "/dev/dri:/dev/dri"
    volumes:
      - /opt/rocm:/opt/rocm:ro
```

### **6.3 Start vLLM**

```bash
docker compose up -d
docker compose logs -f
```

When healthy, you'll see:

```
(APIServer) Application startup complete.
```

and periodic throughput logs.

---

## **7. Test vLLM API**

### **7.1 From Proxmox host**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

Should respond like:

```json
{"choices":[{"text":"-pong"}]}
```

### **7.2 From Cortex machine**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping from cortex","max_tokens":5}'
```

---

## **8. Wiring into Lyra Cortex**

In `cortex` container's `docker-compose.yml`:

```yaml
environment:
  LLM_PRIMARY_URL: http://10.0.0.43:8000
```

Not `/v1/completions` because the router appends that automatically.

In `cortex/.env`:

```env
LLM_FORCE_BACKEND=primary
LLM_MODEL=/model
```

Test:

```bash
curl -X POST http://10.0.0.41:7081/reason \
  -H "Content-Type: application/json" \
  -d '{"prompt":"test vllm","session_id":"dev"}'
```

If you get a meaningful response: **Cortex → vLLM is online**.

---

## **9. Common Failure Modes (And Fixes)**

### **9.1 “Failed to infer device type”**

vLLM cannot see any ROCm devices.

Fix:

```bash
# On host
modprobe amdgpu
pct stop 201
pct start 201
# In container
/opt/rocm/bin/rocminfo | grep -i gfx
docker compose up -d
```

### **9.2 GPU disappears after reboot**

Same fix:

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **9.3 Invalid image name**

If you see pull errors:

```
pull access denied for nalanzeuy...
```

Use:

```
image: nalanzeyu/vllm-gfx906
```

### **9.4 Double `/v1` in URL**

Ensure:

```
LLM_PRIMARY_URL=http://10.0.0.43:8000
```

Router appends `/v1/completions`.

---

## **10. Daily / Reboot Ritual**

### **On Proxmox host**

```bash
modprobe amdgpu
pct stop 201
pct start 201
```

### **Inside CT 201**

```bash
/opt/rocm/bin/rocminfo | grep -i gfx
cd /root/vllm
docker compose up -d
docker compose logs -f
```

### **Test API**

```bash
curl -X POST http://10.0.0.43:8000/v1/completions \
  -H "Content-Type: application/json" \
  -d '{"model":"/model","prompt":"ping","max_tokens":5}'
```

---

## **11. Summary**

You now have:

* **MI50 (gfx906)** correctly passed into LXC
* **ROCm** inside the container via bind mounts
* **vLLM** running inside Docker in the LXC
* **OpenAI-compatible API** on port 8000
* **Lyra Cortex** using it automatically as primary backend

This is a complete, reproducible setup that survives reboots (with the modprobe ritual) and allows you to upgrade/replace models anytime.

---

If you want, I can generate:

* A `/docs/vllm-mi50/README.md`
* A "vLLM Gotchas" document
* A quick-reference cheat sheet
* A troubleshooting decision tree

Just say the word.