Initial clean commit - unified Lyra stack

This commit is contained in:
serversdwn
2025-11-16 03:17:32 -05:00
commit 94fb091e59
270 changed files with 74200 additions and 0 deletions

53
rag/rag_build.py Normal file
View File

@@ -0,0 +1,53 @@
import uuid, hashlib, os, json, glob
from tqdm import tqdm
import chromadb
from openai import OpenAI
from dotenv import load_dotenv
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Persistent local DB: the collection lives on disk under ./chromadb, so
# re-running this script only embeds messages that are not stored yet.
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

files = glob.glob("chatlogs/*.json")
added, skipped = 0, 0

for f in tqdm(files, desc="Indexing chats"):
    # Read explicitly as UTF-8 so decoding does not depend on the platform's
    # default encoding. A single unreadable or malformed file is reported
    # and skipped instead of aborting the whole run.
    try:
        with open(f, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, ValueError) as err:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        tqdm.write(f"⚠️ Skipping unreadable chat file {f}: {err}")
        continue

    if not isinstance(data, dict):
        # The loop below expects an object with "title" and "messages" keys.
        tqdm.write(f"⚠️ Skipping {f}: top-level JSON value is not an object")
        continue

    # Fall back to the file path when the title is missing, null or empty.
    title = data.get("title") or f

    # `or []` also covers an explicit `"messages": null`.
    for msg in data.get("messages") or []:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        if role not in ("user", "assistant"):
            continue

        # Only plain-string content can be embedded; missing, null or
        # structured content is ignored rather than crashing on .strip().
        content = msg.get("content")
        if not isinstance(content, str):
            continue
        text = content.strip()
        if not text:
            continue

        # Deterministic hash ID: identical text always maps to the same ID,
        # which is what makes the duplicate check below work across files
        # and across runs.
        doc_id = hashlib.sha1(text.encode("utf-8")).hexdigest()

        # Skip if already indexed.
        existing = collection.get(ids=[doc_id])
        if existing and existing.get("ids"):
            skipped += 1
            continue

        emb = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        ).data[0].embedding

        collection.add(
            ids=[doc_id],
            documents=[text],
            embeddings=[emb],
            metadatas=[{"source": f, "title": title, "role": role}],
        )
        added += 1

print(f"\n✅ Finished indexing {len(files)} chat files.")
print(f"🆕 Added {added:,} new chunks | ⏭️ Skipped {skipped:,} duplicates")
print(f"📦 Total in collection now: {collection.count()} (stored in ./chromadb)")