# Index local chat-log JSON exports (chatlogs/*.json) into a persistent
# ChromaDB collection, embedding each user/assistant message with OpenAI.
# --- Setup: stdlib, third-party deps, env, vector store, embedding client ----
import glob
import hashlib
import json
import os
import uuid

import chromadb
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Pull OPENAI_API_KEY (and anything else) from a local .env file.
load_dotenv()

# Persistent local vector DB stored next to this script.
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Every exported chat log we know about.
files = glob.glob("chatlogs/*.json")

added = 0
skipped = 0
for f in tqdm(files, desc="Indexing chats"):
    # Explicit UTF-8: chat exports routinely contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(f, encoding="utf-8") as fh:
        data = json.load(fh)
    # Fall back to the filename when the export has no title.
    title = data.get("title", f)

    for msg in data.get("messages", []):
        # Only index conversational turns; skip system/tool/etc. messages.
        # .get() guards against messages that lack a "role" key entirely.
        if msg.get("role") not in ("user", "assistant"):
            continue
        # "content" may be missing or null in some exports; treat both as empty.
        text = (msg.get("content") or "").strip()
        if not text:
            continue

        # Deterministic content hash -> stable ID, so re-runs are idempotent.
        # NOTE(review): identical text anywhere (any file, any role) hashes to
        # the same ID and is indexed once — confirm that dedup scope is intended.
        doc_id = hashlib.sha1(text.encode("utf-8")).hexdigest()

        # Skip anything already present in the collection.
        existing = collection.get(ids=[doc_id])
        if existing and existing.get("ids"):
            skipped += 1
            continue

        # One embedding call per message (no batching).
        emb = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        ).data[0].embedding

        collection.add(
            ids=[doc_id],
            documents=[text],
            embeddings=[emb],
            metadatas=[{"source": f, "title": title, "role": msg["role"]}],
        )
        added += 1
# Final summary: files scanned, new vs. duplicate chunks, and total stored.
summary = [
    f"\n✅ Finished indexing {len(files)} chat files.",
    f"🆕 Added {added:,} new chunks | ⏭️ Skipped {skipped:,} duplicates",
    f"📦 Total in collection now: {collection.count()} (stored in ./chromadb)",
]
print("\n".join(summary))