import hashlib, os, json, glob

from tqdm import tqdm
import chromadb
from openai import OpenAI
from dotenv import load_dotenv

load_dotenv()

# persistent local DB
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

files = glob.glob("chatlogs/*.json")
added, skipped = 0, 0

for f in tqdm(files, desc="Indexing chats"):
    with open(f) as fh:
        data = json.load(fh)
    title = data.get("title", f)

    for msg in data.get("messages", []):
        # only index actual conversation turns
        if msg["role"] not in ("user", "assistant"):
            continue
        text = msg["content"].strip()
        if not text:
            continue

        # deterministic hash ID
        doc_id = hashlib.sha1(text.encode("utf-8")).hexdigest()

        # skip if already indexed
        existing = collection.get(ids=[doc_id])
        if existing and existing.get("ids"):
            skipped += 1
            continue

        # embed the message text and store it alongside its source metadata
        emb = client.embeddings.create(
            model="text-embedding-3-small",
            input=text
        ).data[0].embedding

        collection.add(
            ids=[doc_id],
            documents=[text],
            embeddings=[emb],
            metadatas=[{"source": f, "title": title, "role": msg["role"]}]
        )
        added += 1

print(f"\nāœ… Finished indexing {len(files)} chat files.")
print(f"šŸ†• Added {added:,} new chunks | ā­ļø Skipped {skipped:,} duplicates")
print(f"šŸ“¦ Total in collection now: {collection.count()} (stored in ./chromadb)")
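
# --- Optional sanity check (illustrative sketch, not part of the indexing run) ---
# Embeds an arbitrary test query with the same model and pulls the closest chunks
# back out of the collection; the query text and n_results below are assumptions
# chosen only to demonstrate retrieval against the freshly indexed data.
query_text = "what did we talk about last time?"
query_emb = client.embeddings.create(
    model="text-embedding-3-small",
    input=query_text
).data[0].embedding

results = collection.query(query_embeddings=[query_emb], n_results=3)
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"[{meta['role']}] {meta['title']}: {doc[:80]}")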