"""Index chat-log JSON files into a persistent ChromaDB collection.

Walks ``chatlogs/**/*.json`` (one sub-folder per category), splits each
user/assistant message into ``CHUNK_SIZE``-character chunks, embeds each
chunk with OpenAI's ``text-embedding-3-small`` model, and stores it in the
``lyra_chats`` collection.  Chunk IDs are content-derived SHA-1 hashes, so
re-running the script is idempotent: already-indexed chunks are skipped.
"""

import datetime
import glob
import hashlib
import json
import os

import chromadb
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

CHUNK_SIZE = 5000  # characters (~1500–2000 tokens)

added, skipped = 0, 0

# Recursive glob through all category folders: chatlogs/<category>/*.json.
files = glob.glob("chatlogs/**/*.json", recursive=True)

for f in tqdm(files, desc="Indexing chats"):
    try:
        with open(f, encoding="utf-8") as fh:
            data = json.load(fh)
    except (OSError, json.JSONDecodeError) as err:
        # Best-effort: one corrupt or unreadable file must not abort
        # the whole indexing run.
        print(f"\n⚠️  Skipping {f}: {err}")
        continue

    title = data.get("title", os.path.basename(f))
    category = os.path.basename(os.path.dirname(f))  # e.g. work, poker, etc.
    # Stable per-file chat ID, derived from the file path.
    chat_id = hashlib.sha1(f.encode("utf-8")).hexdigest()
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(f)).isoformat()
    # Timezone-aware UTC timestamp (datetime.utcnow() is deprecated in 3.12+).
    now = datetime.datetime.now(datetime.timezone.utc).isoformat()

    for msg in data.get("messages", []):
        if msg.get("role") not in ("user", "assistant"):
            continue
        # `content` may be missing or None in some exports; treat as empty.
        text = (msg.get("content") or "").strip()
        if not text:
            continue

        for i in range(0, len(text), CHUNK_SIZE):
            chunk = text[i:i + CHUNK_SIZE]
            # Content-addressed ID: same file + offset + text -> same ID,
            # which is what makes re-runs skip duplicates.
            doc_id = hashlib.sha1(
                f"{f}_{i}_{chunk}".encode("utf-8")
            ).hexdigest()

            existing = collection.get(ids=[doc_id])
            if existing and existing.get("ids"):
                skipped += 1
                continue

            emb = client.embeddings.create(
                model="text-embedding-3-small",
                input=chunk,
            ).data[0].embedding

            metadata = {
                "chat_id": chat_id,
                "chunk_index": i // CHUNK_SIZE,
                "source": f,
                "title": title,
                "role": msg["role"],
                "category": category,
                "type": "chat",
                "file_modified": mtime,
                "imported_at": now,
            }
            collection.add(
                ids=[doc_id],
                documents=[chunk],
                embeddings=[emb],
                metadatas=[metadata],
            )
            added += 1

print(f"\nāœ… Finished indexing {len(files)} chat files.")
print(f"šŸ†• Added {added:,} new chunks | ā­ļø Skipped {skipped:,} duplicates")
print(f"šŸ“¦ Total in collection now: {collection.count()} (stored in ./chromadb)")