# Index local chat-log JSON exports (chatlogs/*.json) into a persistent
# ChromaDB collection, embedding each user/assistant message with OpenAI.
# --- Setup: stdlib, third-party deps, env, vector store, embedding client ----
import glob
import hashlib
import json
import os
import uuid

import chromadb
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Pull OPENAI_API_KEY (and anything else) from a local .env file.
load_dotenv()

# Persistent local vector DB stored next to this script.
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Every exported chat log we know about.
files = glob.glob("chatlogs/*.json")

added = 0
skipped = 0
for f in tqdm(files, desc="Indexing chats"):
    # Explicit UTF-8: chat exports routinely contain non-ASCII text, and the
    # platform default encoding is not guaranteed to be UTF-8.
    with open(f, encoding="utf-8") as fh:
        data = json.load(fh)
    # Fall back to the filename when the export has no title.
    title = data.get("title", f)

    for msg in data.get("messages", []):
        # Only index conversational turns; skip system/tool/etc. messages.
        # .get() guards against messages that lack a "role" key entirely.
        if msg.get("role") not in ("user", "assistant"):
            continue
        # "content" may be missing or null in some exports; treat both as empty.
        text = (msg.get("content") or "").strip()
        if not text:
            continue

        # Deterministic content hash -> stable ID, so re-runs are idempotent.
        # NOTE(review): identical text anywhere (any file, any role) hashes to
        # the same ID and is indexed once — confirm that dedup scope is intended.
        doc_id = hashlib.sha1(text.encode("utf-8")).hexdigest()

        # Skip anything already present in the collection.
        existing = collection.get(ids=[doc_id])
        if existing and existing.get("ids"):
            skipped += 1
            continue

        # One embedding call per message (no batching).
        emb = client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
        ).data[0].embedding

        collection.add(
            ids=[doc_id],
            documents=[text],
            embeddings=[emb],
            metadatas=[{"source": f, "title": title, "role": msg["role"]}],
        )
        added += 1
# Final summary: files scanned, new vs. duplicate chunks, and total stored.
summary = [
    f"\n✅ Finished indexing {len(files)} chat files.",
    f"🆕 Added {added:,} new chunks | ⏭️ Skipped {skipped:,} duplicates",
    f"📦 Total in collection now: {collection.count()} (stored in ./chromadb)",
]
print("\n".join(summary))