76 lines
2.5 KiB
Python
76 lines
2.5 KiB
Python
"""Index chat-log JSON files into a persistent Chroma collection.

Walks ``chatlogs/**/*.json``, splits every user/assistant message into
fixed-size character chunks, embeds each chunk with the OpenAI embeddings
API, and stores it in the ``lyra_chats`` collection under ``./chromadb``.
Chunks whose ID is already present in the collection are skipped, so the
script can be re-run after adding new files.
"""

import datetime
import glob
import hashlib
import json
import os

import chromadb
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Load variables from a .env file before OPENAI_API_KEY is read below.
load_dotenv()

client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

CHUNK_SIZE = 5000  # characters (~1500–2000 tokens)


def _iter_chunks(text, size=CHUNK_SIZE):
    """Yield ``(offset, chunk)`` pairs slicing *text* into pieces of at most *size* characters."""
    for offset in range(0, len(text), size):
        yield offset, text[offset:offset + size]


def _utc_iso(timestamp=None):
    """Return a timezone-aware UTC ISO-8601 string for *timestamp* (POSIX seconds), or for now."""
    utc = datetime.timezone.utc
    if timestamp is None:
        return datetime.datetime.now(utc).isoformat()
    return datetime.datetime.fromtimestamp(timestamp, tz=utc).isoformat()


added, skipped = 0, 0

# recursive glob through all category folders
files = glob.glob("chatlogs/**/*.json", recursive=True)

for path in tqdm(files, desc="Indexing chats"):
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding; the rest of the script already treats text as UTF-8 when hashing.
    with open(path, encoding="utf-8") as fh:
        data = json.load(fh)

    title = data.get("title", os.path.basename(path))
    category = os.path.basename(os.path.dirname(path))  # e.g. work, poker, etc.
    # One stable ID per source file, derived from its path.
    chat_id = hashlib.sha1(path.encode("utf-8")).hexdigest()

    # Both timestamps are timezone-aware UTC so they can be compared with each
    # other (previously one was naive local time and the other naive UTC).
    mtime = _utc_iso(os.path.getmtime(path))
    now = _utc_iso()

    # `or []` also covers an explicit null "messages" value.
    for msg in data.get("messages") or []:
        if msg.get("role") not in ("user", "assistant"):
            continue

        content = msg.get("content")
        if not isinstance(content, str):
            # Missing or non-text content cannot be stripped/chunked; skip it
            # the same way empty messages are skipped.
            continue

        text = content.strip()
        if not text:
            continue

        for offset, chunk in _iter_chunks(text):
            # NOTE: the ID is built from path + offset + chunk text only, so two
            # messages in the same file with identical text at the same offset
            # share an ID and the later one is counted as a duplicate. The scheme
            # is left as-is so IDs stay compatible with existing collections.
            doc_id = hashlib.sha1(
                f"{path}_{offset}_{chunk}".encode("utf-8")
            ).hexdigest()

            # Skip chunks that were indexed by a previous run.
            existing = collection.get(ids=[doc_id])
            if existing and existing.get("ids"):
                skipped += 1
                continue

            emb = client.embeddings.create(
                model="text-embedding-3-small",
                input=chunk,
            ).data[0].embedding

            metadata = {
                "chat_id": chat_id,
                "chunk_index": offset // CHUNK_SIZE,
                "source": path,
                "title": title,
                "role": msg["role"],
                "category": category,
                "type": "chat",
                "file_modified": mtime,
                "imported_at": now,
            }

            collection.add(
                ids=[doc_id],
                documents=[chunk],
                embeddings=[emb],
                metadatas=[metadata],
            )
            added += 1


print(f"\n✅ Finished indexing {len(files)} chat files.")
print(f"🆕 Added {added:,} new chunks | ⏭️ Skipped {skipped:,} duplicates")
print(f"📦 Total in collection now: {collection.count()} (stored in ./chromadb)")