Files
project-lyra/rag/rag_chat_import.py
2025-11-16 03:17:32 -05:00

76 lines
2.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import json, glob, os, hashlib
from tqdm import tqdm
import chromadb
import datetime, hashlib
from openai import OpenAI
from dotenv import load_dotenv
# Index exported chat logs into a persistent Chroma collection.
#
# Every chatlogs/**/*.json file is read, each user/assistant message is cut
# into CHUNK_SIZE-character pieces, each piece is embedded with OpenAI and
# stored with metadata describing where it came from. Chunks whose id is
# already present in the collection are skipped, so re-running is incremental.
load_dotenv()

# Service handles: OpenAI produces the embeddings; the Chroma client persists
# vectors, documents and metadata under ./chromadb.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

CHUNK_SIZE = 5000  # characters per chunk (~1500-2000 tokens, author's estimate)
added, skipped = 0, 0


def _message_text(msg):
    """Return the stripped text of a user/assistant message, or "" to skip it.

    Entries that are not dicts, carry a different (or missing) role, or whose
    content is not a string (for example a null content field) yield "" so the
    caller can skip them instead of raising KeyError/AttributeError.
    """
    if not isinstance(msg, dict):
        return ""
    if msg.get("role") not in ("user", "assistant"):
        return ""
    content = msg.get("content")
    if not isinstance(content, str):
        return ""
    return content.strip()


# recursive glob through all category folders
files = glob.glob("chatlogs/**/*.json", recursive=True)

for f in tqdm(files, desc="Indexing chats"):
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(f, encoding="utf-8") as fh:
        data = json.load(fh)

    if not isinstance(data, dict):
        # tqdm.write keeps the warning from garbling the progress bar.
        tqdm.write(f"Skipping {f}: expected a JSON object at the top level")
        continue

    # Fall back to the filename when the title is missing, null or empty.
    title = data.get("title") or os.path.basename(f)
    category = os.path.basename(os.path.dirname(f))  # e.g. work, poker, etc.
    chat_id = hashlib.sha1(f.encode("utf-8")).hexdigest()  # one id per file
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(f)).isoformat()
    # Produces the same naive-UTC ISO string that utcnow() did, without the
    # deprecated call, so stored "imported_at" values keep their format.
    now = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )

    # "messages": null would make .get("messages", []) return None.
    for msg in data.get("messages") or []:
        text = _message_text(msg)
        if not text:
            continue
        role = msg["role"]

        for i in range(0, len(text), CHUNK_SIZE):
            chunk = text[i:i + CHUNK_SIZE]

            # The id is derived from file path, offset and chunk text, which
            # is what makes re-runs idempotent.
            # NOTE: a message repeated verbatim within one file maps to the
            # same id and is therefore counted as a skipped duplicate.
            doc_id = hashlib.sha1(
                f"{f}_{i}_{chunk}".encode("utf-8")
            ).hexdigest()

            existing = collection.get(ids=[doc_id])
            if existing and existing.get("ids"):
                skipped += 1
                continue

            emb = client.embeddings.create(
                model="text-embedding-3-small",
                input=chunk,
            ).data[0].embedding

            metadata = {
                "chat_id": chat_id,
                "chunk_index": i // CHUNK_SIZE,
                "source": f,
                "title": title,
                "role": role,
                "category": category,
                "type": "chat",
                "file_modified": mtime,
                "imported_at": now,
            }

            collection.add(
                ids=[doc_id],
                documents=[chunk],
                embeddings=[emb],
                metadatas=[metadata],
            )
            added += 1

print(f"\n✅ Finished indexing {len(files)} chat files.")
print(f"🆕 Added {added:,} new chunks | ⏭️ Skipped {skipped:,} duplicates")
print(f"📦 Total in collection now: {collection.count()} (stored in ./chromadb)")