# project-lyra/rag/rag_chat_import.py

import json, glob, os, hashlib
from tqdm import tqdm
import chromadb
import datetime, hashlib
from openai import OpenAI
from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv) before the environment is
# read below.
load_dotenv()

# Length of each stored slice of message text, in characters
# (~1500–2000 tokens).
CHUNK_SIZE = 5000

# Running tallies, updated by the indexing loop and reported at the end.
added = 0
skipped = 0

# OpenAI client; the key is taken from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# Persistent Chroma store rooted at ./chromadb; the collection is fetched if
# it exists and created otherwise.
chroma = chromadb.PersistentClient(path="./chromadb")
collection = chroma.get_or_create_collection("lyra_chats")

# Every .json file below chatlogs/, searched recursively through the
# category folders.
files = glob.glob("chatlogs/**/*.json", recursive=True)

# Index every chat log: split each user/assistant message into
# CHUNK_SIZE-character chunks, embed each chunk that is not already stored,
# and add it to the collection together with per-file metadata.
for f in tqdm(files, desc="Indexing chats"):
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding, which can fail or mis-decode non-ASCII chat text.
    with open(f, encoding="utf-8") as fh:
        data = json.load(fh)

    title = data.get("title", os.path.basename(f))
    category = os.path.basename(os.path.dirname(f))  # parent folder, e.g. work, poker
    chat_id = hashlib.sha1(f.encode("utf-8")).hexdigest()  # one id per file path

    # File modification time as a naive, local-time ISO-8601 string.
    mtime = datetime.datetime.fromtimestamp(os.path.getmtime(f)).isoformat()
    # Import time as a naive UTC ISO-8601 string. Same value and format as the
    # deprecated datetime.utcnow(), so previously stored metadata stays
    # comparable.
    now = (
        datetime.datetime.now(datetime.timezone.utc)
        .replace(tzinfo=None)
        .isoformat()
    )

    # A missing or null "messages" entry is treated as an empty conversation.
    for msg in data.get("messages") or []:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        if role not in ("user", "assistant"):
            continue

        # Skip messages with no textual content (missing key, null, or a
        # non-string payload) instead of aborting the whole import.
        content = msg.get("content")
        if not isinstance(content, str):
            continue
        text = content.strip()
        if not text:
            continue

        for i in range(0, len(text), CHUNK_SIZE):
            chunk = text[i:i + CHUNK_SIZE]
            # The id is derived from the file path, the character offset and
            # the chunk text, so re-running the import maps unchanged chunks
            # to the same id. NOTE: identical text at the same offset in two
            # messages of one file also yields the same id and is counted as
            # a duplicate.
            doc_id = hashlib.sha1(
                f"{f}_{i}_{chunk}".encode("utf-8")
            ).hexdigest()

            # Already stored: skip it and avoid paying for another embedding.
            existing = collection.get(ids=[doc_id])
            if existing and existing.get("ids"):
                skipped += 1
                continue

            emb = client.embeddings.create(
                model="text-embedding-3-small",
                input=chunk,
            ).data[0].embedding

            metadata = {
                "chat_id": chat_id,
                "chunk_index": i // CHUNK_SIZE,  # 0-based position within the message
                "source": f,
                "title": title,
                "role": role,
                "category": category,
                "type": "chat",
                "file_modified": mtime,
                "imported_at": now,
            }

            collection.add(
                ids=[doc_id],
                documents=[chunk],
                embeddings=[emb],
                metadatas=[metadata],
            )
            added += 1


# Final report: files scanned, chunks added vs. skipped in this run, and the
# collection's total size afterwards.
file_total = len(files)
print(f"\n✅ Finished indexing {file_total} chat files.")
print(f"🆕 Added {added:,} new chunks  |  ⏭️  Skipped {skipped:,} duplicates")
# Queried only after the first two lines are printed, as in the original order.
stored_total = collection.count()
print(f"📦 Total in collection now: {stored_total}  (stored in ./chromadb)")