"""Build a FAISS index over local text/source files using OpenAI embeddings.

The directories to scan are listed under the ``sources`` key of
``sources.yaml``. Each eligible file is split into fixed-size character
chunks, every chunk is embedded, and the vectors are written to
``index/index.faiss`` with the chunk texts and metadata in ``index/meta.pkl``.
Embeddings are cached on disk keyed by file-content hash so unchanged files
are not re-embedded on later runs.
"""
import hashlib
import os
import pickle
import sys
from pathlib import Path

import faiss
import numpy as np
import yaml
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# Runs before the client is constructed; presumably so that credentials
# defined in a local .env file are visible to OpenAI() -- confirm.
load_dotenv()

client = OpenAI()
MODEL = "text-embedding-3-small"
CHUNK_SIZE = 800
CACHE_PATH = "index/embed_cache.pkl"

# Directories that are never descended into while scanning sources.
_SKIP_DIRS = frozenset({".git", "__pycache__", ".venv", "index"})
# File name suffixes that are eligible for indexing.
_TEXT_SUFFIXES = (".md", ".txt", ".py")
# Files of this size (in bytes) or larger are skipped.
_MAX_FILE_BYTES = 200_000


def read_sources():
    """Return the paths of all indexable files under the configured sources.

    Reads ``sources.yaml`` from the current working directory and walks every
    directory listed under its ``sources`` key. A file is kept when its name
    ends with one of ``_TEXT_SUFFIXES`` and it is smaller than
    ``_MAX_FILE_BYTES``. An empty config, or one without a ``sources`` entry,
    yields an empty list.

    Returns:
        list[str]: file paths in ``os.walk`` order.
    """
    with open("sources.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f) or {}

    files = []
    for src in cfg.get("sources") or []:
        for root, dirs, filenames in os.walk(src):
            # Prune in place so os.walk does not descend into these.
            dirs[:] = [d for d in dirs if d not in _SKIP_DIRS]
            for name in filenames:
                if not name.endswith(_TEXT_SUFFIXES):
                    continue
                path = os.path.join(root, name)
                try:
                    size = os.path.getsize(path)
                except OSError:
                    # E.g. a broken symlink or a file removed mid-walk.
                    continue
                if size < _MAX_FILE_BYTES:
                    files.append(path)
    return files


def file_hash(path):
    """Return the hex MD5 digest of the file's raw bytes.

    The digest is only used as a cache key for embeddings.
    """
    return hashlib.md5(Path(path).read_bytes()).hexdigest()


def chunk_text(text):
    """Split *text* into consecutive pieces of at most ``CHUNK_SIZE`` characters.

    Returns an empty list for empty input; the last piece may be shorter.
    """
    return [text[i:i + CHUNK_SIZE] for i in range(0, len(text), CHUNK_SIZE)]


def embed(text):
    """Return the embedding of *text* produced by ``MODEL``.

    Issues one embeddings request through the module-level ``client``.
    """
    res = client.embeddings.create(model=MODEL, input=text)
    return res.data[0].embedding


def load_cache():
    """Load the embedding cache from ``CACHE_PATH``.

    Cache format: ``{file_hash: [(chunk_text, vector), ...]}``.

    Returns:
        dict: the cached mapping, or ``{}`` when the cache file is missing,
        unreadable (e.g. truncated by an interrupted run), or not a dict.
    """
    try:
        with open(CACHE_PATH, "rb") as f:
            # SECURITY: pickle.load can execute arbitrary code. Only safe
            # while the cache file is produced by this script and is not
            # writable by untrusted parties.
            cache = pickle.load(f)
    except FileNotFoundError:
        return {}
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError) as exc:
        # A damaged cache only costs re-embedding; it should not abort the
        # build on every subsequent run.
        print(f"Warning: ignoring unreadable cache {CACHE_PATH}: {exc}",
              file=sys.stderr)
        return {}
    return cache if isinstance(cache, dict) else {}


def save_cache(cache):
    """Write *cache* to ``CACHE_PATH``, creating its directory if needed.

    The data is written to a sibling temporary file and then moved into
    place, so an interrupted run does not leave a truncated cache behind.
    """
    cache_dir = os.path.dirname(CACHE_PATH)
    if cache_dir:
        os.makedirs(cache_dir, exist_ok=True)
    tmp_path = CACHE_PATH + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(cache, f)
    os.replace(tmp_path, CACHE_PATH)


def main():
    """Build the index and refresh the embedding cache.

    For every source file, cached embeddings are reused when the file's
    content hash is known; otherwise the file is chunked and embedded. Writes
    ``index/index.faiss`` and ``index/meta.pkl``, then saves a cache that
    contains only the hashes seen in this run (stale entries are dropped).
    When no chunks are produced, nothing is written.
    """
    files = read_sources()
    cache = load_cache()
    new_cache = {}
    all_chunks = []
    metadata = []
    vectors = []
    files_new = 0
    files_cached = 0

    for file in files:
        h = file_hash(file)
        if h in cache:
            # Reuse embeddings from a previous run.
            cached = cache[h]
            files_cached += 1
        elif h in new_cache:
            # Identical content was already embedded earlier in this run.
            cached = new_cache[h]
            files_cached += 1
        else:
            # Embed and cache. Decode as UTF-8 explicitly so the chunks do
            # not depend on the platform's default encoding.
            text = Path(file).read_text(encoding="utf-8", errors="ignore")
            cached = [(chunk, embed(chunk)) for chunk in chunk_text(text)]
            files_new += 1
        new_cache[h] = cached

        for chunk, vec in cached:
            # chunk_id is the 0-based position shared by all_chunks,
            # metadata and the row added to the FAISS index.
            metadata.append({"source": file, "chunk_id": len(all_chunks)})
            all_chunks.append(chunk)
            vectors.append(vec)

    print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)")

    if not vectors:
        print("No content found.")
        return

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    # Convert straight to float32 instead of via a float64 intermediate.
    index.add(np.asarray(vectors, dtype="float32"))

    os.makedirs("index", exist_ok=True)
    faiss.write_index(index, "index/index.faiss")
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    save_cache(new_cache)
    print("Index built.")


if __name__ == "__main__":
    main()