"""Build a FAISS index over text chunks from the directories in sources.yaml.

The script walks every directory listed under the ``sources`` key of
``sources.yaml``, splits each matching text file into fixed-size character
chunks, embeds the chunks with the OpenAI embeddings API, and writes:

* ``index/index.faiss`` -- a flat L2 FAISS index, one row per chunk
* ``index/meta.pkl``    -- a pickled ``(all_chunks, metadata)`` tuple whose
  lists are parallel to the rows of the index
"""
import os
import pickle
from pathlib import Path

import faiss
import numpy as np
import yaml
from dotenv import load_dotenv
from openai import OpenAI
from tqdm import tqdm

# load_dotenv() runs before the client is constructed, as in the original
# statement order -- presumably so credentials in a .env file are visible
# to OpenAI(); confirm against the deployment setup.
load_dotenv()
client = OpenAI()

MODEL = "text-embedding-3-small"
CHUNK_SIZE = 800  # characters per chunk

# Files of this many bytes or more are skipped.
MAX_FILE_BYTES = 200_000
# Number of chunks sent per embeddings request.
EMBED_BATCH_SIZE = 100
# Directory names that are never descended into. "index" is the directory
# this script writes its own output to.
SKIP_DIRS = frozenset({".git", "__pycache__", ".venv", "index"})
# File name suffixes that are indexed.
TEXT_SUFFIXES = (".md", ".txt", ".py")


def read_sources():
    """Return the paths of all indexable files under the configured sources.

    Reads ``sources.yaml`` from the current working directory and walks each
    entry of its ``sources`` list. A file is included when its name ends with
    one of ``TEXT_SUFFIXES`` and it is smaller than ``MAX_FILE_BYTES``.
    Directories named in ``SKIP_DIRS`` are pruned from the walk.

    Returns:
        A list of path strings, in ``os.walk`` order.

    Raises:
        OSError: if ``sources.yaml`` cannot be opened.
        KeyError: if the YAML document has no ``sources`` key.
    """
    with open("sources.yaml", encoding="utf-8") as f:
        cfg = yaml.safe_load(f)

    files = []
    for src in cfg["sources"]:
        for root, dirs, filenames in os.walk(src):
            # Assigning to the slice prunes the walk in place, so os.walk
            # never descends into the skipped directories.
            dirs[:] = [d for d in dirs if d not in SKIP_DIRS]
            for name in filenames:
                if not name.endswith(TEXT_SUFFIXES):
                    continue
                path = os.path.join(root, name)
                try:
                    size = os.path.getsize(path)
                except OSError:
                    # Broken symlink or file removed mid-walk: skip it
                    # rather than abort the whole build.
                    continue
                if size < MAX_FILE_BYTES:
                    files.append(path)
    return files


def chunk_text(text, size=None):
    """Split *text* into consecutive, non-overlapping chunks.

    Args:
        text: The string to split.
        size: Maximum characters per chunk; defaults to ``CHUNK_SIZE``.

    Returns:
        A list of substrings that concatenate back to *text*. The last chunk
        may be shorter than *size*; an empty *text* yields an empty list.

    Raises:
        ValueError: if *size* is not positive.
    """
    if size is None:
        size = CHUNK_SIZE
    if size <= 0:
        raise ValueError("size must be a positive integer")
    return [text[i:i + size] for i in range(0, len(text), size)]


def embed(text):
    """Return the embedding of a single *text* using ``MODEL``."""
    res = client.embeddings.create(model=MODEL, input=text)
    return res.data[0].embedding


def _embed_batch(texts):
    """Return one embedding per entry of *texts*, in input order.

    Sends all of *texts* in a single embeddings request. Results are sorted
    by the ``index`` field of each returned item so the output order does
    not depend on the order of the response list.
    """
    res = client.embeddings.create(model=MODEL, input=texts)
    ordered = sorted(res.data, key=lambda item: item.index)
    return [item.embedding for item in ordered]


def main():
    """Chunk every source file, embed the chunks, and write the index."""
    files = read_sources()

    all_chunks = []
    metadata = []
    for file in files:
        # Decode explicitly as UTF-8 so results do not depend on the
        # platform's default encoding; undecodable bytes are dropped.
        text = Path(file).read_text(encoding="utf-8", errors="ignore")
        for chunk in chunk_text(text):
            # chunk_id is the 0-based position of the chunk in all_chunks,
            # which is also its row number in the FAISS index. It is
            # recorded before the append so the two stay aligned.
            metadata.append({"source": file, "chunk_id": len(all_chunks)})
            all_chunks.append(chunk)

    print("Embedding", len(all_chunks), "chunks")
    if not all_chunks:
        # Without at least one vector the dimension is unknown and there is
        # nothing to index.
        print("No chunks found; index not built.")
        return

    vectors = []
    for start in tqdm(range(0, len(all_chunks), EMBED_BATCH_SIZE)):
        batch = all_chunks[start:start + EMBED_BATCH_SIZE]
        vectors.extend(_embed_batch(batch))

    # FAISS expects float32; convert once without a float64 intermediate.
    matrix = np.asarray(vectors, dtype="float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)

    os.makedirs("index", exist_ok=True)
    faiss.write_index(index, "index/index.faiss")

    # NOTE: consumers must only unpickle this file from a trusted location;
    # pickle.load executes arbitrary code from the file it reads.
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    print("Index built.")


if __name__ == "__main__":
    main()