Files
rag-tmi/ingest.py
2026-03-09 16:23:59 -04:00

139 lines
3.1 KiB
Python

import hashlib
import os
import pickle
from pathlib import Path
import yaml
import faiss
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
# Populate os.environ (OPENAI_API_KEY, etc.) from a local .env file, if present.
load_dotenv()
client = OpenAI()  # credentials come from the environment populated above
# Embedding model applied to every chunk.
MODEL = "text-embedding-3-small"
# Maximum characters per chunk sent to the embedding API.
CHUNK_SIZE = 800
# On-disk pickle mapping file-content hash -> [(chunk_text, vector), ...].
CACHE_PATH = "index/embed_cache.pkl"
def read_sources(config_path="sources.yaml"):
    """Collect the files to index from the directories listed in the config.

    The YAML config must contain a top-level ``sources`` list of directory
    paths.  Each directory is walked recursively; common junk directories are
    pruned, and a file is kept when it has an indexable extension and is
    smaller than 200 KB.

    Args:
        config_path: Path to the YAML config file.  Defaults to
            ``"sources.yaml"``, preserving the original behavior.

    Returns:
        list[str]: Paths of candidate files, in os.walk order.
    """
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    # Directories we never descend into; set for O(1) membership tests.
    skip_dirs = {".git", "__pycache__", ".venv", "index"}

    files = []
    for src in cfg["sources"]:
        for root, dirs, filenames in os.walk(src):
            # Prune in place so os.walk skips these subtrees entirely.
            dirs[:] = [d for d in dirs if d not in skip_dirs]
            for name in filenames:
                if name.endswith((".md", ".txt", ".py")):
                    path = os.path.join(root, name)
                    # Skip very large files to keep embedding cost bounded.
                    if os.path.getsize(path) < 200_000:
                        files.append(path)
    return files
def file_hash(path):
    """Return the hex MD5 digest of the file's bytes (a content fingerprint)."""
    return hashlib.md5(Path(path).read_bytes()).hexdigest()
def chunk_text(text, size=None):
    """Split *text* into consecutive chunks of at most *size* characters.

    Args:
        text: The string to split.
        size: Maximum chunk length.  Defaults to the module-level
            ``CHUNK_SIZE`` (800), preserving the original behavior.

    Returns:
        list[str]: The chunks in order; an empty list for empty input.
    """
    if size is None:
        size = CHUNK_SIZE
    return [text[i:i + size] for i in range(0, len(text), size)]
def embed(text):
    """Embed a single text chunk via the OpenAI embeddings API.

    Returns the raw embedding vector (list of floats) for *text*.
    """
    response = client.embeddings.create(model=MODEL, input=text)
    return response.data[0].embedding
def load_cache():
    """Load the embedding cache from disk.

    Cache format: ``{file_hash: [(chunk_text, vector), ...]}``.
    Returns an empty dict when no cache file exists yet.
    """
    try:
        with open(CACHE_PATH, "rb") as f:
            return pickle.load(f)
    except FileNotFoundError:
        return {}
def save_cache(cache):
    """Persist the embedding cache as a pickle, creating its directory if needed."""
    out = Path(CACHE_PATH)
    out.parent.mkdir(parents=True, exist_ok=True)
    out.write_bytes(pickle.dumps(cache))
def main():
    """Build the FAISS index over all source files, reusing cached embeddings.

    Files whose content hash is already in the cache skip re-embedding; only
    new or modified content is sent to the embedding API.  Writes the FAISS
    index, chunk metadata, and the refreshed cache under ``index/``.
    """
    files = read_sources()
    cache = load_cache()
    new_cache = {}

    all_chunks = []
    metadata = []
    vectors = []
    files_cached = 0
    files_new = 0

    for path in files:
        digest = file_hash(path)
        if digest in cache:
            # Content unchanged since the last run: reuse its (chunk, vector) pairs.
            entry = cache[digest]
            files_cached += 1
        else:
            # New or modified content: chunk it and embed every chunk.
            text = Path(path).read_text(errors="ignore")
            entry = [(piece, embed(piece)) for piece in chunk_text(text)]
            files_new += 1
        new_cache[digest] = entry

        for piece, vec in entry:
            all_chunks.append(piece)
            # NOTE(review): chunk_id is the 1-based position in the *global*
            # chunk list, not a per-file index — confirm that is intended.
            metadata.append({"source": path, "chunk_id": len(all_chunks)})
            vectors.append(vec)

    print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)")
    if not vectors:
        print("No content found.")
        return

    # All embeddings from one model share a dimension; take it from the first.
    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

    os.makedirs("index", exist_ok=True)
    faiss.write_index(index, "index/index.faiss")
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)
    save_cache(new_cache)
    print("Index built.")
# Script entry point: build the index when run directly.
if __name__ == "__main__":
    main()