From c015e39925e4e77ad9ed6173ea8b2f903d5718fc Mon Sep 17 00:00:00 2001 From: serversdwn Date: Mon, 9 Mar 2026 16:23:59 -0400 Subject: [PATCH] feat: add incremental embedding, unchanged files now reuse cached embeddings to save on tokens. fix: early return bug in ingest.py fixed. --- ingest.py | 85 ++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 62 insertions(+), 23 deletions(-) diff --git a/ingest.py b/ingest.py index 65cd63d..13561b4 100644 --- a/ingest.py +++ b/ingest.py @@ -1,3 +1,4 @@ +import hashlib import os import pickle from pathlib import Path @@ -16,6 +17,8 @@ client = OpenAI() MODEL = "text-embedding-3-small" CHUNK_SIZE = 800 +CACHE_PATH = "index/embed_cache.pkl" + def read_sources(): with open("sources.yaml") as f: @@ -25,7 +28,6 @@ def read_sources(): for src in cfg["sources"]: for root, dirs, filenames in os.walk(src): - dirs[:] = [d for d in dirs if d not in [ ".git", "__pycache__", @@ -35,17 +37,22 @@ def read_sources(): for name in filenames: if name.endswith((".md", ".txt", ".py")): path = os.path.join(root, name) - - if os.path.getsize(path) < 200_000: # skip >200KB + if os.path.getsize(path) < 200_000: files.append(path) - return files + return files + + +def file_hash(path): + h = hashlib.md5() + h.update(Path(path).read_bytes()) + return h.hexdigest() def chunk_text(text): chunks = [] for i in range(0, len(text), CHUNK_SIZE): - chunks.append(text[i:i+CHUNK_SIZE]) + chunks.append(text[i:i + CHUNK_SIZE]) return chunks @@ -57,43 +64,75 @@ def embed(text): return res.data[0].embedding +def load_cache(): + if os.path.exists(CACHE_PATH): + with open(CACHE_PATH, "rb") as f: + return pickle.load(f) + # cache format: {file_hash: [(chunk_text, vector), ...]} + return {} + + +def save_cache(cache): + os.makedirs("index", exist_ok=True) + with open(CACHE_PATH, "wb") as f: + pickle.dump(cache, f) + + def main(): files = read_sources() + cache = load_cache() + new_cache = {} all_chunks = [] metadata = [] - - for file in files: - text
= Path(file).read_text(errors="ignore") - chunks = chunk_text(text) - - for c in chunks: - all_chunks.append(c) - metadata.append({ - "source": file, - "chunk_id": len(all_chunks) - }) - - print("Embedding", len(all_chunks), "chunks") - vectors = [] - for chunk in tqdm(all_chunks): - vectors.append(embed(chunk)) + files_new = 0 + files_cached = 0 + + for file in files: + h = file_hash(file) + + if h in cache: + # reuse cached embeddings + cached = cache[h] + files_cached += 1 + else: + # embed and cache + text = Path(file).read_text(errors="ignore") + chunks = chunk_text(text) + file_vectors = [] + for chunk in chunks: + file_vectors.append(embed(chunk)) + cached = list(zip(chunks, file_vectors)) + files_new += 1 + + new_cache[h] = cached + + for chunk, vec in cached: + all_chunks.append(chunk) + metadata.append({"source": file, "chunk_id": len(all_chunks)}) + vectors.append(vec) + + print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)") + + if not vectors: + print("No content found.") + return dim = len(vectors[0]) index = faiss.IndexFlatL2(dim) index.add(np.array(vectors).astype("float32")) os.makedirs("index", exist_ok=True) - faiss.write_index(index, "index/index.faiss") with open("index/meta.pkl", "wb") as f: pickle.dump((all_chunks, metadata), f) + save_cache(new_cache) print("Index built.") if __name__ == "__main__": - main() \ No newline at end of file + main()