From c015e39925e4e77ad9ed6173ea8b2f903d5718fc Mon Sep 17 00:00:00 2001
From: serversdwn <brianharrison02@gmail.com>
Date: Mon, 9 Mar 2026 16:23:59 -0400
Subject: [PATCH] feat: add incremental embedding; unchanged files now reuse
 cached embeddings to save on tokens. fix: early-return bug in ingest.py.

---
 ingest.py | 85 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 62 insertions(+), 23 deletions(-)

diff --git a/ingest.py b/ingest.py
index 65cd63d..13561b4 100644
--- a/ingest.py
+++ b/ingest.py
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import pickle
 from pathlib import Path
@@ -16,6 +17,8 @@ client = OpenAI()
 
 MODEL = "text-embedding-3-small"
 CHUNK_SIZE = 800
+CACHE_PATH = "index/embed_cache.pkl"
+
 
 def read_sources():
     with open("sources.yaml") as f:
@@ -25,7 +28,6 @@ def read_sources():
 
     for src in cfg["sources"]:
         for root, dirs, filenames in os.walk(src):
-
             dirs[:] = [d for d in dirs if d not in [
                 ".git",
                 "__pycache__",
@@ -35,17 +37,22 @@ def read_sources():
             for name in filenames:
                 if name.endswith((".md", ".txt", ".py")):
                     path = os.path.join(root, name)
-
-                    if os.path.getsize(path) < 200_000:   # skip >200KB
+                    if os.path.getsize(path) < 200_000:
                         files.append(path)
 
-                        return files
+    return files
+
+
+def file_hash(path):
+    h = hashlib.md5()
+    h.update(Path(path).read_bytes())
+    return h.hexdigest()
 
 
 def chunk_text(text):
     chunks = []
     for i in range(0, len(text), CHUNK_SIZE):
-        chunks.append(text[i:i+CHUNK_SIZE])
+        chunks.append(text[i:i + CHUNK_SIZE])
     return chunks
 
 
@@ -57,43 +64,75 @@ def embed(text):
     return res.data[0].embedding
 
 
+def load_cache():
+    if os.path.exists(CACHE_PATH):
+        with open(CACHE_PATH, "rb") as f:
+            return pickle.load(f)
+    # cache format: {file_hash: [(chunk_text, vector), ...]}
+    return {}
+
+
+def save_cache(cache):
+    os.makedirs("index", exist_ok=True)
+    with open(CACHE_PATH, "wb") as f:
+        pickle.dump(cache, f)
+
+
 def main():
     files = read_sources()
+    cache = load_cache()
+    new_cache = {}
 
     all_chunks = []
     metadata = []
-
-    for file in files:
-        text = Path(file).read_text(errors="ignore")
-        chunks = chunk_text(text)
-
-        for c in chunks:
-            all_chunks.append(c)
-            metadata.append({
-                "source": file,
-                "chunk_id": len(all_chunks)
-            })
-
-    print("Embedding", len(all_chunks), "chunks")
-
     vectors = []
 
-    for chunk in tqdm(all_chunks):
-        vectors.append(embed(chunk))
+    files_new = 0
+    files_cached = 0
+
+    for file in files:
+        h = file_hash(file)
+
+        if h in cache:
+            # reuse cached embeddings
+            cached = cache[h]
+            files_cached += 1
+        else:
+            # embed and cache
+            text = Path(file).read_text(errors="ignore")
+            chunks = chunk_text(text)
+            file_vectors = []
+            for chunk in chunks:
+                file_vectors.append(embed(chunk))
+            cached = list(zip(chunks, file_vectors))
+            files_new += 1
+
+        new_cache[h] = cached
+
+        for chunk, vec in cached:
+            all_chunks.append(chunk)
+            metadata.append({"source": file, "chunk_id": len(all_chunks)})
+            vectors.append(vec)
+
+    print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)")
+
+    if not vectors:
+        print("No content found.")
+        return
 
     dim = len(vectors[0])
     index = faiss.IndexFlatL2(dim)
     index.add(np.array(vectors).astype("float32"))
 
     os.makedirs("index", exist_ok=True)
-
     faiss.write_index(index, "index/index.faiss")
 
     with open("index/meta.pkl", "wb") as f:
         pickle.dump((all_chunks, metadata), f)
 
+    save_cache(new_cache)
     print("Index built.")
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()