feat: add incremental embedding; unchanged files now reuse cached embeddings to save on tokens.

fix: correct the early-return bug in ingest.py.
This commit is contained in:
serversdwn
2026-03-09 16:23:59 -04:00
parent 0efe2789b4
commit c015e39925

View File

@@ -1,3 +1,4 @@
import hashlib
import os
import pickle
from pathlib import Path
@@ -16,6 +17,8 @@ client = OpenAI()
MODEL = "text-embedding-3-small"
CHUNK_SIZE = 800
CACHE_PATH = "index/embed_cache.pkl"
def read_sources():
with open("sources.yaml") as f:
@@ -25,7 +28,6 @@ def read_sources():
for src in cfg["sources"]:
for root, dirs, filenames in os.walk(src):
dirs[:] = [d for d in dirs if d not in [
".git",
"__pycache__",
@@ -35,17 +37,22 @@ def read_sources():
for name in filenames:
if name.endswith((".md", ".txt", ".py")):
path = os.path.join(root, name)
if os.path.getsize(path) < 200_000: # skip >200KB
if os.path.getsize(path) < 200_000:
files.append(path)
return files
return files
def file_hash(path):
    """Return the hexadecimal MD5 digest of the raw bytes stored at *path*.

    The file is read into memory in a single call, so the digest covers the
    exact on-disk content with no decoding or newline translation.
    """
    content = Path(path).read_bytes()
    return hashlib.md5(content).hexdigest()
def chunk_text(text, size=None):
    """Split *text* into consecutive, non-overlapping pieces.

    Args:
        text: The string to split.
        size: Maximum number of characters per piece. When omitted, the
            module-level ``CHUNK_SIZE`` is used.

    Returns:
        The pieces in order. Every piece except possibly the last is exactly
        ``size`` characters long; empty input yields ``[]``.

    Raises:
        ValueError: If ``size`` is smaller than 1.
    """
    if size is None:
        size = CHUNK_SIZE
    if size < 1:
        raise ValueError(f"chunk size must be >= 1, got {size}")
    # Exactly one slice per offset: appending the same slice twice would
    # duplicate every chunk (and later every vector) in the index.
    return [text[start:start + size] for start in range(0, len(text), size)]
@@ -57,41 +64,73 @@ def embed(text):
return res.data[0].embedding
def load_cache(path=None):
    """Load the embedding cache from disk.

    Cache format: ``{key: [(chunk_text, vector), ...]}``, where each key
    identifies one file's content.

    Args:
        path: Pickle file to read. When omitted, the module-level
            ``CACHE_PATH`` is used.

    Returns:
        The cached dict, or ``{}`` when the file does not exist, cannot be
        unpickled, or does not contain a dict. A cache is always rebuildable,
        so a damaged file must not abort the run.
    """
    if path is None:
        path = CACHE_PATH
    try:
        with open(path, "rb") as f:
            # NOTE(security): unpickling can execute arbitrary code. Only point
            # this at a cache file written by this tool, never at external data.
            cache = pickle.load(f)
    except FileNotFoundError:
        return {}
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError) as exc:
        # Truncated or corrupt file (e.g. an interrupted earlier write):
        # fall back to a cold cache instead of crashing.
        print(f"Ignoring unreadable embedding cache {path}: {exc}")
        return {}
    return cache if isinstance(cache, dict) else {}
def save_cache(cache, path=None):
    """Persist the embedding cache to disk as a pickle.

    Args:
        cache: The object to store (the ingest code passes a dict).
        path: Destination file. When omitted, the module-level ``CACHE_PATH``
            is used.

    The parent directory is created if needed. The data is written to a
    sibling temporary file and then moved into place, so an interrupted run
    cannot leave a half-written cache behind.
    """
    if path is None:
        path = CACHE_PATH
    # Derive the directory from the target instead of hard-coding "index", so
    # the two cannot drift apart if the cache location changes.
    os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "wb") as f:
            pickle.dump(cache, f)
        os.replace(tmp_path, path)
    finally:
        # Only present here if the dump or the move failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def main():
    """Build the FAISS index, embedding only files whose content changed.

    For every file returned by ``read_sources()`` a cache key is derived from
    the embedding model, the chunk size and the file's content hash. Files
    whose key is already in the cache reuse their stored
    ``(chunk_text, vector)`` pairs; all other files are chunked and embedded.
    The index and chunk metadata are then written under ``index/``.

    Returns:
        None. Prints a summary and returns early, without writing anything,
        when no chunks were produced.
    """
    files = read_sources()
    cache = load_cache()
    # Rebuilt from scratch so entries for deleted or modified files are
    # dropped rather than accumulating forever.
    new_cache = {}

    all_chunks = []
    metadata = []
    vectors = []
    files_new = 0
    files_cached = 0

    for file in files:
        # Namespace the key by model and chunk size: vectors from a different
        # model (possibly a different dimension) or a different chunking must
        # never be reused. Entries stored under a bare content hash by an
        # earlier run will miss once and be re-embedded.
        key = f"{MODEL}:{CHUNK_SIZE}:{file_hash(file)}"
        cached = cache.get(key)
        if cached is not None:
            files_cached += 1
        else:
            text = Path(file).read_text(errors="ignore")
            cached = [(chunk, embed(chunk)) for chunk in chunk_text(text)]
            files_new += 1
        new_cache[key] = cached

        # Single pass per file: each chunk enters the index exactly once,
        # whether its vector came from the cache or from a fresh embed call.
        for chunk, vec in cached:
            all_chunks.append(chunk)
            # chunk_id is the running count after the append (1-based),
            # kept as-is for compatibility with existing metadata readers.
            metadata.append({"source": file, "chunk_id": len(all_chunks)})
            vectors.append(vec)

    print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)")

    if not vectors:
        print("No content found.")
        return

    # Persist the cache before touching the index so embeddings that were
    # just computed survive a failure while building or writing the index.
    save_cache(new_cache)

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

    os.makedirs("index", exist_ok=True)
    faiss.write_index(index, "index/index.faiss")
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    print("Index built.")