feat: add incremental embedding; unchanged files now reuse cached embeddings to save on tokens.
fix: fixed an early-return bug in ingest.py.
This commit is contained in:
81
ingest.py
81
ingest.py
@@ -1,3 +1,4 @@
|
|||||||
|
import hashlib
|
||||||
import os
|
import os
|
||||||
import pickle
|
import pickle
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -16,6 +17,8 @@ client = OpenAI()
|
|||||||
|
|
||||||
MODEL = "text-embedding-3-small"
|
MODEL = "text-embedding-3-small"
|
||||||
CHUNK_SIZE = 800
|
CHUNK_SIZE = 800
|
||||||
|
CACHE_PATH = "index/embed_cache.pkl"
|
||||||
|
|
||||||
|
|
||||||
def read_sources():
|
def read_sources():
|
||||||
with open("sources.yaml") as f:
|
with open("sources.yaml") as f:
|
||||||
@@ -25,7 +28,6 @@ def read_sources():
|
|||||||
|
|
||||||
for src in cfg["sources"]:
|
for src in cfg["sources"]:
|
||||||
for root, dirs, filenames in os.walk(src):
|
for root, dirs, filenames in os.walk(src):
|
||||||
|
|
||||||
dirs[:] = [d for d in dirs if d not in [
|
dirs[:] = [d for d in dirs if d not in [
|
||||||
".git",
|
".git",
|
||||||
"__pycache__",
|
"__pycache__",
|
||||||
@@ -35,17 +37,22 @@ def read_sources():
|
|||||||
for name in filenames:
|
for name in filenames:
|
||||||
if name.endswith((".md", ".txt", ".py")):
|
if name.endswith((".md", ".txt", ".py")):
|
||||||
path = os.path.join(root, name)
|
path = os.path.join(root, name)
|
||||||
|
if os.path.getsize(path) < 200_000:
|
||||||
if os.path.getsize(path) < 200_000: # skip >200KB
|
|
||||||
files.append(path)
|
files.append(path)
|
||||||
|
|
||||||
return files
|
return files
|
||||||
|
|
||||||
|
|
||||||
|
def file_hash(path):
    """Return the hexadecimal MD5 digest of the bytes stored at *path*.

    The digest is used only as a change-detection key for the embedding
    cache, not for any security purpose.

    Args:
        path: Filesystem path (``str`` or ``os.PathLike``) of the file.

    Returns:
        The 32-character lowercase hex digest of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    # Stream the file in fixed-size blocks so memory use stays bounded
    # regardless of the file's size.
    with open(path, "rb") as f:
        for block in iter(lambda: f.read(65536), b""):
            digest.update(block)
    return digest.hexdigest()
|
||||||
|
|
||||||
|
|
||||||
def chunk_text(text, size=None):
    """Split *text* into consecutive, non-overlapping chunks.

    Args:
        text: The string to split.
        size: Maximum number of characters per chunk. When omitted, the
            module-level ``CHUNK_SIZE`` is used, which matches the
            behaviour callers relied on before this parameter existed.

    Returns:
        A list of substrings in their original order. Every chunk holds
        ``size`` characters except possibly the last; an empty *text*
        yields an empty list.

    Raises:
        ValueError: If the effective chunk size is not positive.
    """
    if size is None:
        size = CHUNK_SIZE
    if size <= 0:
        # A zero step makes range() raise and a negative one silently
        # yields nothing; fail loudly and uniformly instead.
        raise ValueError("chunk size must be a positive integer")
    return [text[start:start + size] for start in range(0, len(text), size)]
|
||||||
|
|
||||||
|
|
||||||
@@ -57,41 +64,73 @@ def embed(text):
|
|||||||
return res.data[0].embedding
|
return res.data[0].embedding
|
||||||
|
|
||||||
|
|
||||||
|
def load_cache():
    """Load the embedding cache stored at ``CACHE_PATH``.

    Cache format: ``{file_hash: [(chunk_text, vector), ...]}``.

    Returns:
        The cached dict, or an empty dict when the cache file is missing,
        unreadable as a pickle, or does not contain a dict. Falling back
        to an empty cache only costs a re-embed; it never aborts the run.
    """
    # NOTE(review): pickle.load can execute arbitrary code. This is only
    # safe while the cache file is produced locally by save_cache(); do not
    # point CACHE_PATH at files from an untrusted source.
    try:
        with open(CACHE_PATH, "rb") as f:
            cache = pickle.load(f)
    except FileNotFoundError:
        # First run (or cache deleted): nothing to reuse yet.
        return {}
    except (pickle.UnpicklingError, EOFError, AttributeError,
            ImportError, IndexError) as err:
        # A truncated or corrupt cache must not block indexing.
        print(f"Ignoring unreadable embedding cache {CACHE_PATH}: {err}")
        return {}

    if not isinstance(cache, dict):
        print(f"Ignoring embedding cache {CACHE_PATH}: unexpected format")
        return {}
    return cache
|
||||||
|
|
||||||
|
|
||||||
|
def save_cache(cache):
    """Persist *cache* to ``CACHE_PATH`` as a pickle.

    Args:
        cache: Mapping of file hash to a list of ``(chunk_text, vector)``
            pairs, as produced by ``main``.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    # Derive the directory from CACHE_PATH so the two cannot drift apart
    # if the constant is ever changed.
    cache_dir = os.path.dirname(CACHE_PATH) or "."
    os.makedirs(cache_dir, exist_ok=True)

    # Write to a sibling temp file and swap it in, so an interrupted run
    # never leaves a truncated cache behind.
    tmp_path = CACHE_PATH + ".tmp"
    with open(tmp_path, "wb") as f:
        pickle.dump(cache, f)
    os.replace(tmp_path, CACHE_PATH)
|
||||||
|
|
||||||
|
|
||||||
def main():
    """Build the FAISS index, re-embedding only files whose content changed.

    For every file returned by ``read_sources`` the content hash is looked
    up in the embedding cache. Cache hits reuse the stored
    ``(chunk, vector)`` pairs; misses are chunked, embedded and recorded.
    The index is written to ``index/index.faiss``, the chunk texts and
    metadata to ``index/meta.pkl``, and the cache is then rewritten with
    only the entries for files seen in this run.

    If embedding is interrupted (API error, Ctrl-C), the vectors computed
    so far are merged into the existing cache and saved before the
    exception propagates, so the next run does not pay for them again.
    """
    files = read_sources()
    cache = load_cache()
    new_cache = {}

    all_chunks = []
    metadata = []
    vectors = []

    files_new = 0
    files_cached = 0

    try:
        for file in files:
            # NOTE(review): the key is the content hash only. Changing
            # MODEL or CHUNK_SIZE will still hit old entries; delete the
            # cache file after changing either.
            h = file_hash(file)

            if h in cache:
                # Reuse cached embeddings.
                cached = cache[h]
                files_cached += 1
            else:
                # Embed and cache.
                text = Path(file).read_text(errors="ignore")
                cached = [(chunk, embed(chunk)) for chunk in chunk_text(text)]
                files_new += 1

            new_cache[h] = cached

            for chunk, vec in cached:
                all_chunks.append(chunk)
                # NOTE(review): chunk_id is taken after the append, so it
                # is 1-based across all files. Kept as-is because readers
                # of meta.pkl may depend on it.
                metadata.append({"source": file, "chunk_id": len(all_chunks)})
                vectors.append(vec)
    except BaseException:
        # Keep both the old entries and whatever was embedded before the
        # failure; pruning stale entries only happens on a complete run.
        save_cache({**cache, **new_cache})
        raise

    print(f"Files: {files_cached} cached, {files_new} re-embedded ({len(all_chunks)} chunks total)")

    if not vectors:
        print("No content found.")
        return

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

    os.makedirs("index", exist_ok=True)

    faiss.write_index(index, "index/index.faiss")

    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    # Saving new_cache (not the merged dict) drops entries for files that
    # were deleted or changed since the previous run.
    save_cache(new_cache)
    print("Index built.")
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user