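"""Build a FAISS vector index over local text files.

Reads the source directories listed in ``sources.yaml``, splits every
matching ``.md`` / ``.txt`` / ``.py`` file into fixed-size character chunks,
embeds each chunk with the OpenAI embeddings API, and writes the result to
``index/index.faiss`` (vectors) and ``index/meta.pkl`` (chunks + metadata).
"""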
import os
|
|
import pickle
|
|
from pathlib import Path
|
|
|
|
import yaml
|
|
import faiss
|
|
import numpy as np
|
|
from tqdm import tqdm
|
|
from dotenv import load_dotenv
|
|
|
|
from openai import OpenAI
|
|
|
|
# Load variables from a local .env file into the process environment first,
# so they are already present when the API client below is constructed.
load_dotenv()

# Shared OpenAI client, created once with default settings and used by embed().
client = OpenAI()

# Name of the embedding model sent with every embeddings request.
MODEL = "text-embedding-3-small"
# Number of characters in each chunk produced by chunk_text().
CHUNK_SIZE = 800
|
|
|
|
def read_sources():
    """Collect the paths of the files that should be indexed.

    Reads ``sources.yaml`` from the current working directory and walks every
    directory listed under its ``sources`` key. A file is kept when its name
    ends in ``.md``, ``.txt`` or ``.py`` and it is smaller than 200,000 bytes.
    Directories named ``.git``, ``__pycache__``, ``.venv`` and ``index`` are
    never descended into.

    Returns:
        list[str]: Matching file paths, each built with ``os.path.join`` from
        the walked directory and the file name. Empty when the configuration
        is empty or lists no sources.

    Raises:
        FileNotFoundError: If ``sources.yaml`` does not exist.
    """
    skipped_dirs = {".git", "__pycache__", ".venv", "index"}
    wanted_suffixes = (".md", ".txt", ".py")
    max_bytes = 200_000  # files of this size or larger are skipped

    with open("sources.yaml", encoding="utf-8") as f:
        # An empty YAML document loads as None; treat it as "no sources"
        # instead of failing on the subscript below.
        cfg = yaml.safe_load(f) or {}

    files = []
    for src in cfg.get("sources") or []:
        for root, dirs, filenames in os.walk(src):
            # Assign in place so os.walk itself skips the pruned directories;
            # sorting keeps the traversal order reproducible between runs.
            dirs[:] = sorted(d for d in dirs if d not in skipped_dirs)

            for name in sorted(filenames):
                if not name.endswith(wanted_suffixes):
                    continue

                path = os.path.join(root, name)
                try:
                    size = os.path.getsize(path)
                except OSError:
                    # Dangling symlink, or the file vanished during the walk:
                    # skip it rather than abort the whole build.
                    continue

                if size < max_bytes:
                    files.append(path)

    return files
|
|
|
|
|
|
def chunk_text(text, chunk_size=None):
    """Split *text* into consecutive, non-overlapping pieces.

    Args:
        text: The string to split.
        chunk_size: Maximum number of characters per piece. When omitted or
            ``None``, the module-level ``CHUNK_SIZE`` is used.

    Returns:
        list[str]: The pieces in order. Every piece has exactly the chunk
        size except possibly the last, which holds the remainder. An empty
        *text* yields an empty list.

    Raises:
        ValueError: If the effective chunk size is not positive.
    """
    size = CHUNK_SIZE if chunk_size is None else chunk_size
    if size <= 0:
        # A negative step would silently produce no chunks at all.
        raise ValueError(f"chunk_size must be a positive integer, got {size!r}")

    return [text[start:start + size] for start in range(0, len(text), size)]
|
|
|
|
|
|
def embed(text):
    """Return the embedding vector for *text*.

    Sends *text* to the embeddings endpoint of the module-level ``client``
    using the model named by ``MODEL``, and returns the ``embedding`` of the
    first entry in the response's ``data``.
    """
    response = client.embeddings.create(input=text, model=MODEL)
    first_item = response.data[0]
    return first_item.embedding
|
|
|
|
|
|
def main():
    """Build the vector index and write it to the ``index`` directory.

    Steps:
      1. Collect the source files with ``read_sources()``.
      2. Read each file and split it into chunks with ``chunk_text()``.
      3. Embed every chunk with ``embed()`` (one request per chunk).
      4. Store the vectors in a ``faiss.IndexFlatL2`` and write it to
         ``index/index.faiss``.
      5. Pickle the ``(all_chunks, metadata)`` tuple to ``index/meta.pkl``.

    Raises:
        SystemExit: If the sources produced no chunks, so there is nothing
            to embed or index.
    """
    files = read_sources()

    all_chunks = []
    metadata = []

    for file in files:
        # Decode as UTF-8 regardless of the platform default; bytes that do
        # not decode are dropped instead of aborting the build.
        text = Path(file).read_text(encoding="utf-8", errors="ignore")

        for c in chunk_text(text):
            all_chunks.append(c)
            # NOTE(review): chunk_id is taken after the append, so it is
            # 1-based, while the chunk's position in all_chunks (and its row
            # in the index below) is 0-based. Left unchanged because readers
            # of meta.pkl are not visible here -- confirm which is intended.
            metadata.append({
                "source": file,
                "chunk_id": len(all_chunks),
            })

    print("Embedding", len(all_chunks), "chunks")

    if not all_chunks:
        # Without any vector the index dimension cannot be determined.
        raise SystemExit(
            "No chunks to embed: check sources.yaml and the configured "
            "directories."
        )

    vectors = [embed(chunk) for chunk in tqdm(all_chunks)]

    # Convert straight to float32; row i holds the embedding of all_chunks[i].
    matrix = np.asarray(vectors, dtype="float32")
    index = faiss.IndexFlatL2(matrix.shape[1])
    index.add(matrix)

    os.makedirs("index", exist_ok=True)

    faiss.write_index(index, "index/index.faiss")

    # Pickle is acceptable for a locally produced artifact; whoever loads
    # meta.pkl must only do so from a trusted location.
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    print("Index built.")
|
|
|
|
|
|
# Run the build only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()