init commit: v0.1.0

This commit is contained in:
serversdwn
2026-03-04 17:20:29 -05:00
commit 443f81425a
6 changed files with 157 additions and 0 deletions

0
.env.example Normal file
View File

6
.gitignore vendored Normal file
View File

@@ -0,0 +1,6 @@
index/
*.faiss
*.pkl
__pycache__/
*.env
.venv/

99
ingest.py Normal file
View File

@@ -0,0 +1,99 @@
import os
import pickle
from pathlib import Path
import yaml
import faiss
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv
from openai import OpenAI
# Load OPENAI_API_KEY (and any other settings) from a local .env file.
load_dotenv()
client = OpenAI()  # picks up OPENAI_API_KEY from the environment
# Embedding model used to build the index; query.py must use the same one.
MODEL = "text-embedding-3-small"
# Fixed-size character window for naive, non-overlapping chunking.
CHUNK_SIZE = 800
def read_sources():
    """Collect the files to index from the roots listed in sources.yaml.

    Walks each source root, pruning VCS/cache/virtualenv/index directories,
    and keeps .md/.txt/.py files smaller than 200 KB.

    Returns:
        List of file paths, in os.walk order.
    """
    with open("sources.yaml") as fh:
        config = yaml.safe_load(fh)

    skip_dirs = {".git", "__pycache__", ".venv", "index"}
    keep_exts = (".md", ".txt", ".py")
    collected = []

    for root_dir in config["sources"]:
        for dirpath, dirnames, names in os.walk(root_dir):
            # Prune in place so os.walk never descends into skipped dirs.
            dirnames[:] = [d for d in dirnames if d not in skip_dirs]
            for fname in names:
                if not fname.endswith(keep_exts):
                    continue
                full = os.path.join(dirpath, fname)
                if os.path.getsize(full) < 200_000:  # skip files >= 200 KB
                    collected.append(full)
    return collected
def chunk_text(text, chunk_size=None):
    """Split *text* into consecutive, non-overlapping chunks.

    Args:
        text: The string to split.
        chunk_size: Window size in characters. Defaults to the module-level
            CHUNK_SIZE, resolved at call time so existing callers are
            unaffected and later changes to the constant stay live.

    Returns:
        List of chunk strings; the last chunk may be shorter than
        ``chunk_size``. Empty input yields an empty list.
    """
    if chunk_size is None:
        chunk_size = CHUNK_SIZE
    return [text[i:i + chunk_size] for i in range(0, len(text), chunk_size)]
def embed(text):
    """Return the embedding vector for *text* via the OpenAI API."""
    response = client.embeddings.create(model=MODEL, input=text)
    return response.data[0].embedding
def main():
    """Build the FAISS index: read sources, chunk, embed, persist to index/."""
    files = read_sources()
    all_chunks = []
    metadata = []
    for file in files:
        # errors="ignore" drops undecodable bytes rather than crashing
        # on the occasional non-UTF-8 file.
        text = Path(file).read_text(errors="ignore")
        for chunk_id, c in enumerate(chunk_text(text)):
            all_chunks.append(c)
            # chunk_id is the chunk's 0-based position within its own file.
            # (Previously this stored the running global count, i.e.
            # "global index + 1", which was not a per-file id.)
            metadata.append({
                "source": file,
                "chunk_id": chunk_id
            })
    if not all_chunks:
        # Guard the empty corpus: vectors[0] below would raise IndexError.
        print("No chunks found; nothing to index.")
        return
    print("Embedding", len(all_chunks), "chunks")
    vectors = [embed(chunk) for chunk in tqdm(all_chunks)]
    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))
    os.makedirs("index", exist_ok=True)
    faiss.write_index(index, "index/index.faiss")
    # Chunks and metadata are pickled side by side with the index; list
    # positions line up with FAISS vector ids, which query.py relies on.
    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)
    print("Index built.")
if __name__ == "__main__":
    main()

43
query.py Normal file
View File

@@ -0,0 +1,43 @@
import pickle
import numpy as np
import faiss
from dotenv import load_dotenv
from openai import OpenAI
# Load OPENAI_API_KEY from a local .env file before creating the client.
load_dotenv()
client = OpenAI()
# Must match the embedding model used at indexing time (see ingest.py).
MODEL = "text-embedding-3-small"
def embed(text):
    """Return the embedding vector for *text*, using the indexing model."""
    result = client.embeddings.create(input=text, model=MODEL)
    return result.data[0].embedding
# Load the persisted index and its parallel chunk/metadata lists at import
# time; ingest.py must have been run first or these files won't exist.
index = faiss.read_index("index/index.faiss")
with open("index/meta.pkl", "rb") as f:
    # NOTE: pickle.load is acceptable only because meta.pkl is produced
    # locally by ingest.py — never load pickles from untrusted sources.
    chunks, meta = pickle.load(f)
def search(query, k=5):
    """Print the top-*k* indexed chunks most similar to *query*.

    Args:
        query: Free-text search string.
        k: Number of nearest neighbours to request from the index.
    """
    qvec = np.array([embed(query)]).astype("float32")
    distances, ids = index.search(qvec, k)
    for i in ids[0]:
        # FAISS pads the result with id -1 when the index holds fewer than
        # k vectors; meta[-1]/chunks[-1] would then silently print the
        # last chunk instead of skipping.
        if i < 0:
            continue
        print("\n----")
        print(meta[i]["source"])
        print(chunks[i][:500])  # truncate long chunks for readability
if __name__ == "__main__":
    import sys
    query = " ".join(sys.argv[1:])
    if not query:
        # Without this, an argument-less run embeds the empty string —
        # a wasted API call that returns meaningless neighbours.
        sys.exit("usage: python query.py <search terms>")
    search(query)

5
requirements.txt Normal file
View File

@@ -0,0 +1,5 @@
openai
faiss-cpu
numpy
pyyaml
tqdm
python-dotenv

4
sources.yaml Normal file
View File

@@ -0,0 +1,4 @@
sources:
- ../seismo-relay/docs
- ../seismo-relay/parsers
- ../series3-agent