commit 443f81425a8e25a68540bbc0dcc6d9acbd5b479d
Author: serversdwn
Date:   Wed Mar 4 17:20:29 2026 -0500

    init commit: v0.1.0

diff --git a/.env.example b/.env.example
new file mode 100644
index 0000000..e69de29
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..d089117
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+index/
+*.faiss
+*.pkl
+__pycache__/
+*.env
+.venv/
\ No newline at end of file
diff --git a/ingest.py b/ingest.py
new file mode 100644
index 0000000..65cd63d
--- /dev/null
+++ b/ingest.py
@@ -0,0 +1,99 @@
+"""Build a FAISS index over local text/markdown/python files.
+
+Reads source roots from sources.yaml, chunks each file, embeds the chunks
+with the OpenAI embeddings API, and writes index/index.faiss plus a pickle
+of (chunks, metadata).
+"""
+import os
+import pickle
+from pathlib import Path
+
+import yaml
+import faiss
+import numpy as np
+from tqdm import tqdm
+from dotenv import load_dotenv
+
+from openai import OpenAI
+
+load_dotenv()
+
+client = OpenAI()
+
+MODEL = "text-embedding-3-small"
+CHUNK_SIZE = 800
+
+
+def read_sources():
+    """Return paths of .md/.txt/.py files under each root in sources.yaml."""
+    with open("sources.yaml") as f:
+        cfg = yaml.safe_load(f)
+
+    files = []
+
+    for src in cfg["sources"]:
+        for root, dirs, filenames in os.walk(src):
+            # Prune in place so os.walk never descends into these dirs.
+            dirs[:] = [d for d in dirs if d not in [
+                ".git",
+                "__pycache__",
+                ".venv",
+                "index"
+            ]]
+            for name in filenames:
+                if name.endswith((".md", ".txt", ".py")):
+                    path = os.path.join(root, name)
+                    if os.path.getsize(path) < 200_000:  # skip >200KB
+                        files.append(path)
+
+    return files
+
+
+def chunk_text(text):
+    """Split text into fixed-size character chunks (no overlap)."""
+    chunks = []
+    for i in range(0, len(text), CHUNK_SIZE):
+        chunks.append(text[i:i+CHUNK_SIZE])
+    return chunks
+
+
+def embed(text):
+    """Embed a single string; returns a list[float] vector."""
+    res = client.embeddings.create(
+        model=MODEL,
+        input=text
+    )
+    return res.data[0].embedding
+
+
+def main():
+    files = read_sources()
+
+    all_chunks = []
+    metadata = []
+
+    for file in files:
+        text = Path(file).read_text(errors="ignore")
+        chunks = chunk_text(text)
+
+        for c in chunks:
+            all_chunks.append(c)
+            metadata.append({
+                "source": file,
+                # Fix: len() is taken after the append above, so this
+                # chunk's list index is len(all_chunks) - 1, not len().
+                "chunk_id": len(all_chunks) - 1
+            })
+
+    # Fix: an empty corpus previously crashed on len(vectors[0]).
+    if not all_chunks:
+        print("No chunks found; nothing to index.")
+        return
+
+    print("Embedding", len(all_chunks), "chunks")
+
+    vectors = []
+
+    # NOTE: one API call per chunk; the embeddings endpoint also accepts a
+    # list input, so batching here would be much faster on large corpora.
+    for chunk in tqdm(all_chunks):
+        vectors.append(embed(chunk))
+
+    dim = len(vectors[0])
+    index = faiss.IndexFlatL2(dim)
+    index.add(np.array(vectors).astype("float32"))
+
+    os.makedirs("index", exist_ok=True)
+
+    faiss.write_index(index, "index/index.faiss")
+
+    with open("index/meta.pkl", "wb") as f:
+        pickle.dump((all_chunks, metadata), f)
+
+    print("Index built.")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/query.py b/query.py
new file mode 100644
index 0000000..d312e98
--- /dev/null
+++ b/query.py
@@ -0,0 +1,43 @@
+"""Query the FAISS index built by ingest.py and print the top matches."""
+import pickle
+
+import numpy as np
+import faiss
+from dotenv import load_dotenv
+from openai import OpenAI
+
+load_dotenv()
+client = OpenAI()
+
+MODEL = "text-embedding-3-small"
+
+
+def embed(text):
+    """Embed a single string; must use the same model as ingest.py."""
+    res = client.embeddings.create(
+        model=MODEL,
+        input=text
+    )
+    return res.data[0].embedding
+
+
+index = faiss.read_index("index/index.faiss")
+
+# NOTE: unpickling is only acceptable here because ingest.py wrote this
+# file locally; never load a meta.pkl from an untrusted source.
+with open("index/meta.pkl", "rb") as f:
+    chunks, meta = pickle.load(f)
+
+
+def search(query, k=5):
+    """Print source path and a 500-char preview for the k nearest chunks."""
+    qvec = np.array([embed(query)]).astype("float32")
+
+    distances, ids = index.search(qvec, k)
+
+    for i in ids[0]:
+        # Fix: faiss pads ids with -1 when the index holds fewer than k
+        # vectors; -1 would silently print the LAST chunk as a fake hit.
+        if i == -1:
+            continue
+        print("\n----")
+        print(meta[i]["source"])
+        print(chunks[i][:500])
+
+
+if __name__ == "__main__":
+    import sys
+    query = " ".join(sys.argv[1:])
+    # Fix: an empty query embedded "" and returned arbitrary neighbors.
+    if not query:
+        sys.exit("usage: python query.py <search terms>")
+    search(query)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ea44c76
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+openai
+faiss-cpu
+numpy
+pyyaml
+tqdm
+python-dotenv
\ No newline at end of file
diff --git a/sources.yaml b/sources.yaml
new file mode 100644
index 0000000..da892e3
--- /dev/null
+++ b/sources.yaml
@@ -0,0 +1,4 @@
+sources:
+  - ../seismo-relay/docs
+  - ../seismo-relay/parsers
+  - ../series3-agent
\ No newline at end of file