init commit: v0.1.0
This commit is contained in:
0
.env.example
Normal file
0
.env.example
Normal file
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
index/
|
||||
*.faiss
|
||||
*.pkl
|
||||
__pycache__/
|
||||
*.env
|
||||
.venv/
|
||||
99
ingest.py
Normal file
99
ingest.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import os
import pickle
from pathlib import Path

import yaml
import faiss
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv

from openai import OpenAI

# Pull OPENAI_API_KEY (and any other settings) from a local .env file
# before the client below is constructed.
load_dotenv()

# Module-level OpenAI client; reads its API key from the environment.
client = OpenAI()

# Embedding model applied to every chunk. query.py must use the same
# model or query vectors will not live in the same embedding space.
MODEL = "text-embedding-3-small"
# Chunk width in characters (not tokens).
CHUNK_SIZE = 800
|
||||
|
||||
def read_sources(config_path="sources.yaml"):
    """Collect the files to index from the configured source roots.

    Reads the YAML config at *config_path*, walks every directory listed
    under its "sources" key, and gathers .md/.txt/.py files smaller than
    200 KB.

    Parameters:
        config_path: path to the YAML config file; it must contain a
            top-level "sources" list of directories. Defaults to the
            original hard-coded "sources.yaml", so existing callers are
            unaffected.

    Returns:
        list[str]: paths of files eligible for ingestion.
    """
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    files = []
    for src in cfg["sources"]:
        for root, dirs, filenames in os.walk(src):
            # Prune noisy directories in place so os.walk never descends
            # into them (set membership is also O(1) per check).
            dirs[:] = [d for d in dirs
                       if d not in {".git", "__pycache__", ".venv", "index"}]
            for name in filenames:
                if name.endswith((".md", ".txt", ".py")):
                    path = os.path.join(root, name)
                    # Keep embedding cost bounded.
                    if os.path.getsize(path) < 200_000:  # skip >200KB
                        files.append(path)
    return files
|
||||
|
||||
|
||||
def chunk_text(text, size=None):
    """Split *text* into consecutive fixed-width chunks.

    Parameters:
        text: the string to split.
        size: chunk width in characters; when omitted, falls back to the
            module-level CHUNK_SIZE (keeps the original one-argument call
            signature working).

    Returns:
        list[str]: slices of *text* in order; the last slice may be
        shorter. Empty input yields an empty list.
    """
    if size is None:
        size = CHUNK_SIZE
    return [text[i:i + size] for i in range(0, len(text), size)]
|
||||
|
||||
|
||||
def embed(text):
    """Return the embedding vector for *text* using the configured model."""
    response = client.embeddings.create(input=text, model=MODEL)
    return response.data[0].embedding
|
||||
|
||||
|
||||
def main():
    """Build the FAISS index: read sources, chunk, embed, and persist.

    Writes index/index.faiss (the vector index) and index/meta.pkl
    (a (chunks, metadata) tuple) for query.py to load.
    """
    files = read_sources()

    all_chunks = []
    metadata = []

    for file in files:
        # errors="ignore" tolerates stray non-UTF-8 bytes in source files.
        text = Path(file).read_text(errors="ignore")
        for c in chunk_text(text):
            all_chunks.append(c)
            metadata.append({
                "source": file,
                # 0-based position in all_chunks, so metadata[i] pairs with
                # all_chunks[i]. (Previously this stored len() *after*
                # append — a 1-based value that did not match the list
                # index used at query time.)
                "chunk_id": len(all_chunks) - 1,
            })

    if not all_chunks:
        # Nothing matched the source globs — bail out instead of crashing
        # on vectors[0] below.
        print("No chunks found; nothing to index.")
        return

    print("Embedding", len(all_chunks), "chunks")

    vectors = []
    for chunk in tqdm(all_chunks):
        vectors.append(embed(chunk))

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

    os.makedirs("index", exist_ok=True)

    faiss.write_index(index, "index/index.faiss")

    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    print("Index built.")
|
||||
|
||||
|
||||
# Build the index when run as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
43
query.py
Normal file
43
query.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
import pickle

import numpy as np
import faiss
from dotenv import load_dotenv
from openai import OpenAI

# Load OPENAI_API_KEY from .env before constructing the client.
load_dotenv()
client = OpenAI()

# Must match the model used by ingest.py, or query vectors will not be
# comparable to the indexed vectors.
MODEL = "text-embedding-3-small"
|
||||
|
||||
|
||||
def embed(text):
    """Embed *text* with the configured model and return its vector."""
    result = client.embeddings.create(model=MODEL, input=text)
    first = result.data[0]
    return first.embedding
|
||||
|
||||
|
||||
# Load the persisted FAISS index and chunk metadata at import time so
# search() can use them directly. Both files are written by ingest.py.
index = faiss.read_index("index/index.faiss")

# meta.pkl holds a (chunks, metadata) tuple; positions in `chunks` and
# `meta` line up with row ids in the FAISS index.
# NOTE(review): pickle.load is only acceptable because we produced this
# file ourselves — never load untrusted pickles.
with open("index/meta.pkl", "rb") as f:
    chunks, meta = pickle.load(f)
|
||||
|
||||
|
||||
def search(query, k=5):
    """Embed *query*, find its k nearest chunks, and print them.

    Parameters:
        query: free-text search string.
        k: number of neighbours to request from the index (default 5).

    Prints, for each hit, a separator, the source file path, and the
    first 500 characters of the matching chunk.
    """
    qvec = np.array([embed(query)]).astype("float32")

    distances, ids = index.search(qvec, k)

    for i in ids[0]:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; indexing meta/chunks with -1 would silently print the
        # LAST entry, so skip these sentinels explicitly.
        if i == -1:
            continue
        print("\n----")
        print(meta[i]["source"])
        print(chunks[i][:500])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Join all CLI args into one query: `python query.py my question`.
    query = " ".join(sys.argv[1:])
    if not query:
        # Don't spend an embedding API call on an empty query.
        print("usage: python query.py <query text>")
        sys.exit(1)
    search(query)
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
openai
|
||||
faiss-cpu
|
||||
pyyaml
|
||||
tqdm
|
||||
python-dotenv
|
||||
4
sources.yaml
Normal file
4
sources.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
sources:
|
||||
- ../seismo-relay/docs
|
||||
- ../seismo-relay/parsers
|
||||
- ../series3-agent
|
||||
Reference in New Issue
Block a user