init commit: v0.1.0
This commit is contained in:
0
.env.example
Normal file
0
.env.example
Normal file
6
.gitignore
vendored
Normal file
6
.gitignore
vendored
Normal file
@@ -0,0 +1,6 @@
|
||||
index/
|
||||
*.faiss
|
||||
*.pkl
|
||||
__pycache__/
|
||||
*.env
|
||||
.venv/
|
||||
99
ingest.py
Normal file
99
ingest.py
Normal file
@@ -0,0 +1,99 @@
|
||||
import os
import pickle
from pathlib import Path

import yaml
import faiss
import numpy as np
from tqdm import tqdm
from dotenv import load_dotenv

from openai import OpenAI

# Pull OPENAI_API_KEY (and any other settings) from a local .env file
# before the client below is constructed.
load_dotenv()

# Module-level OpenAI client; reads its API key from the environment.
client = OpenAI()

# Embedding model applied to every chunk. query.py must use the same
# model or query vectors will not live in the same embedding space.
MODEL = "text-embedding-3-small"
# Chunk width in characters (not tokens).
CHUNK_SIZE = 800
|
||||
|
||||
def read_sources(config_path="sources.yaml"):
    """Collect the files to index from the configured source roots.

    Reads the YAML config at *config_path*, walks every directory listed
    under its "sources" key, and gathers .md/.txt/.py files smaller than
    200 KB.

    Parameters:
        config_path: path to the YAML config file; it must contain a
            top-level "sources" list of directories. Defaults to the
            original hard-coded "sources.yaml", so existing callers are
            unaffected.

    Returns:
        list[str]: paths of files eligible for ingestion.
    """
    with open(config_path) as f:
        cfg = yaml.safe_load(f)

    files = []
    for src in cfg["sources"]:
        for root, dirs, filenames in os.walk(src):
            # Prune noisy directories in place so os.walk never descends
            # into them (set membership is also O(1) per check).
            dirs[:] = [d for d in dirs
                       if d not in {".git", "__pycache__", ".venv", "index"}]
            for name in filenames:
                if name.endswith((".md", ".txt", ".py")):
                    path = os.path.join(root, name)
                    # Keep embedding cost bounded.
                    if os.path.getsize(path) < 200_000:  # skip >200KB
                        files.append(path)
    return files
|
||||
|
||||
|
||||
def chunk_text(text, size=None):
    """Split *text* into consecutive fixed-width chunks.

    Parameters:
        text: the string to split.
        size: chunk width in characters; when omitted, falls back to the
            module-level CHUNK_SIZE (keeps the original one-argument call
            signature working).

    Returns:
        list[str]: slices of *text* in order; the last slice may be
        shorter. Empty input yields an empty list.
    """
    if size is None:
        size = CHUNK_SIZE
    return [text[i:i + size] for i in range(0, len(text), size)]
|
||||
|
||||
|
||||
def embed(text):
    """Return the embedding vector for *text* using the configured model."""
    response = client.embeddings.create(input=text, model=MODEL)
    return response.data[0].embedding
|
||||
|
||||
|
||||
def main():
    """Build the FAISS index: read sources, chunk, embed, and persist.

    Writes index/index.faiss (the vector index) and index/meta.pkl
    (a (chunks, metadata) tuple) for query.py to load.
    """
    files = read_sources()

    all_chunks = []
    metadata = []

    for file in files:
        # errors="ignore" tolerates stray non-UTF-8 bytes in source files.
        text = Path(file).read_text(errors="ignore")
        for c in chunk_text(text):
            all_chunks.append(c)
            metadata.append({
                "source": file,
                # 0-based position in all_chunks, so metadata[i] pairs with
                # all_chunks[i]. (Previously this stored len() *after*
                # append — a 1-based value that did not match the list
                # index used at query time.)
                "chunk_id": len(all_chunks) - 1,
            })

    if not all_chunks:
        # Nothing matched the source globs — bail out instead of crashing
        # on vectors[0] below.
        print("No chunks found; nothing to index.")
        return

    print("Embedding", len(all_chunks), "chunks")

    vectors = []
    for chunk in tqdm(all_chunks):
        vectors.append(embed(chunk))

    dim = len(vectors[0])
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(vectors).astype("float32"))

    os.makedirs("index", exist_ok=True)

    faiss.write_index(index, "index/index.faiss")

    with open("index/meta.pkl", "wb") as f:
        pickle.dump((all_chunks, metadata), f)

    print("Index built.")
|
||||
|
||||
|
||||
# Build the index when run as a script (not on import).
if __name__ == "__main__":
    main()
|
||||
43
query.py
Normal file
43
query.py
Normal file
@@ -0,0 +1,43 @@
|
||||
|
||||
import pickle

import numpy as np
import faiss
from dotenv import load_dotenv
from openai import OpenAI

# Load OPENAI_API_KEY from .env before constructing the client.
load_dotenv()
client = OpenAI()

# Must match the model used by ingest.py, or query vectors will not be
# comparable to the indexed vectors.
MODEL = "text-embedding-3-small"
|
||||
|
||||
|
||||
def embed(text):
    """Embed *text* with the configured model and return its vector."""
    result = client.embeddings.create(model=MODEL, input=text)
    first = result.data[0]
    return first.embedding
|
||||
|
||||
|
||||
# Load the persisted FAISS index and chunk metadata at import time so
# search() can use them directly. Both files are written by ingest.py.
index = faiss.read_index("index/index.faiss")

# meta.pkl holds a (chunks, metadata) tuple; positions in `chunks` and
# `meta` line up with row ids in the FAISS index.
# NOTE(review): pickle.load is only acceptable because we produced this
# file ourselves — never load untrusted pickles.
with open("index/meta.pkl", "rb") as f:
    chunks, meta = pickle.load(f)
|
||||
|
||||
|
||||
def search(query, k=5):
    """Embed *query*, find its k nearest chunks, and print them.

    Parameters:
        query: free-text search string.
        k: number of neighbours to request from the index (default 5).

    Prints, for each hit, a separator, the source file path, and the
    first 500 characters of the matching chunk.
    """
    qvec = np.array([embed(query)]).astype("float32")

    distances, ids = index.search(qvec, k)

    for i in ids[0]:
        # FAISS pads the result with -1 when the index holds fewer than k
        # vectors; indexing meta/chunks with -1 would silently print the
        # LAST entry, so skip these sentinels explicitly.
        if i == -1:
            continue
        print("\n----")
        print(meta[i]["source"])
        print(chunks[i][:500])
|
||||
|
||||
|
||||
if __name__ == "__main__":
    import sys

    # Join all CLI args into one query: `python query.py my question`.
    query = " ".join(sys.argv[1:])
    if not query:
        # Don't spend an embedding API call on an empty query.
        print("usage: python query.py <query text>")
        sys.exit(1)
    search(query)
|
||||
5
requirements.txt
Normal file
5
requirements.txt
Normal file
@@ -0,0 +1,5 @@
|
||||
openai
|
||||
faiss-cpu
|
||||
pyyaml
|
||||
tqdm
|
||||
python-dotenv
|
||||
4
sources.yaml
Normal file
4
sources.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
sources:
|
||||
- ../seismo-relay/docs
|
||||
- ../seismo-relay/parsers
|
||||
- ../series3-agent
|
||||
Reference in New Issue
Block a user