-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_index.py
More file actions
75 lines (62 loc) · 2.28 KB
/
build_index.py
File metadata and controls
75 lines (62 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# build_index.py
import os
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings # FREE!
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# -------------------------------
# Load environment (for later use)
# -------------------------------
# Loads variables from a local .env file (e.g. API keys used by downstream
# query scripts). The local HuggingFace embeddings below need no key.
load_dotenv()

# -------------------------------
# Paths
# -------------------------------
DATA_DIR = "data"                          # input folder of .txt source files
EMBEDDINGS_DIR = "embeddings/faiss_index"  # where the persisted index is written

# -------------------------------
# Step 1: Read all text files in data/
# -------------------------------
# sorted() makes traversal order deterministic across filesystems, so the
# chunk order — and therefore the saved index — is reproducible run to run.
documents = []
for filename in sorted(os.listdir(DATA_DIR)):
    if filename.endswith(".txt"):
        filepath = os.path.join(DATA_DIR, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        documents.append(Document(
            page_content=text,
            # Keep the source filename so retrieval results can cite provenance.
            metadata={"source": filename}
        ))

if not documents:
    raise ValueError(f"No text files found in {DATA_DIR}!")
print(f"📄 Found {len(documents)} text files")

# -------------------------------
# Step 2: Split text into chunks
# -------------------------------
# 1000-char chunks with 100-char overlap; the separator list makes the
# splitter prefer paragraph, then line, then word boundaries before
# falling back to a hard character cut.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)
docs = text_splitter.split_documents(documents)
print(f"✂️ Split into {len(docs)} chunks")

# -------------------------------
# Step 3: Create FREE local embeddings
# -------------------------------
# all-MiniLM-L6-v2 runs locally (downloaded on first use) — no API key or
# per-token cost.
print("🔄 Loading HuggingFace embeddings (first time will download model)...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print("🔄 Creating embeddings...")

# -------------------------------
# Step 4: Build FAISS index
# -------------------------------
# Embeds every chunk and builds the in-memory FAISS similarity index.
vectorstore = FAISS.from_documents(docs, embeddings)
print("🔨 FAISS index built")

# -------------------------------
# Step 5: Save FAISS index locally
# -------------------------------
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
vectorstore.save_local(EMBEDDINGS_DIR)
print(f"✅ FAISS index saved to {EMBEDDINGS_DIR}")
print(f"📊 Total chunks indexed: {len(docs)}")