-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathbuild_index.py
More file actions
75 lines (62 loc) · 2.28 KB
/
build_index.py
File metadata and controls
75 lines (62 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# build_index.py
import os
from dotenv import load_dotenv
from langchain_community.embeddings import HuggingFaceEmbeddings # FREE!
from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.schema import Document
# -------------------------------
# Load environment (for later use)
# -------------------------------
# Loads variables from a local .env file (e.g. API keys used by downstream
# query scripts). The local HuggingFace embeddings below need no key.
load_dotenv()

# -------------------------------
# Paths
# -------------------------------
DATA_DIR = "data"                          # input folder of .txt source files
EMBEDDINGS_DIR = "embeddings/faiss_index"  # where the persisted index is written

# -------------------------------
# Step 1: Read all text files in data/
# -------------------------------
# sorted() makes traversal order deterministic across filesystems, so the
# chunk order — and therefore the saved index — is reproducible run to run.
documents = []
for filename in sorted(os.listdir(DATA_DIR)):
    if filename.endswith(".txt"):
        filepath = os.path.join(DATA_DIR, filename)
        with open(filepath, "r", encoding="utf-8") as f:
            text = f.read()
        documents.append(Document(
            page_content=text,
            # Keep the source filename so retrieval results can cite provenance.
            metadata={"source": filename}
        ))

if not documents:
    raise ValueError(f"No text files found in {DATA_DIR}!")
print(f"📄 Found {len(documents)} text files")

# -------------------------------
# Step 2: Split text into chunks
# -------------------------------
# 1000-char chunks with 100-char overlap; the separator list makes the
# splitter prefer paragraph, then line, then word boundaries before
# falling back to a hard character cut.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=100,
    separators=["\n\n", "\n", " ", ""]
)
docs = text_splitter.split_documents(documents)
print(f"✂️ Split into {len(docs)} chunks")

# -------------------------------
# Step 3: Create FREE local embeddings
# -------------------------------
# all-MiniLM-L6-v2 runs locally (downloaded on first use) — no API key or
# per-token cost.
print("🔄 Loading HuggingFace embeddings (first time will download model)...")
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
print("🔄 Creating embeddings...")

# -------------------------------
# Step 4: Build FAISS index
# -------------------------------
# Embeds every chunk and builds the in-memory FAISS similarity index.
vectorstore = FAISS.from_documents(docs, embeddings)
print("🔨 FAISS index built")

# -------------------------------
# Step 5: Save FAISS index locally
# -------------------------------
os.makedirs(EMBEDDINGS_DIR, exist_ok=True)
vectorstore.save_local(EMBEDDINGS_DIR)
print(f"✅ FAISS index saved to {EMBEDDINGS_DIR}")
print(f"📊 Total chunks indexed: {len(docs)}")