Skip to content

Commit 3b0db18

Browse files
committed
feat(ai): add BM25 keyword engine & hybrid search with RRF
1 parent c21392e commit 3b0db18

6 files changed

Lines changed: 586 additions & 0 deletions

File tree

Lines changed: 189 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,189 @@
1+
import { describe, it, expect, beforeEach } from "vitest";
2+
import { HybridSearchService, reciprocalRankFusion } from "../hybridSearch.js";
3+
import { KeywordSearchEngine } from "../keywordSearch.js";
4+
import { SemanticSearchService } from "../semanticSearch.js";
5+
import { EmbeddingService } from "../embeddings.js";
6+
import { VectorStore } from "../vectorStore.js";
7+
import type { EmbedFn } from "../embeddings.js";
8+
import type { SearchResult, TextChunk } from "../types.js";
9+
10+
11+
/** Topic-aware mock embedder — mirrors semanticSearch.test.ts convention. */
12+
function createMockEmbedFn(dimensions: number = 384): EmbedFn {
13+
return async (texts: string[]): Promise<number[][]> => {
14+
return texts.map((text) => {
15+
const vector = new Array(dimensions).fill(0);
16+
const lower = text.toLowerCase();
17+
if (lower.includes("machine learning") || lower.includes("neural")) {
18+
vector[0] = 0.8; vector[1] = 0.6;
19+
}
20+
if (lower.includes("cooking") || lower.includes("recipe")) {
21+
vector[2] = 0.8; vector[3] = 0.6;
22+
}
23+
if (lower.includes("typescript") || lower.includes("javascript")) {
24+
vector[4] = 0.8; vector[5] = 0.6;
25+
}
26+
const norm = Math.sqrt(vector.reduce((s, v) => s + v * v, 0));
27+
return norm > 0 ? vector.map((v) => v / norm) : vector;
28+
});
29+
};
30+
}
31+
32+
function makeResult(noteId: string, chunkIndex: number, score: number): SearchResult {
33+
const chunk: TextChunk = {
34+
id: `${noteId}:${chunkIndex}`,
35+
noteId,
36+
content: "dummy",
37+
chunkIndex,
38+
};
39+
return { chunk, score };
40+
}
41+
42+
function buildService(embedFn?: EmbedFn) {
43+
const embeddingService = new EmbeddingService({}, embedFn ?? createMockEmbedFn());
44+
const vectorStore = new VectorStore({
45+
persistDir: "/tmp/hybrid-test",
46+
indexFilename: "test-index.json",
47+
});
48+
const semantic = new SemanticSearchService(embeddingService, vectorStore);
49+
const keyword = new KeywordSearchEngine();
50+
return new HybridSearchService(semantic, keyword);
51+
}
52+
53+
describe("reciprocalRankFusion", () => {
54+
it("returns empty for empty inputs", () => {
55+
expect(reciprocalRankFusion([])).toEqual([]);
56+
expect(reciprocalRankFusion([{ results: [], weight: 1 }])).toEqual([]);
57+
});
58+
59+
it("accumulates scores for items appearing in multiple lists", () => {
60+
const list1 = [makeResult("n1", 0, 0.9), makeResult("n2", 0, 0.5)];
61+
const list2 = [makeResult("n2", 0, 0.8), makeResult("n3", 0, 0.4)];
62+
const fused = reciprocalRankFusion(
63+
[{ results: list1, weight: 1 }, { results: list2, weight: 1 }],
64+
60,
65+
);
66+
const n2 = fused.find((r) => r.chunk.noteId === "n2")!;
67+
const n1 = fused.find((r) => r.chunk.noteId === "n1")!;
68+
expect(n2.score).toBeGreaterThan(n1.score);
69+
});
70+
71+
it("includes items from disjoint lists", () => {
72+
const list1 = [makeResult("n1", 0, 0.9)];
73+
const list2 = [makeResult("n2", 0, 0.9)];
74+
const fused = reciprocalRankFusion(
75+
[{ results: list1, weight: 1 }, { results: list2, weight: 1 }],
76+
);
77+
const noteIds = fused.map((r) => r.chunk.noteId);
78+
expect(noteIds).toContain("n1");
79+
expect(noteIds).toContain("n2");
80+
});
81+
82+
it("respects weights — higher-weight list dominates when items differ", () => {
83+
// n1 is rank-1 in the low-weight list; n2 is rank-1 in the high-weight list
84+
const lowWeight = [makeResult("n1", 0, 0.99)];
85+
const highWeight = [makeResult("n2", 0, 0.99)];
86+
const fused = reciprocalRankFusion(
87+
[{ results: lowWeight, weight: 0.1 }, { results: highWeight, weight: 10 }],
88+
60,
89+
);
90+
expect(fused[0].chunk.noteId).toBe("n2");
91+
});
92+
93+
});
94+
95+
describe("HybridSearchService", () => {
96+
let service: HybridSearchService;
97+
98+
beforeEach(async () => {
99+
service = buildService();
100+
await service.initialize();
101+
});
102+
103+
describe("indexNote + search round-trip", () => {
104+
it("finds a note after indexing", async () => {
105+
await service.indexNote("ml-note", "machine learning and neural networks");
106+
const results = await service.search("machine learning");
107+
const noteIds = results.map((r) => r.chunk.noteId);
108+
expect(noteIds).toContain("ml-note");
109+
});
110+
111+
it("returns results from keyword-only match", async () => {
112+
// Index a note whose content won't produce a high semantic score
113+
// (all zeros in mock embedder), but has exact keyword matches.
114+
await service.indexNote("kw-note", "zygomorphic floriculture botany");
115+
const results = await service.search("zygomorphic floriculture", {
116+
topK: 5,
117+
});
118+
const noteIds = results.map((r) => r.chunk.noteId);
119+
expect(noteIds).toContain("kw-note");
120+
});
121+
122+
it("a note matching both modalities ranks at the top", async () => {
123+
await service.indexNote(
124+
"both-note",
125+
"machine learning neural networks deep learning",
126+
);
127+
await service.indexNote("kw-only-note", "zygomorphic floriculture exotic");
128+
const results = await service.search("machine learning");
129+
expect(results[0].chunk.noteId).toBe("both-note");
130+
});
131+
});
132+
133+
describe("search options", () => {
134+
beforeEach(async () => {
135+
await service.indexNote("ml-note", "machine learning and neural networks");
136+
await service.indexNote("cooking-note", "pasta cooking recipe dinner");
137+
await service.indexNote("ts-note", "typescript javascript programming");
138+
});
139+
140+
it("respects topK", async () => {
141+
const results = await service.search("general query text", { topK: 2 });
142+
expect(results.length).toBeLessThanOrEqual(2);
143+
});
144+
145+
it("filters results by noteId", async () => {
146+
const results = await service.search("any query", {
147+
noteId: "cooking-note",
148+
topK: 10,
149+
});
150+
expect(results.length).toBeGreaterThan(0);
151+
expect(results.every((r) => r.chunk.noteId === "cooking-note")).toBe(true);
152+
});
153+
154+
it("zeroing semanticWeight degrades to keyword-only ranking", async () => {
155+
const results = await service.search("machine learning", {
156+
semanticWeight: 0,
157+
keywordWeight: 1,
158+
topK: 5,
159+
});
160+
const noteIds = results.map((r) => r.chunk.noteId);
161+
expect(noteIds).toContain("ml-note");
162+
});
163+
164+
it("zeroing keywordWeight degrades to semantic-only ranking", async () => {
165+
const results = await service.search("machine learning", {
166+
semanticWeight: 1,
167+
keywordWeight: 0,
168+
topK: 5,
169+
});
170+
expect(results[0].chunk.noteId).toBe("ml-note");
171+
});
172+
});
173+
174+
describe("removeNote", () => {
175+
it("removed note does not appear in results", async () => {
176+
await service.indexNote("ml-note", "machine learning neural networks");
177+
service.removeNote("ml-note");
178+
const results = await service.search("machine learning", { topK: 10 });
179+
expect(results.every((r) => r.chunk.noteId !== "ml-note")).toBe(true);
180+
});
181+
});
182+
183+
describe("empty index", () => {
184+
it("returns empty array without crashing", async () => {
185+
const results = await service.search("anything");
186+
expect(results).toEqual([]);
187+
});
188+
});
189+
});
Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
1+
import { describe, it, expect, beforeEach } from "vitest";
2+
import { KeywordSearchEngine, tokenize } from "../keywordSearch.js";
3+
import type { TextChunk } from "../types.js";
4+
5+
6+
function makeChunk(noteId: string, index: number, content: string): TextChunk {
7+
return { id: `${noteId}:${index}`, noteId, content, chunkIndex: index };
8+
}
9+
10+
11+
describe("tokenize", () => {
12+
it("lowercases and splits on non-word characters", () => {
13+
expect(tokenize("Hello, World!")).toEqual(["hello", "world"]);
14+
});
15+
16+
it("removes stop words", () => {
17+
const tokens = tokenize("the cat is a mammal");
18+
expect(tokens).toContain("cat");
19+
expect(tokens).toContain("mammal");
20+
expect(tokens).not.toContain("the");
21+
expect(tokens).not.toContain("is");
22+
expect(tokens).not.toContain("a");
23+
});
24+
25+
it("returns empty array for empty input", () => {
26+
expect(tokenize("")).toEqual([]);
27+
});
28+
29+
it("returns empty array when all tokens are stop words", () => {
30+
expect(tokenize("the is a an")).toEqual([]);
31+
});
32+
});
33+
34+
35+
describe("KeywordSearchEngine", () => {
36+
let engine: KeywordSearchEngine;
37+
38+
beforeEach(() => {
39+
engine = new KeywordSearchEngine();
40+
});
41+
42+
describe("indexChunks + search", () => {
43+
it("returns the indexed chunk for a matching query", () => {
44+
engine.indexChunks([makeChunk("n1", 0, "machine learning is fascinating")]);
45+
const results = engine.search("machine learning");
46+
expect(results).toHaveLength(1);
47+
expect(results[0].chunk.noteId).toBe("n1");
48+
expect(results[0].score).toBeGreaterThan(0);
49+
});
50+
51+
it("returns empty array when nothing matches", () => {
52+
engine.indexChunks([makeChunk("n1", 0, "pasta and cooking recipes")]);
53+
expect(engine.search("quantum physics")).toEqual([]);
54+
});
55+
56+
it("ranks the doc with more matching terms higher", () => {
57+
engine.indexChunks([
58+
makeChunk("n1", 0, "neural networks"),
59+
makeChunk("n2", 0, "neural networks deep learning machine learning"),
60+
]);
61+
const results = engine.search("neural networks machine learning");
62+
expect(results.length).toBe(2);
63+
expect(results[0].chunk.noteId).toBe("n2");
64+
});
65+
66+
it("handles partial query matches — returns docs with any matching term", () => {
67+
engine.indexChunks([
68+
makeChunk("n1", 0, "apple orange banana"),
69+
makeChunk("n2", 0, "quantum gravity"),
70+
]);
71+
const results = engine.search("apple quantum");
72+
// Both docs should appear since each matches one term
73+
const noteIds = results.map((r) => r.chunk.noteId);
74+
expect(noteIds).toContain("n1");
75+
expect(noteIds).toContain("n2");
76+
});
77+
78+
it("respects topK limit", () => {
79+
for (let i = 0; i < 10; i++) {
80+
engine.indexChunks([makeChunk(`n${i}`, 0, "machine learning ai")]);
81+
}
82+
const results = engine.search("machine", 3);
83+
expect(results.length).toBeLessThanOrEqual(3);
84+
});
85+
86+
it("returns empty array on empty index", () => {
87+
expect(engine.search("anything")).toEqual([]);
88+
});
89+
});
90+
91+
describe("removeByNoteId", () => {
92+
it("removes all chunks for a note; subsequent search excludes it", () => {
93+
engine.indexChunks([
94+
makeChunk("n1", 0, "machine learning"),
95+
makeChunk("n1", 1, "deep learning"),
96+
makeChunk("n2", 0, "machine learning"),
97+
]);
98+
const removed = engine.removeByNoteId("n1");
99+
expect(removed).toBe(2);
100+
101+
const results = engine.search("machine learning");
102+
expect(results.every((r) => r.chunk.noteId !== "n1")).toBe(true);
103+
expect(results.some((r) => r.chunk.noteId === "n2")).toBe(true);
104+
});
105+
});
106+
107+
describe("clear", () => {
108+
it("empties the entire index", () => {
109+
engine.indexChunks([makeChunk("n1", 0, "machine learning")]);
110+
engine.clear();
111+
expect(engine.size).toBe(0);
112+
expect(engine.search("machine")).toEqual([]);
113+
});
114+
});
115+
116+
describe("re-indexing", () => {
117+
it("updates a chunk when re-indexed with the same id", () => {
118+
engine.indexChunks([makeChunk("n1", 0, "cooking pasta")]);
119+
engine.indexChunks([makeChunk("n1", 0, "machine learning")]);
120+
expect(engine.search("pasta")).toEqual([]);
121+
expect(engine.search("machine")).toHaveLength(1);
122+
});
123+
});
124+
});

0 commit comments

Comments
 (0)