diff --git a/.gitignore b/.gitignore index 9308a4b..20d4870 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,7 @@ +dist/ +# Node.js +node_modules/ + ## Core latex/pdflatex auxiliary files: *.aux *.lof diff --git a/apps/retriever/package-lock.json b/apps/retriever/package-lock.json new file mode 100644 index 0000000..cfd9f5d --- /dev/null +++ b/apps/retriever/package-lock.json @@ -0,0 +1,48 @@ +{ + "name": "retriever", + "version": "1.0.0", + "lockfileVersion": 3, + "requires": true, + "packages": { + "": { + "name": "retriever", + "version": "1.0.0", + "license": "ISC", + "devDependencies": { + "@types/node": "^25.4.0", + "typescript": "^5.9.3" + } + }, + "node_modules/@types/node": { + "version": "25.4.0", + "resolved": "https://registry.npmjs.org/@types/node/-/node-25.4.0.tgz", + "integrity": "sha512-9wLpoeWuBlcbBpOY3XmzSTG3oscB6xjBEEtn+pYXTfhyXhIxC5FsBer2KTopBlvKEiW9l13po9fq+SJY/5lkhw==", + "dev": true, + "license": "MIT", + "dependencies": { + "undici-types": "~7.18.0" + } + }, + "node_modules/typescript": { + "version": "5.9.3", + "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", + "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", + "dev": true, + "license": "Apache-2.0", + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "7.18.2", + "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.18.2.tgz", + "integrity": "sha512-AsuCzffGHJybSaRrmr5eHr81mwJU3kjw6M+uprWvCXiNeN9SOGwQ3Jn8jb8m3Z6izVgknn1R0FTCEAP2QrLY/w==", + "dev": true, + "license": "MIT" + } + } +} diff --git a/apps/retriever/package.json b/apps/retriever/package.json new file mode 100644 index 0000000..14cfae7 --- /dev/null +++ b/apps/retriever/package.json @@ -0,0 +1,13 @@ +{ + "name": "retriever", + "version": "1.0.0", + "description": "Hybrid retrieval engine for Smart Notes", + "main": 
/**
 * Computes the Euclidean magnitude (L2 norm) of a vector.
 *
 * ```
 * ||v|| = sqrt(v[0]² + v[1]² + ... + v[n-1]²)
 * ```
 *
 * The straightforward sum of squares is used whenever it is representable, so
 * ordinary inputs produce exactly the same value as the naive formula. When
 * the sum overflows to `Infinity` or underflows to `0` even though the
 * components themselves are finite / non-zero, the computation is repeated on
 * components rescaled by the largest absolute component so that the true
 * magnitude is still returned.
 *
 * @param v - A dense numeric vector. An empty vector has magnitude `0`.
 * @returns The non-negative magnitude of `v`; `NaN` if any component is
 *          `NaN`; `Infinity` if any component is infinite.
 */
export function vectorMagnitude(v: number[]): number {
  let sumOfSquares = 0;
  for (const component of v) {
    sumOfSquares += component * component;
  }

  // Squares are never negative, so a NaN sum can only come from a NaN input.
  if (Number.isNaN(sumOfSquares)) {
    return NaN;
  }

  const magnitude = Math.sqrt(sumOfSquares);
  if (magnitude > 0 && Number.isFinite(magnitude)) {
    return magnitude;
  }

  // Degenerate sum (0 or Infinity): decide whether that is genuine or merely
  // an artefact of squaring very small / very large components.
  let scale = 0;
  for (const component of v) {
    scale = Math.max(scale, Math.abs(component));
  }
  if (scale === 0 || scale === Infinity) {
    return scale; // true zero vector (or empty), or a truly infinite component
  }

  // Every scaled component lies in [-1, 1] and at least one equals ±1, so the
  // scaled sum lies in [1, n] and can neither overflow nor underflow.
  let scaledSumOfSquares = 0;
  for (const component of v) {
    const scaled = component / scale;
    scaledSumOfSquares += scaled * scaled;
  }
  return scale * Math.sqrt(scaledSumOfSquares);
}
/**
 * Returns the largest absolute component of `v`.
 *
 * Yields `0` for an empty or all-zero vector and `NaN` as soon as any
 * component is `NaN` (because `Math.max` propagates `NaN`).
 */
function largestAbsComponent(v: number[]): number {
  let largest = 0;
  for (const component of v) {
    largest = Math.max(largest, Math.abs(component));
  }
  return largest;
}

/**
 * Computes the cosine similarity between two dense numeric vectors.
 *
 * Cosine similarity measures the cosine of the angle between two vectors,
 * independent of their magnitudes:
 *
 * ```
 * cosineSimilarity(A, B) = (A · B) / (||A|| × ||B||)
 * ```
 *
 * The result lies in the range **[-1, 1]**:
 * - `1`  → vectors point in the same direction
 * - `0`  → vectors are orthogonal
 * - `-1` → vectors point in opposite directions
 *
 * ### Validation
 * - Throws a `RangeError` if `a` and `b` have different lengths, because the
 *   dot product is undefined for vectors of unequal dimension.
 * - Returns `0` if either vector is a true zero vector (this includes two
 *   empty vectors): a zero vector carries no directional information.
 * - Returns `NaN` if a non-zero pairing involves a `NaN` or infinite
 *   component, since no direction can be derived from it.
 *
 * ### Numerical behaviour
 * The dot product and both sums of squares are accumulated in a single pass.
 * If squaring overflowed to `Infinity` or underflowed to `0` although the
 * components themselves are finite and non-zero, the accumulation is repeated
 * on vectors rescaled by their largest absolute component. Cosine similarity
 * is scale-invariant, so the rescaled result is the correct one; without this
 * step such inputs produced `NaN` or a spurious `0`.
 *
 * @param a - First dense numeric vector (e.g. a query embedding).
 * @param b - Second dense numeric vector (e.g. a chunk embedding).
 * @returns Cosine similarity in the range [-1, 1], or `0` if either vector is
 *          a zero vector.
 * @throws {RangeError} When `a` and `b` have different lengths.
 *
 * @example
 * ```ts
 * const same = cosineSimilarity([1, 0, 0], [1, 0, 0]); // 1
 * const orthogonal = cosineSimilarity([1, 0], [0, 1]); // 0
 * ```
 */
export function cosineSimilarity(a: number[], b: number[]): number {
  if (a.length !== b.length) {
    throw new RangeError(
      `cosineSimilarity: vectors must have the same length ` +
        `(got ${a.length} and ${b.length}).`
    );
  }

  let dotProduct = 0;
  let sumOfSquaresA = 0;
  let sumOfSquaresB = 0;

  // Single pass: accumulate dot product and both squared magnitudes.
  for (let i = 0; i < a.length; i++) {
    dotProduct += a[i] * b[i];
    sumOfSquaresA += a[i] * a[i];
    sumOfSquaresB += b[i] * b[i];
  }

  // Fast path: every accumulator is representable, so the plain formula holds.
  const accumulatorsUsable =
    sumOfSquaresA > 0 &&
    sumOfSquaresB > 0 &&
    Number.isFinite(sumOfSquaresA) &&
    Number.isFinite(sumOfSquaresB) &&
    Number.isFinite(dotProduct);

  if (accumulatorsUsable) {
    const similarity =
      dotProduct / (Math.sqrt(sumOfSquaresA) * Math.sqrt(sumOfSquaresB));
    // Clamp to [-1, 1] to correct for floating-point drift.
    return Math.max(-1, Math.min(1, similarity));
  }

  // Slow path: an accumulator is 0, Infinity or NaN. Inspect the components
  // themselves to tell a genuine zero / non-finite vector from an artefact of
  // squaring very small or very large values.
  const scaleA = largestAbsComponent(a);
  const scaleB = largestAbsComponent(b);

  // Guard against division by zero for true zero-magnitude vectors.
  if (scaleA === 0 || scaleB === 0) {
    return 0;
  }

  // NaN or infinite components: the direction is undefined.
  if (!Number.isFinite(scaleA) || !Number.isFinite(scaleB)) {
    return NaN;
  }

  // Rescale so every component lies in [-1, 1]; the sums of squares then lie
  // in [1, n] and can neither overflow nor underflow.
  let scaledDot = 0;
  let scaledSquaresA = 0;
  let scaledSquaresB = 0;
  for (let i = 0; i < a.length; i++) {
    const x = a[i] / scaleA;
    const y = b[i] / scaleB;
    scaledDot += x * y;
    scaledSquaresA += x * x;
    scaledSquaresB += y * y;
  }

  const rescaledSimilarity =
    scaledDot / (Math.sqrt(scaledSquaresA) * Math.sqrt(scaledSquaresB));
  return Math.max(-1, Math.min(1, rescaledSimilarity));
}
/**
 * Combines lexical and semantic scores into a ranked list of {@link SearchResult}s.
 *
 * ### Hybrid Ranking
 * Pure lexical search (BM25 / FTS) excels at exact keyword matching but misses
 * paraphrases and synonyms. Pure semantic search captures conceptual similarity
 * but can surface results that share no keywords with the query. Hybrid ranking
 * blends both signals.
 *
 * ### Scoring Formula
 * Lexical scores are first min-max normalized across the supplied candidates:
 * ```
 * normalizedLexical[i] = (lexicalScore[i] - min) / (max - min)
 * finalScore[i]        = alpha × semanticScores[i] + beta × normalizedLexical[i]
 * ```
 * When every candidate has the same lexical score (including the
 * single-candidate case) the normalized lexical score is `0` for all of them.
 * The raw, un-normalized `lexicalScore` is what is reported on each result.
 *
 * Results are returned sorted in descending order by `finalScore`.
 *
 * @param candidates     - Lexical search candidates; each carries a `lexicalScore`.
 * @param semanticScores - Semantic scores in the same order as `candidates`;
 *                         `semanticScores[i]` must correspond to `candidates[i]`.
 * @param weights        - Blending weights `{ alpha, beta }` applied to the
 *                         semantic and normalized lexical scores respectively.
 *                         Both must be finite, non-negative, and sum to 1
 *                         (within `WEIGHT_SUM_TOLERANCE`).
 *
 * @returns A new array of {@link SearchResult} objects sorted by `finalScore` (desc).
 *
 * @throws {RangeError} When `candidates` and `semanticScores` differ in length.
 * @throws {RangeError} When a weight is non-finite or negative, or the weights
 *                      do not sum to 1.
 * @throws {RangeError} When any lexical or semantic score is not a finite
 *                      number. Such a score would otherwise silently disable
 *                      normalization for every candidate and hand the sort
 *                      comparator `NaN`, giving an unspecified order.
 *
 * @example
 * ```ts
 * const results = scoreHybridResults(candidates, semanticScores, { alpha: 0.7, beta: 0.3 });
 * console.log(results[0].finalScore); // highest scoring chunk
 * ```
 */
export function scoreHybridResults(
  candidates: SearchCandidate[],
  semanticScores: number[],
  weights: HybridScoreWeights
): SearchResult[] {
  // ------------------------------------------------------------------
  // Validation
  // ------------------------------------------------------------------

  if (candidates.length !== semanticScores.length) {
    throw new RangeError(
      `scoreHybridResults: candidates and semanticScores must have the same length ` +
        `(got ${candidates.length} candidates and ${semanticScores.length} scores).`
    );
  }

  if (!Number.isFinite(weights.alpha) || !Number.isFinite(weights.beta)) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be finite numbers ` +
        `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  if (weights.alpha < 0 || weights.beta < 0) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha and weights.beta must be non-negative ` +
        `(got ${weights.alpha} and ${weights.beta}).`
    );
  }

  const weightSum = weights.alpha + weights.beta;
  if (Math.abs(weightSum - 1) > WEIGHT_SUM_TOLERANCE) {
    throw new RangeError(
      `scoreHybridResults: weights.alpha (${weights.alpha}) + weights.beta (${weights.beta}) ` +
        `must sum to 1 (got ${weightSum}).`
    );
  }

  // ------------------------------------------------------------------
  // Score validation + lexical range, in one pass
  // ------------------------------------------------------------------

  let minLexicalScore = Number.POSITIVE_INFINITY;
  let maxLexicalScore = Number.NEGATIVE_INFINITY;

  candidates.forEach((candidate: SearchCandidate, i: number): void => {
    const lexicalScore = candidate.lexicalScore;
    const semanticScore = semanticScores[i];

    if (!Number.isFinite(lexicalScore) || !Number.isFinite(semanticScore)) {
      throw new RangeError(
        `scoreHybridResults: scores must be finite numbers ` +
          `(candidate ${i}, chunkId="${candidate.chunkId}": ` +
          `lexicalScore=${lexicalScore}, semanticScore=${semanticScore}).`
      );
    }

    minLexicalScore = Math.min(minLexicalScore, lexicalScore);
    maxLexicalScore = Math.max(maxLexicalScore, lexicalScore);
  });

  // For an empty candidate list this is -Infinity, which simply fails the
  // `range > 0` test below; the map then runs over nothing.
  const range = maxLexicalScore - minLexicalScore;

  // ------------------------------------------------------------------
  // Scoring
  // ------------------------------------------------------------------

  const results: SearchResult[] = candidates.map(
    (candidate: SearchCandidate, i: number): SearchResult => {
      const semanticScore = semanticScores[i];
      const lexicalScore = candidate.lexicalScore;
      // All-equal lexical scores carry no ranking information → contribute 0.
      const normalizedLexicalScore =
        range > 0 ? (lexicalScore - minLexicalScore) / range : 0;
      const finalScore =
        weights.alpha * semanticScore +
        weights.beta * normalizedLexicalScore;

      return {
        chunkId: candidate.chunkId,
        notePath: candidate.notePath,
        text: candidate.text,
        lexicalScore,
        semanticScore,
        finalScore,
      };
    }
  );

  // ------------------------------------------------------------------
  // Ranking — descending by finalScore
  // ------------------------------------------------------------------

  // `results` is a fresh array, so sorting in place does not touch the inputs.
  results.sort(
    (a: SearchResult, b: SearchResult): number => b.finalScore - a.finalScore
  );

  return results;
}
/**
 * Factor by which the lexical candidate pool exceeds the caller's `limit`,
 * giving semantic re-ranking more than `limit` chunks to choose from.
 */
const CANDIDATE_POOL_MULTIPLIER = 5;

/**
 * Extracts a human-readable message from an arbitrary thrown value.
 *
 * @param error - Whatever was thrown; not necessarily an `Error`.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function getErrorMessage(error: unknown): string {
  if (error instanceof Error) {
    return error.message;
  }

  return String(error);
}

/**
 * Wraps a failure from a pipeline dependency in an `Error` whose message names
 * the failed operation and its context.
 *
 * @param operation - Name of the dependency call that failed.
 * @param details   - Context describing the call's inputs.
 * @param error     - The value that was thrown.
 * @returns A new `Error`. When the cause is an `Error` with a stack, that
 *          stack is appended after a `Caused by:` marker so the original
 *          failure site is not lost.
 */
function wrapOperationError(operation: string, details: string, error: unknown): Error {
  const wrapped = new Error(
    `RetrievalEngine.search: ${operation} failed (${details}): ${getErrorMessage(error)}`
  );

  if (error instanceof Error && error.stack) {
    wrapped.stack = `${wrapped.name}: ${wrapped.message}\nCaused by: ${error.stack}`;
  }

  return wrapped;
}

/**
 * Orchestrates the full hybrid retrieval pipeline for Smart Notes.
 *
 * ### Pipeline Overview
 * ```
 * query
 *   │
 *   ├─ 1. Lexical search     → SearchCandidate[]     (RetrievalStore.searchLexical)
 *   ├─ 2. Query embedding    → number[]              (QueryEmbeddingProvider.embedQuery)
 *   ├─ 3. Chunk embeddings   → number[][]            (RetrievalStore.loadEmbeddings)
 *   ├─ 4. Cosine similarity  → number[] in [0, 1]    (cosineSimilarity, rescaled)
 *   ├─ 5. Hybrid scoring     → SearchResult[] sorted (scoreHybridResults)
 *   └─ 6. Slice to limit     → SearchResult[] final
 * ```
 *
 * The engine holds no state beyond its injected dependencies.
 */
export class RetrievalEngine {
  /**
   * Creates a new `RetrievalEngine` instance.
   *
   * @param store    - Storage adapter providing lexical search and embedding
   *                   retrieval.
   * @param embedder - Provider that converts a raw query string into a dense
   *                   embedding vector comparable with the stored chunk
   *                   embeddings.
   * @param weights  - Blending weights `{ alpha, beta }` for the semantic and
   *                   lexical scores. They are passed unchanged to
   *                   `scoreHybridResults` on every search, which is where
   *                   they are validated.
   */
  constructor(
    private readonly store: RetrievalStore,
    private readonly embedder: QueryEmbeddingProvider,
    private readonly weights: HybridScoreWeights
  ) {}

  /**
   * Executes a hybrid search for the given query and returns the top-ranked results.
   *
   * An empty lexical result set short-circuits the pipeline and returns `[]`
   * without generating a query embedding or loading chunk embeddings.
   *
   * @param query - The user's natural-language search query.
   * @param limit - Maximum number of results to return; must be a positive
   *                integer. Lexical search is asked for
   *                `limit * CANDIDATE_POOL_MULTIPLIER` candidates so that
   *                re-ranking has a wider pool; the ranked output is then
   *                trimmed to `limit`.
   * @returns A promise resolving to at most `limit` {@link SearchResult}
   *          objects sorted in descending order by `finalScore`.
   *
   * @throws {RangeError} When `limit` is not a positive integer.
   * @throws {RangeError} When the store returns a different number of
   *                      embeddings than chunk IDs requested.
   * @throws {Error} When `embedQuery` or `loadEmbeddings` rejects; the failure
   *                 is wrapped with the operation name and its inputs. Errors
   *                 from `searchLexical` and `scoreHybridResults` propagate
   *                 unwrapped.
   *
   * @example
   * ```ts
   * const engine = new RetrievalEngine(store, embedder, { alpha: 0.7, beta: 0.3 });
   * const results = await engine.search("transformer attention mechanism", 10);
   * results.forEach(r => console.log(r.finalScore, r.notePath));
   * ```
   */
  async search(query: string, limit: number): Promise<SearchResult[]> {
    if (!Number.isInteger(limit) || limit <= 0) {
      throw new RangeError(
        `RetrievalEngine.search: limit must be a positive integer (got ${limit}).`
      );
    }

    // ------------------------------------------------------------------
    // Step 1 — Lexical search
    // Request an expanded pool (limit × CANDIDATE_POOL_MULTIPLIER) so that
    // semantic re-ranking can promote chunks that lexical search ranked
    // below the top `limit`. No lexical matches → nothing to re-rank.
    // ------------------------------------------------------------------
    const candidatePoolSize = limit * CANDIDATE_POOL_MULTIPLIER;
    const candidates = await this.store.searchLexical(query, candidatePoolSize);

    if (candidates.length === 0) {
      return [];
    }

    // ------------------------------------------------------------------
    // Step 2 — Query embedding
    // ------------------------------------------------------------------
    let queryEmbedding: number[];
    try {
      queryEmbedding = await this.embedder.embedQuery(query);
    } catch (error: unknown) {
      throw wrapOperationError("embedQuery", `query="${query}"`, error);
    }

    // ------------------------------------------------------------------
    // Step 3 — Chunk embeddings
    // One batch call for every candidate chunk.
    // ------------------------------------------------------------------
    const chunkIds = candidates.map((c) => c.chunkId);
    let embeddings: number[][];
    try {
      embeddings = await this.store.loadEmbeddings(chunkIds);
    } catch (error: unknown) {
      throw wrapOperationError(
        "loadEmbeddings",
        `chunkCount=${chunkIds.length}, chunkIds=[${chunkIds.join(", ")}]`,
        error
      );
    }

    // Scores are paired with candidates by index, so a store that omits or
    // adds embeddings would mis-attribute them. Report that here, naming the
    // store call, rather than letting the scorer raise a less specific
    // length-mismatch error.
    if (embeddings.length !== candidates.length) {
      throw new RangeError(
        `RetrievalEngine.search: loadEmbeddings returned ${embeddings.length} ` +
          `embeddings for ${candidates.length} requested chunk IDs; exactly one ` +
          `embedding per chunk ID, in request order, is required.`
      );
    }

    // ------------------------------------------------------------------
    // Step 4 — Cosine similarity
    // cosineSimilarity yields a value in [-1, 1]; (x + 1) / 2 maps it onto
    // [0, 1] so it is on the same scale as the normalized lexical score.
    // ------------------------------------------------------------------
    const semanticScores = embeddings.map(
      (embedding: number[]): number =>
        (cosineSimilarity(queryEmbedding, embedding) + 1) / 2
    );

    // ------------------------------------------------------------------
    // Step 5 — Hybrid scoring & ranking
    // ------------------------------------------------------------------
    const results = scoreHybridResults(candidates, semanticScores, this.weights);

    // ------------------------------------------------------------------
    // Step 6 — Truncate to limit
    // The scorer returns the whole pool sorted; honour the caller's limit.
    // ------------------------------------------------------------------
    return results.slice(0, limit);
  }
}
/**
 * In-memory mock implementation of {@link RetrievalStore}.
 *
 * Returns a fixed set of lexical candidates and fixed embeddings so the demo
 * can run without a real database.
 */
class MockStore implements RetrievalStore {
  /**
   * Simulates a full-text search by returning a fixed list of candidate chunks.
   *
   * @param query - The user's search query; it is logged but does not
   *                influence which candidates are returned.
   * @param limit - Maximum results requested; negative values are treated as 0.
   * @returns A promise resolving to at most `limit` hard-coded
   *          {@link SearchCandidate} objects.
   */
  async searchLexical(
    query: string,
    limit: number
  ): Promise<SearchCandidate[]> {
    console.log(` [MockStore] searchLexical("${query}", limit=${limit})`);

    const candidates: SearchCandidate[] = [
      {
        chunkId: "1",
        notePath: "note1.md",
        text: "JavaScript async patterns",
        lexicalScore: 0.9,
      },
      {
        chunkId: "2",
        notePath: "note2.md",
        text: "Node.js event loop guide",
        lexicalScore: 0.8,
      },
      {
        chunkId: "3",
        notePath: "note3.md",
        text: "Understanding promises",
        lexicalScore: 0.7,
      },
    ];

    return candidates.slice(0, Math.max(0, limit));
  }

  /**
   * Returns fixed embedding vectors for the requested chunk IDs.
   *
   * Vectors are simple 3-dimensional floats so cosine similarities are easy
   * to verify by hand.
   *
   * @param chunkIds - Chunk identifiers whose embeddings should be loaded.
   * @returns A promise resolving to one embedding per requested ID, in
   *          request order.
   * @throws {Error} When an ID has no embedding in the fixture.
   */
  async loadEmbeddings(chunkIds: string[]): Promise<number[][]> {
    console.log(` [MockStore] loadEmbeddings([${chunkIds.join(", ")}])`);

    // A Map rather than a plain object: object lookups also resolve inherited
    // keys such as "constructor", which would slip past the `undefined` check.
    const embeddingsById = new Map<string, number[]>([
      ["1", [0.1, 0.2, 0.3]],
      ["2", [0.2, 0.1, 0.4]],
      ["3", [0.5, 0.4, 0.1]],
    ]);

    return chunkIds.map((chunkId: string): number[] => {
      const embedding = embeddingsById.get(chunkId);

      if (embedding === undefined) {
        throw new Error(`Missing embedding for chunkId: ${chunkId}`);
      }

      return embedding;
    });
  }
}

// ---------------------------------------------------------------------------
// Mock Embedder
// ---------------------------------------------------------------------------

/**
 * In-memory mock implementation of {@link QueryEmbeddingProvider}.
 *
 * Returns a constant query vector so the demo produces deterministic
 * similarity scores without an embedding model.
 */
class MockEmbedder implements QueryEmbeddingProvider {
  /**
   * Generates a fixed embedding vector for any query string.
   *
   * @param query - The user's search query (logged but otherwise unused).
   * @returns A promise resolving to the constant vector `[0.2, 0.2, 0.2]`.
   */
  async embedQuery(query: string): Promise<number[]> {
    console.log(` [MockEmbedder] embedQuery("${query}")`);
    return [0.2, 0.2, 0.2];
  }
}
/**
 * Runs an end-to-end demonstration of the hybrid retrieval engine.
 *
 * Steps performed:
 * 1. Instantiate the mock store and mock embedder.
 * 2. Configure hybrid score weights (60 % semantic, 40 % lexical).
 * 3. Build a {@link RetrievalEngine} and issue one search through it.
 * 4. Print every returned result, in the order received, to stdout.
 *
 * @returns A promise that resolves once all output has been written; it
 *          rejects if the search itself rejects.
 */
async function runDemo(): Promise<void> {
  console.log("=".repeat(60));
  console.log(" Smart Notes — Hybrid Retrieval Engine Demo");
  console.log("=".repeat(60));

  // ── Step 1: Instantiate dependencies ──────────────────────────────
  const store = new MockStore();
  const embedder = new MockEmbedder();

  // ── Step 2: Configure hybrid weights ──────────────────────────────
  // alpha drives the semantic (cosine similarity) contribution.
  // beta drives the lexical contribution.
  const weights: HybridScoreWeights = {
    alpha: 0.6, // 60 % semantic
    beta: 0.4, // 40 % lexical
  };

  console.log("\nWeights:");
  console.log(` alpha (semantic) = ${weights.alpha}`);
  console.log(` beta (lexical) = ${weights.beta}`);

  // ── Step 3: Build the engine ───────────────────────────────────────
  const engine = new RetrievalEngine(store, embedder, weights);

  // ── Step 4: Run the search ─────────────────────────────────────────
  const query = "async javascript";
  const limit = 5;

  console.log(`\nRunning search: "${query}" (limit=${limit})\n`);

  const results = await engine.search(query, limit);

  // ── Step 5: Print results ──────────────────────────────────────────
  console.log("\n" + "─".repeat(60));
  console.log(` Results (${results.length} returned)`);
  console.log("─".repeat(60));

  if (results.length === 0) {
    console.log(" No results found.");
  } else {
    results.forEach((result, index) => {
      console.log(`\n #${index + 1}`);
      console.log(` chunkId : ${result.chunkId}`);
      console.log(` notePath : ${result.notePath}`);
      console.log(` text : ${result.text}`);
      console.log(` lexicalScore : ${result.lexicalScore.toFixed(4)}`);
      console.log(` semanticScore : ${result.semanticScore.toFixed(4)}`);
      console.log(` finalScore : ${result.finalScore.toFixed(4)}`);
    });
  }

  console.log("\n" + "=".repeat(60));
  console.log(" Demo complete.");
  console.log("=".repeat(60) + "\n");
}

// ---------------------------------------------------------------------------
// Entry Point
// ---------------------------------------------------------------------------

// Any rejection from the demo is reported and turned into a non-zero exit code.
runDemo().catch((error: unknown) => {
  console.error("Demo failed with error:", error);
  process.exit(1);
});
Smart Notes hybrid retrieval engine. + * + * Consumers should import exclusively from this entry point rather than + * reaching into internal modules directly. This keeps the module boundary + * stable and allows internal refactoring without breaking callers. + * + * @example + * ```ts + * import { RetrievalEngine, cosineSimilarity, scoreHybridResults } from "retriever"; + * import type { SearchResult, HybridScoreWeights } from "retriever"; + * ``` + */ + +/** Core orchestrator for the hybrid retrieval pipeline. */ +export { RetrievalEngine } from "./RetrievalEngine"; + +/** Cosine similarity utility for semantic vector comparison. */ +export { cosineSimilarity } from "./CosineSimilarity"; + +/** Hybrid scoring and ranking utility. */ +export { scoreHybridResults } from "./HybridScorer"; + +/** + * All shared types and interfaces: + * SearchCandidate, SearchResult, RetrievalStore, + * QueryEmbeddingProvider, HybridScoreWeights + */ +export * from "./types"; diff --git a/apps/retriever/src/types.ts b/apps/retriever/src/types.ts new file mode 100644 index 0000000..9a2fa48 --- /dev/null +++ b/apps/retriever/src/types.ts @@ -0,0 +1,169 @@ +/** + * @file types.ts + * @description Core types and interfaces for the Smart Notes hybrid retrieval engine. + * + * This module defines the foundational data structures and contracts used across + * the retrieval pipeline — from lexical search candidates to final ranked results, + * storage access, embedding generation, and scoring configuration. + */ + +// --------------------------------------------------------------------------- +// Search Candidates & Results +// --------------------------------------------------------------------------- + +/** + * Represents a chunk returned from lexical search (e.g., SQLite FTS5). + * + * A `SearchCandidate` is the raw result of a full-text search query before + * semantic re-ranking. 
/**
 * Represents a chunk returned from lexical search.
 *
 * A `SearchCandidate` is the raw result of a full-text search query before
 * semantic re-ranking. It carries the chunk's identity, source note path,
 * raw text, and the lexical relevance score assigned by the search engine.
 */
export interface SearchCandidate {
  /** Unique identifier for this text chunk within the note store. */
  chunkId: string;

  /** Absolute or relative file path of the note this chunk belongs to. */
  notePath: string;

  /** The raw text content of this chunk. */
  text: string;

  /**
   * Relevance score assigned by the lexical search engine.
   * Higher values must indicate stronger lexical relevance to the query;
   * a store whose native score runs the other way has to convert it.
   */
  lexicalScore: number;
}

/**
 * Represents a final ranked search result after hybrid scoring.
 *
 * A `SearchResult` carries every field of the originating
 * {@link SearchCandidate} and adds the semantic score and the final blended
 * score. The inherited `lexicalScore` is the raw value from the candidate;
 * the hybrid scorer normalizes it internally before blending, and that
 * normalized value is not stored on the result.
 */
export interface SearchResult extends SearchCandidate {
  /**
   * Semantic score used for blending: cosine similarity mapped from its
   * native `[-1, 1]` range into `[0, 1]`.
   */
  semanticScore: number;

  /**
   * Final blended score used to rank this result:
   * `finalScore = alpha * semanticScore + beta * normalizedLexicalScore`.
   */
  finalScore: number;
}

// ---------------------------------------------------------------------------
// Storage Abstraction
// ---------------------------------------------------------------------------

/**
 * Abstracts all database access required by the retrieval engine, keeping the
 * retrieval logic decoupled from storage details.
 */
export interface RetrievalStore {
  /**
   * Performs a lexical (full-text) search and returns the top matching chunks.
   *
   * @param query - The user's raw search query string.
   * @param limit - Maximum number of candidates to return.
   * @returns A promise that resolves to an ordered array of
   *          {@link SearchCandidate} objects.
   */
  searchLexical(query: string, limit: number): Promise<SearchCandidate[]>;

  /**
   * Loads precomputed embeddings for the specified chunk IDs.
   *
   * Implementations must return embeddings in the same order as the input
   * `chunkIds`, with exactly one embedding for each chunk ID.
   * Embeddings must not be omitted.
   *
   * @param chunkIds - Chunk identifiers whose embeddings should be loaded.
   * @returns A promise resolving to a 2-D array where each inner array is the
   *          dense embedding vector for the corresponding chunk.
   */
  loadEmbeddings(chunkIds: string[]): Promise<number[][]>;
}

// ---------------------------------------------------------------------------
// Embedding Provider
// ---------------------------------------------------------------------------

/**
 * Generates embedding vectors for user queries at search time.
 *
 * The returned vector must live in the same embedding space (and have the
 * same dimension) as the stored chunk embeddings, otherwise cosine similarity
 * between them is meaningless or throws on a length mismatch.
 */
export interface QueryEmbeddingProvider {
  /**
   * Generates a dense embedding vector for the given query string.
   *
   * @param query - The user's search query to embed.
   * @returns A promise resolving to the query's embedding vector.
   */
  embedQuery(query: string): Promise<number[]>;
}

// ---------------------------------------------------------------------------
// Scoring Configuration
// ---------------------------------------------------------------------------

/**
 * Weights used to blend lexical and semantic scores in hybrid ranking:
 * ```
 * finalScore = (alpha * semanticScore) + (beta * normalizedLexicalScore)
 * ```
 * Both weights must be finite and non-negative, and `alpha + beta` must sum
 * to approximately `1`.
 */
export interface HybridScoreWeights {
  /**
   * Weight applied to the semantic score.
   * A higher value biases results towards conceptual/semantic relevance.
   */
  alpha: number;

  /**
   * Weight applied to the normalized lexical score.
   * A higher value biases results towards keyword relevance.
   */
  beta: number;
}