Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions ai/vector-search-typescript/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
DEBUG=true

# ========================================
# Azure OpenAI Embedding Settings
# ========================================
AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002
AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15
AZURE_OPENAI_EMBEDDING_KEY=
AZURE_OPENAI_EMBEDDING_ENDPOINT=https://<RESOURCE-NAME>.openai.azure.com
EMBEDDING_SIZE_BATCH=16

# ========================================
# Data File Paths and Vector Configuration
# ========================================
DATA_FILE_WITHOUT_VECTORS=../data/HotelsData_toCosmosDB.JSON
DATA_FILE_WITH_VECTORS=../data/HotelsData_toCosmosDB_Vector.json
DATA_FILE_WITH_SIMILARITY=../data/HotelsData_toCosmosDB_Vector_Similarity.json
QUERY_FILE_WITH_VECTORS=../data/HotelsData_Query_Vector.json
DATA_FOLDER=../data/
FIELD_TO_EMBED=Description
EMBEDDED_FIELD=text_embedding_ada_002
EMBEDDING_DIMENSIONS=1536
LOAD_SIZE_BATCH=100

# ========================================
# MongoDB/Cosmos DB Connection Settings
# ========================================
MONGO_CONNECTION_STRING=mongodb+srv://<USERNAME>:<PASSWORD>@<CLUSTER-NAME>.global.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000
MONGO_CLUSTER_NAME=<CLUSTER-NAME>


25 changes: 25 additions & 0 deletions ai/vector-search-typescript/package.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "ts-cosmos-nodejs-vector-samples",
"version": "1.0.0",
"description": "Samples for MongoDB vCore vector search with Cosmos DB",
"main": "index.js",
"type": "module",
"scripts": {
"build": "tsc",
"start:one-insert": "node --env-file .env dist/insert-one-document.js",
"start:embed": "node --env-file .env dist/create-embeddings.js",
"start:show-indexes": "node --env-file .env dist/showIndexes.js",
"start:ivf": "node --env-file .env dist/ivf.js",
"start:hnsw": "node --env-file .env dist/hnsw.js",
"start:diskann": "node --env-file .env dist/diskann.js"
},
"dependencies": {
"@azure/identity": "^4.11.1",
"mongodb": "^6.18.0",
"openai": "^5.16.0"
},
"devDependencies": {
"@types/node": "^24.3.0",
"typescript": "^5.9.2"
}
}
132 changes: 132 additions & 0 deletions ai/vector-search-typescript/src/create-embeddings.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,132 @@
/**
* Module for creating embedding vectors using OpenAI API
* Supports text embedding models for generating embeddings
* that can be used with Cosmos DB MongoDB vCore vector search
*/
import * as path from "node:path";
import { AzureOpenAI } from "openai";
import { Embedding } from "openai/resources";
import { readFileReturnJson, writeFileJson, JsonData } from "./utils.js";

// ESM specific features - create __dirname equivalent
import { fileURLToPath } from "node:url";
import { dirname } from "node:path";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// ----- Azure OpenAI connection settings (see .env.example) -----
const apiKey = process.env.AZURE_OPENAI_EMBEDDING_KEY;
const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION;
const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT;
console.log(`Using OpenAI endpoint: ${endpoint}`);
const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;

// ----- Data file paths and embedding field configuration -----
const dataWithVectors = process.env.DATA_FILE_WITH_VECTORS!;
const dataWithoutVectors = process.env.DATA_FILE_WITHOUT_VECTORS!;
// `!` assertions dropped where a `||` fallback already handles undefined.
// NOTE(review): the "description" fallback differs in case from
// .env.example's "Description" — JSON keys are case-sensitive; confirm intent.
const fieldToEmbed = process.env.FIELD_TO_EMBED || "description";
const newEmbeddedField = process.env.EMBEDDED_FIELD || deployment;
// Fix: .env.example defines EMBEDDING_SIZE_BATCH, but the code previously
// read EMBEDDING_BATCH_SIZE (never set anywhere), so the batch size was
// always the default 16. Accept both names, preferring the documented one.
const batchSize = parseInt(
  process.env.EMBEDDING_SIZE_BATCH ?? process.env.EMBEDDING_BATCH_SIZE ?? '16',
  10
);

/**
 * Resolve after `ms` milliseconds (default 200); used to pace batched
 * embedding requests so we stay under the service rate limits.
 */
async function delay(ms: number = 200): Promise<void> {
    return new Promise<void>((resolve) => {
        setTimeout(resolve, ms);
    });
}

/**
 * Request embedding vectors for a batch of input strings from the
 * deployed Azure OpenAI embedding model.
 *
 * @param client - configured AzureOpenAI client
 * @param model - embedding deployment/model name
 * @param inputItems - texts to embed (one vector is returned per text)
 * @returns the embedding entries in the same order as `inputItems`
 * @throws when the API response contains no embedding data
 */
export async function createEmbeddings(client: AzureOpenAI, model: string, inputItems: string[]): Promise<Embedding[]> {
    const { data } = await client.embeddings.create({ model, input: inputItems });

    if (!data?.length) {
        throw new Error(`No embedding data returned`);
    }
    return data;
}

/**
 * Generate embeddings for up to `maxEmbeddings` items, in batches of
 * `batchSize`, returning copies of the items with the vector stored
 * under `newEmbeddedField`. Input items are not mutated.
 *
 * @param client - configured AzureOpenAI client
 * @param model - embedding deployment/model name
 * @param fieldToEmbed - name of the text field to embed on each item
 * @param newEmbeddedField - name of the field to store the vector under
 * @param maxEmbeddings - cap on how many items to process (0/undefined = all)
 * @param items - source documents
 * @throws on empty input, missing field name, or any API failure
 */
export async function processEmbeddingBatch<T>(
    client: AzureOpenAI,
    model: string,
    fieldToEmbed: string,
    newEmbeddedField: string,
    maxEmbeddings: number,
    items: T[]

): Promise<T[]> {
    if (!Array.isArray(items) || items.length === 0) {
        throw new Error("Items must be a non-empty array");
    }

    if (!fieldToEmbed) {
        throw new Error("Field to embed must be specified");
    }

    // Fix: never process past the caller's cap OR the array length. The
    // original capped each batch only by `items.length`, so a
    // `maxEmbeddings` smaller than the array was overshot by up to one
    // batch. Also avoids reassigning the `maxEmbeddings` parameter.
    const limit = Math.min(maxEmbeddings || items.length, items.length);
    const itemsWithEmbeddings: T[] = [];

    // Process in batches to avoid rate limits and memory issues
    for (let i = 0; i < limit; i += batchSize) {
        const batchEnd = Math.min(i + batchSize, limit);
        console.log(`Processing batch: ${i} to ${batchEnd - 1} (of ${items.length} items)`);

        const batchItems = items.slice(i, batchEnd);
        const textsToEmbed = batchItems.map(item => {
            // Cast for dynamic field access; T carries no index signature.
            const value = (item as Record<string, unknown>)[fieldToEmbed];
            if (!value) {
                console.warn(`Item is missing the field to embed: ${fieldToEmbed}`);
                return ""; // Provide a fallback value to prevent API errors
            }
            return value as string;
        });

        try {
            const embeddings = await createEmbeddings(client, model, textsToEmbed);

            embeddings.forEach((embeddingData, index) => {
                itemsWithEmbeddings.push({
                    ...batchItems[index],
                    [newEmbeddedField]: embeddingData.embedding
                });
            });

            // Small pause between batches to avoid rate limiting; skipped
            // after the final batch within the processing limit.
            if (batchEnd < limit) {
                await delay();
            }
        } catch (error) {
            console.error(`Error generating embeddings for batch ${i}:`, error);
            throw error;
        }
    }

    return itemsWithEmbeddings;
}


// Script entry point: read the raw data file, embed the configured field
// for every item, and write the enriched documents back out.
try {

    const client = new AzureOpenAI( {
        apiKey,
        apiVersion,
        endpoint,
        deployment
    });

    // Paths are resolved relative to the compiled dist/ directory.
    const data = await readFileReturnJson(path.join(__dirname, "..", dataWithoutVectors));
    const model = deployment;
    const maxEmbeddings = data.length; // embed every item

    const embeddings = await processEmbeddingBatch<JsonData>(
        client,
        model,
        fieldToEmbed,
        newEmbeddedField,
        maxEmbeddings,
        data
    );

    await writeFileJson(path.join(__dirname, "..", dataWithVectors), embeddings);

} catch (error) {
    // Fix: the try block covers reading, embedding AND writing, so the
    // old "Failed to save embeddings to file" message was misleading.
    // Also signal failure via the exit code, matching the other samples.
    console.error(`Failed to create embeddings: ${(error as Error).message}`);
    process.exitCode = 1;
}
108 changes: 108 additions & 0 deletions ai/vector-search-typescript/src/diskann.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
import path from 'path';
import { readFileReturnJson, getClientsPasswordless, insertData, printSearchResults } from './utils.js';

// ESM specific features - create __dirname equivalent
import { fileURLToPath } from "node:url";
import { dirname } from "node:path";
const __filename = fileURLToPath(import.meta.url);
const __dirname = dirname(__filename);

// Runtime configuration for the DiskANN vector-search sample.
// File paths come from the environment and are resolved relative to the
// compiled dist/ directory by main() below.
const config = {
    query: "quintessential lodging near running trails, eateries, retail", // natural-language search text
    dbName: "Hotels",
    collectionName: "hotels_diskann",
    indexName: "vectorIndex_diskann",
    dataFile: process.env.DATA_FILE_WITH_VECTORS!, // JSON file that already contains embedding vectors
    batchSize: parseInt(process.env.LOAD_SIZE_BATCH! || '100', 10), // documents per insert batch
    embeddedField: process.env.EMBEDDED_FIELD!, // document field holding the vector
    embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS!, 10), // e.g. 1536 for text-embedding-ada-002
    deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL!, // embedding model/deployment name
};

/**
 * End-to-end DiskANN vector search sample:
 * connect → create collection → insert data → create the DiskANN vector
 * index → embed the query text → run a cosmosSearch aggregation → print
 * the top results. Always closes the database connection on exit.
 */
async function main() {

    const { aiClient, dbClient } = getClientsPasswordless();

    try {

        if (!aiClient) {
            throw new Error('AI client is not configured. Please check your environment variables.');
        }
        if (!dbClient) {
            throw new Error('Database client is not configured. Please check your environment variables.');
        }

        await dbClient.connect();
        const db = dbClient.db(config.dbName);
        // NOTE(review): createCollection throws if the collection already
        // exists — rerunning the sample requires dropping it first.
        const collection = await db.createCollection(config.collectionName);
        console.log('Created collection:', config.collectionName);

        const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile));
        const insertSummary = await insertData(config, collection, data);

        // Create the vector index
        const indexOptions = {
            createIndexes: config.collectionName,
            indexes: [
                {
                    name: config.indexName,
                    key: {
                        [config.embeddedField]: 'cosmosSearch'
                    },
                    cosmosSearchOptions: {
                        kind: 'vector-diskann',
                        dimensions: config.embeddingDimensions,
                        similarity: 'COS', // 'COS', 'L2', 'IP'
                        maxDegree: 20, // 20 - 2048, edges per node
                        lBuild: 10 // 10 - 500, candidate neighbors evaluated
                    }
                }
            ]
        };
        const vectorIndexSummary = await db.command(indexOptions);
        // Fix: log AFTER db.command succeeds — the original logged
        // "Created vector index" before the index was actually created.
        console.log('Created vector index:', config.indexName);

        // Embed the query text with the same model used for the documents
        const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
            model: config.deployment,
            input: [config.query]
        });

        // Perform the vector similarity search: top-5 nearest documents
        const searchResults = await collection.aggregate([
            {
                $search: {
                    cosmosSearch: {
                        vector: createEmbeddedForQueryResponse.data[0].embedding,
                        path: config.embeddedField,
                        k: 5
                    }
                }
            },
            {
                $project: {
                    score: {
                        $meta: "searchScore"
                    },
                    document: "$$ROOT"
                }
            }
        ]).toArray();

        // Print the results
        printSearchResults(insertSummary, vectorIndexSummary, searchResults);

    } catch (error) {
        console.error('App failed:', error);
        process.exitCode = 1;
    } finally {
        console.log('Closing database connection...');
        if (dbClient) await dbClient.close();
        console.log('Database connection closed');
    }
}

// Execute the main function
main().catch(error => {
    console.error('Unhandled error:', error);
    process.exitCode = 1;
});
Loading