diff --git a/ai/vector-search-typescript/.env.example b/ai/vector-search-typescript/.env.example
new file mode 100644
index 0000000..9bf4f8a
--- /dev/null
+++ b/ai/vector-search-typescript/.env.example
@@ -0,0 +1,31 @@
+DEBUG=true
+
+# ========================================
+# Azure OpenAI Embedding Settings
+# ========================================
+AZURE_OPENAI_EMBEDDING_MODEL=text-embedding-ada-002
+AZURE_OPENAI_EMBEDDING_API_VERSION=2023-05-15
+AZURE_OPENAI_EMBEDDING_KEY=
+AZURE_OPENAI_EMBEDDING_ENDPOINT=https://.openai.azure.com
+EMBEDDING_BATCH_SIZE=16
+
+# ========================================
+# Data File Paths and Vector Configuration
+# ========================================
+DATA_FILE_WITHOUT_VECTORS=../data/HotelsData_toCosmosDB.JSON
+DATA_FILE_WITH_VECTORS=../data/HotelsData_toCosmosDB_Vector.json
+DATA_FILE_WITH_SIMILARITY=../data/HotelsData_toCosmosDB_Vector_Similarity.json
+QUERY_FILE_WITH_VECTORS=../data/HotelsData_Query_Vector.json
+DATA_FOLDER=../data/
+FIELD_TO_EMBED=Description
+EMBEDDED_FIELD=text_embedding_ada_002
+EMBEDDING_DIMENSIONS=1536
+LOAD_SIZE_BATCH=100
+
+# ========================================
+# MongoDB/Cosmos DB Connection Settings
+# ========================================
+MONGO_CONNECTION_STRING=mongodb+srv://:@.global.mongocluster.cosmos.azure.com/?tls=true&authMechanism=SCRAM-SHA-256&retrywrites=false&maxIdleTimeMS=120000
+MONGO_CLUSTER_NAME=
+
+
diff --git a/ai/vector-search-typescript/package.json b/ai/vector-search-typescript/package.json
new file mode 100644
index 0000000..823fc94
--- /dev/null
+++ b/ai/vector-search-typescript/package.json
@@ -0,0 +1,25 @@
+{
+  "name": "ts-cosmos-nodejs-vector-samples",
+  "version": "1.0.0",
+  "description": "Samples for MongoDB vCore vector search with Cosmos DB",
+  "main": "index.js",
+  "type": "module",
+  "scripts": {
+    "build": "tsc",
+    "start:one-insert": "node --env-file .env dist/insert-one-document.js",
+    "start:embed": "node --env-file .env dist/create-embeddings.js",
+    "start:show-indexes": "node --env-file .env dist/showIndexes.js",
+    "start:ivf": "node --env-file .env dist/ivf.js",
+    "start:hnsw": "node --env-file .env dist/hnsw.js",
+    "start:diskann": "node --env-file .env dist/diskann.js"
+  },
+  "dependencies": {
+    "@azure/identity": "^4.11.1",
+    "mongodb": "^6.18.0",
+    "openai": "^5.16.0"
+  },
+  "devDependencies": {
+    "@types/node": "^24.3.0",
+    "typescript": "^5.9.2"
+  }
+}
diff --git a/ai/vector-search-typescript/src/create-embeddings.ts b/ai/vector-search-typescript/src/create-embeddings.ts
new file mode 100644
index 0000000..6524bce
--- /dev/null
+++ b/ai/vector-search-typescript/src/create-embeddings.ts
@@ -0,0 +1,132 @@
+/**
+ * Module for creating embedding vectors using the Azure OpenAI API.
+ * Supports text embedding models for generating embeddings
+ * that can be used with Cosmos DB for MongoDB vCore vector search.
+ */
+import * as path from "node:path";
+import { AzureOpenAI } from "openai";
+import { Embedding } from "openai/resources";
+import { readFileReturnJson, writeFileJson, JsonData } from "./utils.js";
+
+// ESM specific features - create __dirname equivalent
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const apiKey = process.env.AZURE_OPENAI_EMBEDDING_KEY;
+const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION;
+const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT;
+console.log(`Using OpenAI endpoint: ${endpoint}`);
+const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;
+
+const dataWithVectors = process.env.DATA_FILE_WITH_VECTORS!;
+const dataWithoutVectors = process.env.DATA_FILE_WITHOUT_VECTORS!;
+const fieldToEmbed = process.env.FIELD_TO_EMBED || "Description";
+const newEmbeddedField = process.env.EMBEDDED_FIELD || deployment;
+const batchSize = parseInt(process.env.EMBEDDING_BATCH_SIZE || '16', 10);
+
+// Define a reusable delay function
+async function delay(ms: number = 200): Promise<void> {
+  await new Promise(resolve => setTimeout(resolve, ms));
+}
+
+export async function createEmbeddings(client: AzureOpenAI, model: string, inputItems: string[]): Promise<Embedding[]> {
+  const response = await client.embeddings.create({
+    model,
+    input: inputItems
+  });
+
+  if (!response.data || response.data.length === 0) {
+    throw new Error(`No embedding data returned`);
+  }
+  return response.data;
+}
+
+export async function processEmbeddingBatch<T extends JsonData>(
+  client: AzureOpenAI,
+  model: string,
+  fieldToEmbed: string,
+  newEmbeddedField: string,
+  maxEmbeddings: number,
+  items: T[]
+): Promise<T[]> {
+  if (!Array.isArray(items) || items.length === 0) {
+    throw new Error("Items must be a non-empty array");
+  }
+
+  if (!fieldToEmbed) {
+    throw new Error("Field to embed must be specified");
+  }
+
+  const itemsWithEmbeddings: T[] = [];
+  maxEmbeddings = maxEmbeddings || items.length;
+
+  // Process in batches to avoid rate limits and memory issues
+  for (let i = 0; i < maxEmbeddings; i += batchSize) {
+    const batchEnd = Math.min(i + batchSize, maxEmbeddings, items.length);
+    console.log(`Processing batch: ${i} to ${batchEnd - 1} (of ${items.length} items)`);
+
+    const batchItems = items.slice(i, batchEnd);
+    const textsToEmbed = batchItems.map(item => {
+      if (!item[fieldToEmbed]) {
+        console.warn(`Item is missing the field to embed: ${fieldToEmbed}`);
+        return ""; // Provide a fallback value to prevent API errors
+      }
+      return item[fieldToEmbed];
+    });
+
+    try {
+      const embeddings = await createEmbeddings(client, model, textsToEmbed);
+
+      embeddings.forEach((embeddingData, index) => {
+        const originalItem = batchItems[index];
+        const newItem = {
+          ...originalItem,
+          [newEmbeddedField]: embeddingData.embedding
+        };
+        itemsWithEmbeddings.push(newItem);
+      });
+
+      // Add a small delay between batches to avoid rate limiting
+      if (batchEnd < items.length) {
+        await delay();
+      }
+    } catch (error) {
+      console.error(`Error generating embeddings for batch ${i}:`, error);
+      throw error;
+    }
+  }
+
+  return itemsWithEmbeddings;
+}
+
+
+try {
+
+  const client = new AzureOpenAI({
+    apiKey,
+    apiVersion,
+    endpoint,
+    deployment
+  });
+
+  const data = await readFileReturnJson(path.join(__dirname, "..", dataWithoutVectors));
+  const model = deployment;
+  const maxEmbeddings = data.length;
+
+  const embeddings = await processEmbeddingBatch(
+    client,
+    model,
+    fieldToEmbed,
+    newEmbeddedField,
+    maxEmbeddings,
+    data
+  );
+
+  await writeFileJson(path.join(__dirname, "..", dataWithVectors), embeddings);
+
+} catch (error) {
+  console.error(`Failed to create or save embeddings: ${(error as Error).message}`);
+}
\ No newline at end of file
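
Note on the batch loop above: the fixed 200 ms delay keeps request rates down, but Azure OpenAI can still return 429s on larger datasets. Below is a minimal retry sketch; the `embedWithRetries` helper, the status check, and the backoff constants are illustrative assumptions, not part of the sample:

```typescript
import { AzureOpenAI } from "openai";
import { Embedding } from "openai/resources";

// Hypothetical helper: retry an embeddings call with exponential backoff.
// Assumes the thrown error carries the HTTP status (openai v5 API errors
// expose a `status` field).
async function embedWithRetries(
  client: AzureOpenAI,
  model: string,
  input: string[],
  maxAttempts = 5
): Promise<Embedding[]> {
  for (let attempt = 1; ; attempt++) {
    try {
      const response = await client.embeddings.create({ model, input });
      return response.data;
    } catch (error: any) {
      const retryable = error?.status === 429 || error?.status >= 500;
      if (!retryable || attempt >= maxAttempts) throw error;
      const backoffMs = 200 * 2 ** attempt; // 400, 800, 1600, ...
      console.warn(`Attempt ${attempt} failed (${error?.status}); retrying in ${backoffMs} ms`);
      await new Promise(resolve => setTimeout(resolve, backoffMs));
    }
  }
}
```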
diff --git a/ai/vector-search-typescript/src/diskann.ts b/ai/vector-search-typescript/src/diskann.ts
new file mode 100644
index 0000000..96b547c
--- /dev/null
+++ b/ai/vector-search-typescript/src/diskann.ts
@@ -0,0 +1,108 @@
+import path from 'path';
+import { readFileReturnJson, getClientsPasswordless, insertData, printSearchResults } from './utils.js';
+
+// ESM specific features - create __dirname equivalent
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const config = {
+  query: "quintessential lodging near running trails, eateries, retail",
+  dbName: "Hotels",
+  collectionName: "hotels_diskann",
+  indexName: "vectorIndex_diskann",
+  dataFile: process.env.DATA_FILE_WITH_VECTORS!,
+  batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10),
+  embeddedField: process.env.EMBEDDED_FIELD!,
+  embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS!, 10),
+  deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL!,
+};
+
+async function main() {
+
+  const { aiClient, dbClient } = getClientsPasswordless();
+
+  try {
+
+    if (!aiClient) {
+      throw new Error('AI client is not configured. Please check your environment variables.');
+    }
+    if (!dbClient) {
+      throw new Error('Database client is not configured. Please check your environment variables.');
+    }
+
+    await dbClient.connect();
+    const db = dbClient.db(config.dbName);
+    const collection = await db.createCollection(config.collectionName);
+    console.log('Created collection:', config.collectionName);
+    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile));
+    const insertSummary = await insertData(config, collection, data);
+
+    // Create the vector index
+    const indexOptions = {
+      createIndexes: config.collectionName,
+      indexes: [
+        {
+          name: config.indexName,
+          key: {
+            [config.embeddedField]: 'cosmosSearch'
+          },
+          cosmosSearchOptions: {
+            kind: 'vector-diskann',
+            dimensions: config.embeddingDimensions,
+            similarity: 'COS', // 'COS', 'L2', 'IP'
+            maxDegree: 20,     // 20 - 2048, edges per node
+            lBuild: 10         // 10 - 500, candidate neighbors evaluated
+          }
+        }
+      ]
+    };
+    const vectorIndexSummary = await db.command(indexOptions);
+    console.log('Created vector index:', config.indexName);
+
+    // Create embedding for the query
+    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
+      model: config.deployment,
+      input: [config.query]
+    });
+
+    // Perform the vector similarity search
+    const searchResults = await collection.aggregate([
+      {
+        $search: {
+          cosmosSearch: {
+            vector: createEmbeddedForQueryResponse.data[0].embedding,
+            path: config.embeddedField,
+            k: 5
+          }
+        }
+      },
+      {
+        $project: {
+          score: {
+            $meta: "searchScore"
+          },
+          document: "$$ROOT"
+        }
+      }
+    ]).toArray();
+
+    // Print the results
+    printSearchResults(insertSummary, vectorIndexSummary, searchResults);
+
+  } catch (error) {
+    console.error('App failed:', error);
+    process.exitCode = 1;
+  } finally {
+    console.log('Closing database connection...');
+    if (dbClient) await dbClient.close();
+    console.log('Database connection closed');
+  }
+}
+
+// Execute the main function
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
});
\ No newline at end of file
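
For readers following the pipeline above: `$project` keeps the whole document under `document` and surfaces the similarity under `score`. A small typed wrapper makes the result handling explicit; the `VectorSearchHit` shape and `searchSimilar` name are assumptions for illustration, built only from the stages already shown:

```typescript
import { Collection, Document } from "mongodb";

// Hypothetical result shape produced by the $project stage above.
interface VectorSearchHit {
  score: number;      // $meta: "searchScore"; cosine similarity when the index uses 'COS'
  document: Document; // $$ROOT, the full stored document
}

async function searchSimilar(
  collection: Collection,
  queryVector: number[],
  embeddedField: string,
  k = 5
): Promise<VectorSearchHit[]> {
  return collection.aggregate<VectorSearchHit>([
    { $search: { cosmosSearch: { vector: queryVector, path: embeddedField, k } } },
    { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }
  ]).toArray();
}
```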
diff --git a/ai/vector-search-typescript/src/hnsw.ts b/ai/vector-search-typescript/src/hnsw.ts
new file mode 100644
index 0000000..771146c
--- /dev/null
+++ b/ai/vector-search-typescript/src/hnsw.ts
@@ -0,0 +1,108 @@
+import path from 'path';
+import { readFileReturnJson, getClientsPasswordless, insertData, printSearchResults } from './utils.js';
+
+// ESM specific features - create __dirname equivalent
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const config = {
+  query: "quintessential lodging near running trails, eateries, retail",
+  dbName: "Hotels",
+  collectionName: "hotels_hnsw",
+  indexName: "vectorIndex_hnsw",
+  dataFile: process.env.DATA_FILE_WITH_VECTORS!,
+  batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10),
+  embeddedField: process.env.EMBEDDED_FIELD!,
+  embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS!, 10),
+  deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL!,
+};
+
+async function main() {
+
+  const { aiClient, dbClient } = getClientsPasswordless();
+
+  try {
+
+    if (!aiClient) {
+      throw new Error('AI client is not configured. Please check your environment variables.');
+    }
+    if (!dbClient) {
+      throw new Error('Database client is not configured. Please check your environment variables.');
+    }
+
+    await dbClient.connect();
+    const db = dbClient.db(config.dbName);
+    const collection = await db.createCollection(config.collectionName);
+    console.log('Created collection:', config.collectionName);
+    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile));
+    const insertSummary = await insertData(config, collection, data);
+
+    // Create the vector index
+    const indexOptions = {
+      createIndexes: config.collectionName,
+      indexes: [
+        {
+          name: config.indexName,
+          key: {
+            [config.embeddedField]: 'cosmosSearch'
+          },
+          cosmosSearchOptions: {
+            kind: 'vector-hnsw',
+            m: 16,              // 2 - 100, default = 16, number of connections per layer
+            efConstruction: 64, // 4 - 1000, default = 64, size of the dynamic candidate list for constructing the graph
+            similarity: 'COS',  // 'COS', 'L2', 'IP'
+            dimensions: config.embeddingDimensions
+          }
+        }
+      ]
+    };
+    const vectorIndexSummary = await db.command(indexOptions);
+    console.log('Created vector index:', config.indexName);
+
+    // Create embedding for the query
+    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
+      model: config.deployment,
+      input: [config.query]
+    });
+
+    // Perform the vector similarity search
+    const searchResults = await collection.aggregate([
+      {
+        $search: {
+          cosmosSearch: {
+            vector: createEmbeddedForQueryResponse.data[0].embedding,
+            path: config.embeddedField,
+            k: 5
+          }
+        }
+      },
+      {
+        $project: {
+          score: {
+            $meta: "searchScore"
+          },
+          document: "$$ROOT"
+        }
+      }
+    ]).toArray();
+
+    // Print the results
+    printSearchResults(insertSummary, vectorIndexSummary, searchResults);
+
+  } catch (error) {
+    console.error('App failed:', error);
+    process.exitCode = 1;
+  } finally {
+    console.log('Closing database connection...');
+    if (dbClient) await dbClient.close();
+    console.log('Database connection closed');
+  }
+}
+
+// Execute the main function
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
});
\ No newline at end of file
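
HNSW also exposes a query-time accuracy knob. Under the assumption that the vCore `$search` stage accepts an `efSearch` option for HNSW indexes (not used in this sample; verify the option name and range against the current Cosmos DB docs), a larger value trades latency for recall:

```typescript
// Sketch only: efSearch is an assumed query-time option for HNSW indexes on
// Cosmos DB for MongoDB vCore; confirm against the service documentation.
const queryVector: number[] = []; // placeholder: supply a real embedding here

const pipelineWithEfSearch = [
  {
    $search: {
      cosmosSearch: {
        vector: queryVector,
        path: "text_embedding_ada_002",
        k: 5,
        efSearch: 100 // larger candidate list => better recall, slower queries
      }
    }
  },
  { $project: { score: { $meta: "searchScore" }, document: "$$ROOT" } }
];

console.log(JSON.stringify(pipelineWithEfSearch, null, 2));
```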
diff --git a/ai/vector-search-typescript/src/ivf.ts b/ai/vector-search-typescript/src/ivf.ts
new file mode 100644
index 0000000..e81ace8
--- /dev/null
+++ b/ai/vector-search-typescript/src/ivf.ts
@@ -0,0 +1,109 @@
+import path from 'path';
+import { readFileReturnJson, getClientsPasswordless, insertData, printSearchResults } from './utils.js';
+
+// ESM specific features - create __dirname equivalent
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+const __filename = fileURLToPath(import.meta.url);
+const __dirname = dirname(__filename);
+
+const config = {
+  query: "quintessential lodging near running trails, eateries, retail",
+  dbName: "Hotels",
+  collectionName: "hotels_ivf",
+  indexName: "vectorIndex_ivf",
+  dataFile: process.env.DATA_FILE_WITH_VECTORS!,
+  batchSize: parseInt(process.env.LOAD_SIZE_BATCH || '100', 10),
+  embeddedField: process.env.EMBEDDED_FIELD!,
+  embeddingDimensions: parseInt(process.env.EMBEDDING_DIMENSIONS!, 10),
+  deployment: process.env.AZURE_OPENAI_EMBEDDING_MODEL!,
+};
+
+async function main() {
+
+  const { aiClient, dbClient } = getClientsPasswordless();
+
+  try {
+
+    if (!aiClient) {
+      throw new Error('AI client is not configured. Please check your environment variables.');
+    }
+    if (!dbClient) {
+      throw new Error('Database client is not configured. Please check your environment variables.');
+    }
+
+    await dbClient.connect();
+    const db = dbClient.db(config.dbName);
+    const collection = await db.createCollection(config.collectionName);
+    console.log('Created collection:', config.collectionName);
+    const data = await readFileReturnJson(path.join(__dirname, "..", config.dataFile));
+    const insertSummary = await insertData(config, collection, data);
+
+    // Create the vector index
+    const indexOptions = {
+      createIndexes: config.collectionName,
+      indexes: [
+        {
+          name: config.indexName,
+          key: {
+            [config.embeddedField]: 'cosmosSearch'
+          },
+          cosmosSearchOptions: {
+            kind: 'vector-ivf',
+            numLists: 10,      // number of inverted-file clusters (quality/speed trade-off)
+            similarity: 'COS', // 'COS', 'L2', 'IP'
+            dimensions: config.embeddingDimensions
+          }
+        }
+      ]
+    };
+    const vectorIndexSummary = await db.command(indexOptions);
+    console.log('Created vector index:', config.indexName);
+
+    // Create embedding for the query
+    const createEmbeddedForQueryResponse = await aiClient.embeddings.create({
+      model: config.deployment,
+      input: [config.query]
+    });
+
+    // Perform the vector similarity search
+    const searchResults = await collection.aggregate([
+      {
+        $search: {
+          cosmosSearch: {
+            vector: createEmbeddedForQueryResponse.data[0].embedding,
+            path: config.embeddedField,
+            k: 5
+          },
+          returnStoredSource: true
+        }
+      },
+      {
+        $project: {
+          score: {
+            $meta: "searchScore"
+          },
+          document: "$$ROOT"
+        }
+      }
+    ]).toArray();
+
+    // Print the results
+    printSearchResults(insertSummary, vectorIndexSummary, searchResults);
+
+  } catch (error) {
+    console.error('App failed:', error);
+    process.exitCode = 1;
+  } finally {
+    console.log('Closing database connection...');
+    if (dbClient) await dbClient.close();
+    console.log('Database connection closed');
+  }
+}
+
+// Execute the main function
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
});
\ No newline at end of file
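
`numLists` is the main IVF quality/speed dial. Microsoft's guidance, as I recall it (worth re-checking in the current docs), is roughly documentCount/1000 for collections up to ~1M documents and sqrt(documentCount) beyond that; the helper below just encodes that rule of thumb and is not part of the sample:

```typescript
// Rule-of-thumb helper for picking numLists; the thresholds mirror the
// service guidance as remembered at the time of writing and are assumptions,
// not an API contract.
function suggestNumLists(documentCount: number): number {
  if (documentCount <= 0) return 1;
  const suggested = documentCount <= 1_000_000
    ? Math.ceil(documentCount / 1000)
    : Math.ceil(Math.sqrt(documentCount));
  return Math.max(1, suggested);
}

console.log(suggestNumLists(50));        // 1, tiny sample datasets like this one
console.log(suggestNumLists(500_000));   // 500
console.log(suggestNumLists(4_000_000)); // 2000
```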
diff --git a/ai/vector-search-typescript/src/showIndexes.ts b/ai/vector-search-typescript/src/showIndexes.ts
new file mode 100644
index 0000000..6b5acb6
--- /dev/null
+++ b/ai/vector-search-typescript/src/showIndexes.ts
@@ -0,0 +1,91 @@
+import { Db, MongoClient } from 'mongodb';
+
+const config = {
+  connectionString: process.env.MONGO_CONNECTION_STRING!
+};
+
+async function getAllDatabases(client: MongoClient): Promise<string[]> {
+  try {
+    // Get all database names except admin/config/local
+    const dbList = await client.db().admin().listDatabases({ nameOnly: true });
+    return dbList.databases
+      .map(db => db.name)
+      .filter((name: string) => !['admin', 'config', 'local'].includes(name));
+  } catch (error) {
+    console.error('Error listing databases:', error);
+    return [];
+  }
+}
+
+async function getAllCollections(db: Db): Promise<string[]> {
+  try {
+    const collections = await db.listCollections().toArray();
+    return collections.map(coll => coll.name);
+  } catch (error) {
+    console.error(`Error listing collections for database ${db.databaseName}:`, error);
+    return [];
+  }
+}
+
+async function getAllIndexes(db: Db, collectionName: string): Promise<void> {
+  try {
+    const collection = db.collection(collectionName);
+    const indexes = await collection.indexes();
+    console.log(`\n šŸ—ƒļø COLLECTION: ${collectionName} (${indexes.length} indexes)`);
+    console.log(JSON.stringify(indexes, null, 2));
+  } catch (error) {
+    console.error(`Error listing indexes for collection ${collectionName}:`, error);
+  }
+}
+
+async function main() {
+
+  const client = new MongoClient(config.connectionString, {
+    maxPoolSize: 5,
+    minPoolSize: 1,
+    maxIdleTimeMS: 30000,
+    connectTimeoutMS: 30000,
+    socketTimeoutMS: 60000,
+  });
+
+  try {
+
+    await client.connect();
+    const dbNames = await getAllDatabases(client);
+
+    if (dbNames.length === 0) {
+      console.log('No databases found or access denied');
+      return;
+    }
+
+    // Process each database
+    for (const dbName of dbNames) {
+      const db = client.db(dbName);
+
+      // Get collections to process
+      const collections = await getAllCollections(db);
+
+      if (collections.length === 0) {
+        console.log(`Database '${dbName}': No collections found`);
+        continue;
+      }
+
+      console.log(`\nšŸ“‚ DATABASE: ${dbName} (${collections.length} collections)`);
+
+      // Process each collection
+      for (const collName of collections) {
+        await getAllIndexes(db, collName);
+      }
+    }
+  } catch (error) {
+    console.error('Index retrieval failed:', error);
+    process.exitCode = 1;
+  } finally {
+    console.log('\nClosing database connection...');
+    await client.close();
+    console.log('Database connection closed');
+  }
+}
+
+main().catch(error => {
+  console.error('Unhandled error:', error);
+  process.exitCode = 1;
});
\ No newline at end of file
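
Because each index sample creates its collection and vector index on every run, re-running against an existing database can fail. A small cleanup sketch using standard driver calls; the `resetCollection` name and hard-coded arguments are illustrative, and the `NamespaceNotFound` check assumes the server reports a missing collection the way stock MongoDB does:

```typescript
import { MongoClient } from "mongodb";

// Drop a sample collection (and its indexes) so a script can be re-run cleanly.
async function resetCollection(connectionString: string, dbName: string, collectionName: string): Promise<void> {
  const client = new MongoClient(connectionString);
  try {
    await client.connect();
    const dropped = await client.db(dbName).collection(collectionName).drop();
    console.log(`Dropped ${dbName}.${collectionName}: ${dropped}`);
  } catch (error: any) {
    // "NamespaceNotFound" simply means there was nothing to drop.
    if (error?.codeName !== "NamespaceNotFound") throw error;
    console.log(`${dbName}.${collectionName} did not exist`);
  } finally {
    await client.close();
  }
}

await resetCollection(process.env.MONGO_CONNECTION_STRING!, "Hotels", "hotels_diskann");
```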
diff --git a/ai/vector-search-typescript/src/utils.ts b/ai/vector-search-typescript/src/utils.ts
new file mode 100644
index 0000000..f557ade
--- /dev/null
+++ b/ai/vector-search-typescript/src/utils.ts
@@ -0,0 +1,166 @@
+import { Collection, MongoClient, OIDCResponse, OIDCCallbackParams } from 'mongodb';
+import { AzureOpenAI } from 'openai';
+import { promises as fs } from "fs";
+import { AccessToken, DefaultAzureCredential, TokenCredential, getBearerTokenProvider } from '@azure/identity';
+
+// Define a type for JSON data
+export type JsonData = Record<string, any>;
+
+export const AzureIdentityTokenCallback = async (params: OIDCCallbackParams, credential: TokenCredential): Promise<OIDCResponse> => {
+  const tokenResponse: AccessToken | null = await credential.getToken(['https://ossrdbms-aad.database.windows.net/.default']);
+  // expiresOnTimestamp is in milliseconds; the driver expects seconds until expiry
+  return {
+    accessToken: tokenResponse?.token || '',
+    expiresInSeconds: Math.max(0, Math.floor(((tokenResponse?.expiresOnTimestamp || 0) - Date.now()) / 1000))
+  };
+};
+
+export function getClients(): { aiClient: AzureOpenAI; dbClient: MongoClient } {
+  const apiKey = process.env.AZURE_OPENAI_EMBEDDING_KEY!;
+  const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION!;
+  const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT!;
+  const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL!;
+  const aiClient = new AzureOpenAI({
+    apiKey,
+    apiVersion,
+    endpoint,
+    deployment
+  });
+  const dbClient = new MongoClient(process.env.MONGO_CONNECTION_STRING!, {
+    // Performance optimizations
+    maxPoolSize: 10,        // Limit concurrent connections
+    minPoolSize: 1,         // Maintain at least one connection
+    maxIdleTimeMS: 30000,   // Close idle connections after 30 seconds
+    connectTimeoutMS: 30000, // Connection timeout
+    socketTimeoutMS: 360000, // Socket timeout (for long-running operations)
+    writeConcern: {         // Optimize write concern for bulk operations
+      w: 1,                 // Acknowledge writes after primary has written
+      j: false              // Don't wait for journal commit
+    }
+  });
+
+  return { aiClient, dbClient };
+}
+
+export function getClientsPasswordless(): { aiClient: AzureOpenAI | null; dbClient: MongoClient | null } {
+  let aiClient: AzureOpenAI | null = null;
+  let dbClient: MongoClient | null = null;
+
+  // For Azure OpenAI with DefaultAzureCredential
+  const apiVersion = process.env.AZURE_OPENAI_EMBEDDING_API_VERSION;
+  const endpoint = process.env.AZURE_OPENAI_EMBEDDING_ENDPOINT;
+  const deployment = process.env.AZURE_OPENAI_EMBEDDING_MODEL;
+
+  if (apiVersion && endpoint && deployment) {
+    const credential = new DefaultAzureCredential();
+    const scope = "https://cognitiveservices.azure.com/.default";
+    const azureADTokenProvider = getBearerTokenProvider(credential, scope);
+    aiClient = new AzureOpenAI({
+      apiVersion,
+      endpoint,
+      deployment,
+      azureADTokenProvider
+    });
+  }
+
+  // For Cosmos DB with DefaultAzureCredential
+  const clusterName = process.env.MONGO_CLUSTER_NAME;
+
+  if (clusterName) {
+    const credential = new DefaultAzureCredential();
+
+    dbClient = new MongoClient(
+      `mongodb+srv://${clusterName}.global.mongocluster.cosmos.azure.com/`, {
+        connectTimeoutMS: 30000,
+        tls: true,
+        retryWrites: true,
+        authMechanism: 'MONGODB-OIDC',
+        authMechanismProperties: {
+          OIDC_CALLBACK: (params: OIDCCallbackParams) => AzureIdentityTokenCallback(params, credential),
+          ALLOWED_HOSTS: ['*.azure.com']
+        }
+      }
+    );
+  }
+
+  return { aiClient, dbClient };
+}
+
+export async function readFileReturnJson(filePath: string): Promise<JsonData[]> {
+
+  console.log(`Reading JSON file from ${filePath}`);
+
+  const fileAsString = await fs.readFile(filePath, "utf-8");
+  return JSON.parse(fileAsString);
+}
+
+export async function writeFileJson(filePath: string, jsonData: JsonData[]): Promise<void> {
+  const jsonString = JSON.stringify(jsonData, null, 2);
+  await fs.writeFile(filePath, jsonString, "utf-8");
+
+  console.log(`Wrote JSON file to ${filePath}`);
+}
+
+export async function insertData(config: { batchSize: number }, collection: Collection, data: JsonData[]) {
+  console.log(`Processing in batches of ${config.batchSize}...`);
+  const totalBatches = Math.ceil(data.length / config.batchSize);
+
+  let inserted = 0;
+  let updated = 0;
+  let skipped = 0;
+  let failed = 0;
+
+  for (let i = 0; i < totalBatches; i++) {
+    const start = i * config.batchSize;
+    const end = Math.min(start + config.batchSize, data.length);
+    const batch = data.slice(start, end);
+
+    try {
+      const result = await collection.insertMany(batch, { ordered: false });
+      inserted += result.insertedCount || 0;
+      console.log(`Batch ${i + 1} complete: ${result.insertedCount} inserted`);
+    } catch (error: any) {
+      if (error?.writeErrors) {
+        // Some documents may have been inserted despite errors
+        console.error(`Error in batch ${i + 1}: ${error.writeErrors.length} failures`);
+        failed += error.writeErrors.length;
+        inserted += batch.length - error.writeErrors.length;
+      } else {
+        console.error(`Error in batch ${i + 1}:`, error);
+        failed += batch.length;
+      }
+    }
+
+    // Small pause between batches to reduce resource contention
+    if (i < totalBatches - 1) {
+      await new Promise(resolve => setTimeout(resolve, 100));
+    }
+  }
+
+  const indexColumns = [
+    "HotelId",
+    "Category",
+    "Description",
+    "Description_fr"
+  ];
+  for (const col of indexColumns) {
+    await collection.createIndex({ [col]: 1 }); // ascending single-field index
+  }
+
+  return { total: data.length, inserted, updated, skipped, failed };
+}
+
+export function printSearchResults(insertSummary: JsonData, indexSummary: JsonData, searchResults: JsonData[]) {
+
+  console.log(`Insert summary: ${insertSummary.inserted} inserted, ${insertSummary.failed} failed (of ${insertSummary.total})`);
+  console.log(`Index command ok: ${indexSummary?.ok}`);
+
+  if (!searchResults || searchResults.length === 0) {
+    console.log('No search results found.');
+    return;
+  }
+
+  searchResults.forEach((result, index) => {
+    const { document, score } = result as any;
+    console.log(`${index + 1}. HotelName: ${document.HotelName}, Score: ${score.toFixed(4)}`);
+    //console.log(`   Description: ${document.Description}`);
+  });
+}
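
`insertData` reports `updated` and `skipped` but never increments them, because `insertMany` can only insert. If re-runs should be idempotent instead of failing on duplicate `_id` values, a `bulkWrite`-based variant can populate those counters. This is a sketch of an alternative, not a drop-in replacement; it assumes every document carries a unique `HotelId`:

```typescript
import { Collection } from "mongodb";

// Upsert a batch keyed on HotelId (assumed unique) and report what happened.
async function upsertBatch(collection: Collection, batch: Record<string, any>[]) {
  const result = await collection.bulkWrite(
    batch.map(doc => ({
      replaceOne: { filter: { HotelId: doc.HotelId }, replacement: doc, upsert: true }
    })),
    { ordered: false }
  );
  return {
    inserted: result.upsertedCount, // documents that did not exist before
    updated: result.modifiedCount,  // documents replaced with new content
    skipped: result.matchedCount - result.modifiedCount // matched but unchanged
  };
}
```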
diff --git a/ai/vector-search-typescript/tsconfig.json b/ai/vector-search-typescript/tsconfig.json
new file mode 100644
index 0000000..6e44036
--- /dev/null
+++ b/ai/vector-search-typescript/tsconfig.json
@@ -0,0 +1,23 @@
+{
+  "compilerOptions": {
+    "target": "ES2020",
+    "module": "NodeNext",
+    "moduleResolution": "nodenext",
+    "declaration": true,
+    "outDir": "./dist",
+    "strict": true,
+    "esModuleInterop": true,
+    "skipLibCheck": true,
+    "noImplicitAny": false,
+    "forceConsistentCasingInFileNames": true,
+    "sourceMap": true,
+    "resolveJsonModule": true
+  },
+  "include": [
+    "src/**/*"
+  ],
+  "exclude": [
+    "node_modules",
+    "dist"
+  ]
+}
\ No newline at end of file
diff --git a/infra/documentdb.bicep b/infra/documentdb.bicep
new file mode 100644
index 0000000..b701248
--- /dev/null
+++ b/infra/documentdb.bicep
@@ -0,0 +1,53 @@
+@description('Cluster name')
+@minLength(8)
+@maxLength(40)
+param clusterName string = 'msdocs-${uniqueString(resourceGroup().id)}'
+
+@description('Location for the cluster.')
+param location string = resourceGroup().location
+
+@description('Username for admin user')
+param adminUsername string
+
+@secure()
+@description('Password for admin user')
+@minLength(8)
+@maxLength(128)
+param adminPassword string
+
+resource cluster 'Microsoft.DocumentDB/mongoClusters@2025-09-01' = {
+  name: clusterName
+  location: location
+  properties: {
+    administrator: {
+      userName: adminUsername
+      password: adminPassword
+    }
+    serverVersion: '8.0'
+    sharding: {
+      shardCount: 1
+    }
+    storage: {
+      sizeGb: 32
+    }
+    highAvailability: {
+      targetMode: 'Disabled'
+    }
+    compute: {
+      tier: 'M10'
+    }
+  }
+}
+
+resource firewallRules 'Microsoft.DocumentDB/mongoClusters/firewallRules@2025-09-01' = {
+  parent: cluster
+  name: 'AllowAllAzureServices'
+  properties: {
+    startIpAddress: '0.0.0.0'
+    endIpAddress: '0.0.0.0'
+  }
+}
+
+output clusterName string = cluster.name
+output clusterId string = cluster.id
+
diff --git a/infra/main.bicep b/infra/main.bicep
index 3827c81..1f7d1bb 100644
--- a/infra/main.bicep
+++ b/infra/main.bicep
@@ -22,6 +22,15 @@ param location string
 @description('Id of the principal to assign database and application roles.')
 param deploymentUserPrincipalId string = ''
 
+@description('Username for DocumentDB admin user')
+param documentDbAdminUsername string
+
+@secure()
+@description('Password for DocumentDB admin user')
+@minLength(8)
+@maxLength(128)
+param documentDbAdminPassword string
+
 var resourceToken = toLower(uniqueString(subscription().id, environmentName, location))
 var tags = { 'azd-env-name': environmentName }
 var prefix = '${environmentName}${resourceToken}'
@@ -113,75 +122,100 @@ module openAi 'br/public:avm/res/cognitive-services/account:0.7.1' = {
 
 var databaseName = 'Hotels'
 
-module documentDbAccount 'br/public:avm/res/document-db/database-account:0.8.1' = {
-  name: 'documentdb-account'
+// Deploy Azure DocumentDB MongoDB Cluster (vCore)
+module documentDbCluster './documentdb.bicep' = {
+  name: 'documentdb-cluster'
   scope: resourceGroup
   params: {
-    name: 'documentdb-nosql-${prefix}'
+    clusterName: 'docdb-${resourceToken}'
     location: location
-    locations: [
-      {
-        failoverPriority: 0
-        locationName: location
-        isZoneRedundant: false
-      }
-    ]
-    tags: tags
-    disableKeyBasedMetadataWriteAccess: true
-    disableLocalAuth: true
-    networkRestrictions: {
-      publicNetworkAccess: 'Enabled'
-      ipRules: []
-      virtualNetworkRules: []
-    }
-    capabilitiesToAdd: [
-      'EnableServerless'
-    ]
-    sqlRoleDefinitions: [
-      {
-        name: 'nosql-data-plane-contributor'
-        dataAction: [
-          'Microsoft.DocumentDB/databaseAccounts/readMetadata'
-          'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/items/*'
-          'Microsoft.DocumentDB/databaseAccounts/sqlDatabases/containers/*'
-        ]
-      }
-    ]
-    sqlRoleAssignmentsPrincipalIds: union(
-      [
-        managedIdentity.outputs.principalId
-      ],
-      !empty(deploymentUserPrincipalId) ? [deploymentUserPrincipalId] : []
-    )
-    mongodbDatabases: [
-      {
-        name: databaseName
-        tags: tags
-        collections: [
-          {
-            name: 'hotels_diskann'
-            paths: [
-              '/HotelId'
-            ]
-          }
-          {
-            name: 'hotels_ivf'
-            paths: [
-              '/HotelId'
-            ]
-          }
-          {
-            name: 'hotels_hnsw'
-            paths: [
-              '/HotelId'
-            ]
-          }
-        ]
-      }
-    ]
+    adminUsername: documentDbAdminUsername
+    adminPassword: documentDbAdminPassword
   }
 }
 
+// // Deploy Azure Cosmos DB for MongoDB (Request Unit model with serverless)
+// // This provides MongoDB API compatibility with vector search capabilities
+// module documentDbAccount 'br/public:avm/res/document-db/database-account:0.11.3' = {
+//   name: 'documentdb-account'
+//   scope: resourceGroup
+//   params: {
+//     name: 'documentdb-nosql-${prefix}'
+//     location: location
+//     locations: [
+//       {
+//         failoverPriority: 0
+//         locationName: location
+//         isZoneRedundant: false
+//       }
+//     ]
+//     tags: tags
+//     disableKeyBasedMetadataWriteAccess: true
+//     disableLocalAuth: false
+//     networkRestrictions: {
+//       publicNetworkAccess: 'Enabled'
+//       ipRules: []
+//       virtualNetworkRules: []
+//     }
+//     capabilitiesToAdd: [
+//       'EnableServerless'
+//     ]
+//     mongodbDatabases: [
+//       {
+//         name: databaseName
+//         collections: [
+//           {
+//             name: 'hotels_diskann'
+//             indexes: [
+//               {
+//                 key: {
+//                   keys: [
+//                     '_id'
+//                   ]
+//                 }
+//               }
+//             ]
+//             shardKey: {
+//               HotelId: 'Hash'
+//             }
+//           }
+//           {
+//             name: 'hotels_ivf'
+//             indexes: [
+//               {
+//                 key: {
+//                   keys: [
+//                     '_id'
+//                   ]
+//                 }
+//               }
+//             ]
+//             shardKey: {
+//               HotelId: 'Hash'
+//             }
+//           }
+//           {
+//             name: 'hotels_hnsw'
+//             indexes: [
+//               {
+//                 key: {
+//                   keys: [
+//                     '_id'
+//                   ]
+//                 }
+//               }
+//             ]
+//             shardKey: {
+//               HotelId: 'Hash'
+//             }
+//           }
+//         ]
+//       }
+//     ]
+//   }
+// }
+
+
 // Azure Subscription and Resource Group outputs
 output AZURE_LOCATION string = location
 output AZURE_TENANT_ID string = tenant().tenantId
@@ -202,9 +236,12 @@ output AZURE_OPENAI_EMBEDDING_ENDPOINT string = openAi.outputs.endpoint
 output AZURE_OPENAI_EMBEDDING_API_VERSION string = embeddingModelApiVersion
 
 // DocumentDB outputs
-output AZURE_DOCUMENTDB_CLUSTER string = documentDbAccount.outputs.name
-output AZURE_DOCUMENTDB_ENDPOINT string = documentDbAccount.outputs.endpoint
-output AZURE_DOCUMENTDB_DATABASENAME string = databaseName
+//output AZURE_DOCUMENTDB_CLUSTER string = documentDbAccount.outputs.name
+//output AZURE_DOCUMENTDB_ENDPOINT string = documentDbAccount.outputs.endpoint
+//output AZURE_DOCUMENTDB_DATABASENAME string = databaseName
+output MONGO_CLUSTER_NAME string = documentDbCluster.outputs.clusterName
+//output AZURE_DOCUMENTDB_ADMIN_USERNAME string = documentDbAdminUsername
+//output AZURE_DOCUMENTDB_VCORE_CLUSTER_NAME string = documentDbCluster.outputs.clusterName
 
 // Configuration for embedding creation and vector search
 output DATA_FILE_WITH_VECTORS string = dataFileWithVectors
@@ -214,3 +251,4 @@ output EMBEDDED_FIELD string = embeddedFieldName
 output EMBEDDING_DIMENSIONS string = embeddingDimensions
 output EMBEDDING_BATCH_SIZE string = embeddingBatchSize
 output LOAD_SIZE_BATCH string = loadSizeBatch
+
diff --git a/infra/main.bicepparam b/infra/main.bicepparam
index a536325..c5ebdde 100644
--- a/infra/main.bicepparam
+++ b/infra/main.bicepparam
@@ -3,3 +3,5 @@ using './main.bicep'
 param environmentName = readEnvironmentVariable('AZURE_ENV_NAME', 'development')
 param location = readEnvironmentVariable('AZURE_LOCATION', 'eastus2')
 param deploymentUserPrincipalId = readEnvironmentVariable('AZURE_PRINCIPAL_ID', '')
+param documentDbAdminUsername = readEnvironmentVariable('DOCUMENTDB_ADMIN_USERNAME', 'docdbadmin')
+param documentDbAdminPassword = readEnvironmentVariable('DOCUMENTDB_ADMIN_PASSWORD', '')