MongoDB Atlas Search and Full-Text Indexing: SQL-Style Text Search with Advanced Analytics and Ranking
Modern applications require sophisticated search capabilities that go beyond simple text matching - semantic understanding, relevance scoring, faceted search, auto-completion, and real-time search analytics. Traditional relational databases provide basic full-text search through extensions like PostgreSQL's pg_trgm or MySQL's MATCH AGAINST, but struggle with advanced search features, relevance ranking, and the performance demands of modern search applications.
MongoDB Atlas Search provides enterprise-grade search capabilities built on Apache Lucene, delivering advanced full-text search, semantic search, vector search, and search analytics directly integrated with your MongoDB data. Unlike external search engines that require complex data synchronization, Atlas Search maintains real-time consistency with your database while providing powerful search features typically found only in dedicated search platforms.
The Traditional Search Challenge
Relational database search approaches have significant limitations for modern applications:
-- Traditional SQL full-text search - limited and inefficient
-- PostgreSQL full-text search approach
CREATE TABLE articles (
article_id SERIAL PRIMARY KEY,
title VARCHAR(500) NOT NULL,
content TEXT NOT NULL,
author_id INTEGER REFERENCES users(user_id),
category VARCHAR(100),
tags TEXT[],
published_date TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
view_count INTEGER DEFAULT 0,
-- Full-text search vectors
title_tsvector TSVECTOR,
content_tsvector TSVECTOR,
combined_tsvector TSVECTOR
);
-- Create full-text search indexes
CREATE INDEX idx_articles_title_fts ON articles USING GIN(title_tsvector);
CREATE INDEX idx_articles_content_fts ON articles USING GIN(content_tsvector);
CREATE INDEX idx_articles_combined_fts ON articles USING GIN(combined_tsvector);
-- Maintain search vectors with triggers
CREATE OR REPLACE FUNCTION update_article_search_vectors()
RETURNS TRIGGER AS $$
BEGIN
NEW.title_tsvector := to_tsvector('english', NEW.title);
NEW.content_tsvector := to_tsvector('english', NEW.content);
NEW.combined_tsvector := to_tsvector('english',
NEW.title || ' ' || NEW.content || ' ' || array_to_string(NEW.tags, ' '));
RETURN NEW;
END;
$$ LANGUAGE plpgsql;
CREATE TRIGGER trigger_update_search_vectors
BEFORE INSERT OR UPDATE ON articles
FOR EACH ROW EXECUTE FUNCTION update_article_search_vectors();
-- Basic full-text search query
SELECT
a.article_id,
a.title,
a.published_date,
a.view_count,
-- Simple relevance ranking
ts_rank(a.combined_tsvector, query) as relevance_score,
-- Highlight search terms (basic)
ts_headline('english', a.content, query,
'MaxWords=50, MinWords=10, ShortWord=3') as snippet
FROM articles a,
plainto_tsquery('english', 'machine learning algorithms') as query
WHERE a.combined_tsvector @@ query
ORDER BY ts_rank(a.combined_tsvector, query) DESC
LIMIT 20;
-- Problems with traditional full-text search:
-- 1. Limited language support and stemming capabilities
-- 2. Basic relevance scoring without advanced ranking factors
-- 3. No semantic understanding or synonym handling
-- 4. Limited faceting and aggregation capabilities
-- 5. Poor auto-completion and suggestion features
-- 6. No built-in analytics or search performance metrics
-- 7. Complex maintenance of search vectors and triggers
-- 8. Limited scalability for large document collections
-- MySQL full-text search (even more limited)
CREATE TABLE documents (
doc_id INT AUTO_INCREMENT PRIMARY KEY,
title VARCHAR(255),
content LONGTEXT,
category VARCHAR(100),
created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
FULLTEXT(title, content)
) ENGINE=InnoDB;
-- Basic MySQL full-text search
SELECT
doc_id,
title,
created_at,
MATCH(title, content) AGAINST('machine learning' IN NATURAL LANGUAGE MODE) as score
FROM documents
WHERE MATCH(title, content) AGAINST('machine learning' IN NATURAL LANGUAGE MODE)
ORDER BY score DESC
LIMIT 20;
-- MySQL limitations:
-- - Minimum word length restrictions
-- - Limited boolean query syntax
-- - Poor performance with large datasets
-- - No advanced ranking or analytics
-- - Limited customization options
MongoDB Atlas Search provides comprehensive search capabilities:
// MongoDB Atlas Search - enterprise-grade search with advanced features
const { MongoClient } = require('mongodb');
const client = new MongoClient('mongodb+srv://cluster.mongodb.net');
const db = client.db('content_platform');
const articles = db.collection('articles');
// Advanced Atlas Search query with multiple search techniques
const searchQuery = [
{
$search: {
index: "articles_search_index", // Custom search index
compound: {
must: [
// Text search with fuzzy matching
{
text: {
query: "machine learning algorithms",
path: ["title", "content"],
fuzzy: {
maxEdits: 2,
prefixLength: 1,
maxExpansions: 50
}
}
}
],
should: [
// Boost title matches
{
text: {
query: "machine learning algorithms",
path: "title",
score: { boost: { value: 3.0 } }
}
},
// Phrase matching with slop
{
phrase: {
query: "machine learning",
path: ["title", "content"],
slop: 2,
score: { boost: { value: 2.0 } }
}
},
// Semantic search using synonyms
{
text: {
query: "machine learning algorithms",
path: ["title", "content"],
synonyms: "tech_synonyms"
}
}
],
filter: [
// Date range filtering
{
range: {
path: "publishedDate",
gte: new Date("2023-01-01"),
lte: new Date("2025-12-31")
}
},
// Category filtering
{
text: {
query: ["technology", "science", "ai"],
path: "category"
}
}
],
mustNot: [
// Exclude draft articles
{
equals: {
path: "status",
value: "draft"
}
}
]
},
// Advanced highlighting
highlight: {
path: ["title", "content"],
maxCharsToExamine: 500000,
maxNumPassages: 3
},
// Count total matches
count: {
type: "total"
}
}
},
// Add computed relevance and metadata
{
$addFields: {
searchScore: { $meta: "searchScore" },
searchHighlights: { $meta: "searchHighlights" },
// Custom scoring factors
popularityScore: {
$divide: [
{ $add: ["$viewCount", "$likeCount"] },
{ $max: [{ $divide: [{ $subtract: [new Date(), "$publishedDate"] }, 86400000] }, 1] }
]
},
// Content quality indicators
contentQuality: {
$cond: {
if: { $gte: [{ $strLenCP: "$content" }, 1000] },
then: { $min: [{ $divide: [{ $strLenCP: "$content" }, 500] }, 5] },
else: 1
}
}
}
},
// Faceted aggregations for search filters
{
$facet: {
// Main search results
results: [
{
$addFields: {
finalScore: {
$add: [
"$searchScore",
{ $multiply: ["$popularityScore", 0.2] },
{ $multiply: ["$contentQuality", 0.1] }
]
}
}
},
{ $sort: { finalScore: -1 } },
{ $limit: 20 },
{
$project: {
articleId: "$_id",
title: 1,
author: 1,
category: 1,
tags: 1,
publishedDate: 1,
viewCount: 1,
searchScore: 1,
finalScore: 1,
searchHighlights: 1,
snippet: { $substr: ["$content", 0, 200] }
}
}
],
// Category facets
categoryFacets: [
{
$group: {
_id: "$category",
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
},
{ $sort: { count: -1 } },
{ $limit: 10 }
],
// Author facets
authorFacets: [
{
$group: {
_id: "$author.name",
count: { $sum: 1 },
articles: { $push: "$title" }
}
},
{ $sort: { count: -1 } },
{ $limit: 10 }
],
// Date range facets
dateFacets: [
{
$group: {
_id: {
year: { $year: "$publishedDate" },
month: { $month: "$publishedDate" }
},
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
},
{ $sort: { "_id.year": -1, "_id.month": -1 } }
],
// Search analytics
searchAnalytics: [
{
$group: {
_id: null,
totalResults: { $sum: 1 },
avgScore: { $avg: "$searchScore" },
maxScore: { $max: "$searchScore" },
scoreDistribution: {
$push: {
$switch: {
branches: [
{ case: { $gte: ["$searchScore", 10] }, then: "excellent" },
{ case: { $gte: ["$searchScore", 5] }, then: "good" },
{ case: { $gte: ["$searchScore", 2] }, then: "fair" }
],
default: "poor"
}
}
}
}
}
]
}
}
];
// Execute search with comprehensive results
const searchResults = await articles.aggregate(searchQuery).toArray();
// Benefits of MongoDB Atlas Search:
// - Advanced relevance scoring with custom ranking factors
// - Semantic search with synonym support and fuzzy matching
// - Real-time search index updates synchronized with data changes
// - Faceted search with complex aggregations
// - Advanced highlighting and snippet generation
// - Built-in analytics and search performance metrics
// - Support for multiple languages and custom analyzers
// - Vector search capabilities for AI and machine learning
// - Auto-completion and suggestion features
// - Geospatial search integration
// - Security and access control integration
Understanding MongoDB Atlas Search Architecture
Search Index Creation and Management
Implement comprehensive search indexes for optimal performance:
// Advanced Atlas Search index management system
class AtlasSearchManager {
constructor(db) {
this.db = db;
this.searchIndexes = new Map();
this.searchAnalytics = db.collection('search_analytics');
}
async createComprehensiveSearchIndex(collection, indexName, indexDefinition) {
// Create sophisticated search index with multiple field types
const advancedIndexDefinition = {
name: indexName,
definition: {
// Text search fields with different analyzers
mappings: {
dynamic: false,
fields: {
// Title field with enhanced text analysis
title: {
type: "string",
analyzer: "lucene.english",
searchAnalyzer: "lucene.keyword",
highlightAnalyzer: "lucene.english",
store: true,
indexOptions: "freqs"
},
// Content field with full-text capabilities
content: {
type: "string",
analyzer: "content_analyzer",
maxGrams: 15,
minGrams: 2,
store: true
},
// Category as both text and facet
category: [
{
type: "string",
analyzer: "lucene.keyword"
},
{
type: "stringFacet"
}
],
// Tags for exact and fuzzy matching
tags: {
type: "string",
analyzer: "lucene.standard",
multi: {
keyword: {
type: "string",
analyzer: "lucene.keyword"
}
}
},
// Author information
"author.name": {
type: "string",
analyzer: "lucene.standard",
store: true
},
"author.expertise": {
type: "stringFacet"
},
// Numeric fields for sorting and filtering
publishedDate: {
type: "date"
},
viewCount: {
type: "number",
indexIntegers: true,
indexDoubles: false
},
likeCount: {
type: "number"
},
readingTime: {
type: "number"
},
// Geospatial data
"location.coordinates": {
type: "geo"
},
// Vector field for semantic search
contentEmbedding: {
type: "knnVector",
dimensions: 1536,
similarity: "cosine"
}
}
},
// Custom analyzers
analyzers: [
{
name: "content_analyzer",
charFilters: [
{
type: "htmlStrip"
},
{
type: "mapping",
mappings: {
"& => and",
"@ => at"
}
}
],
tokenizer: {
type: "standard"
},
tokenFilters: [
{
type: "lowercase"
},
{
type: "stop",
stopwords: ["the", "a", "an", "and", "or", "but"]
},
{
type: "snowballStemming",
language: "english"
},
{
type: "length",
min: 2,
max: 100
}
]
},
{
name: "autocomplete_analyzer",
tokenizer: {
type: "edgeGram",
minGrams: 1,
maxGrams: 20
},
tokenFilters: [
{
type: "lowercase"
}
]
}
],
// Synonym mappings
synonyms: [
{
name: "tech_synonyms",
source: {
collection: "synonyms",
analyzer: "lucene.standard"
}
}
],
// Search configuration
storedSource: {
include: ["title", "author.name", "category", "publishedDate"],
exclude: ["content", "internalNotes"]
}
}
};
try {
// Create the search index
const result = await this.db.collection(collection).createSearchIndex(advancedIndexDefinition);
// Store index metadata
this.searchIndexes.set(indexName, {
collection: collection,
indexName: indexName,
definition: advancedIndexDefinition,
createdAt: new Date(),
status: 'creating'
});
console.log(`Search index '${indexName}' created for collection '${collection}'`);
return result;
} catch (error) {
console.error(`Failed to create search index '${indexName}':`, error);
throw error;
}
}
async createAutoCompleteIndex(collection, fields, indexName = 'autocomplete_index') {
// Create specialized index for auto-completion
const autoCompleteIndex = {
name: indexName,
definition: {
mappings: {
dynamic: false,
fields: fields.reduce((acc, field) => {
acc[field.path] = {
type: "autocomplete",
analyzer: "autocomplete_analyzer",
tokenization: "edgeGram",
maxGrams: field.maxGrams || 15,
minGrams: field.minGrams || 2,
foldDiacritics: true
};
return acc;
}, {})
},
analyzers: [
{
name: "autocomplete_analyzer",
tokenizer: {
type: "edgeGram",
minGrams: 2,
maxGrams: 15
},
tokenFilters: [
{
type: "lowercase"
},
{
type: "diacriticFolding"
}
]
}
]
}
};
return await this.db.collection(collection).createSearchIndex(autoCompleteIndex);
}
async performAdvancedSearch(collection, searchParams) {
// Execute sophisticated search with multiple techniques
const pipeline = [];
// Build complex search stage
const searchStage = {
$search: {
index: searchParams.index || 'default_search_index',
compound: {
must: [],
should: [],
filter: [],
mustNot: []
}
}
};
// Text search with boosting
if (searchParams.query) {
searchStage.$search.compound.must.push({
text: {
query: searchParams.query,
path: searchParams.searchFields || ['title', 'content'],
fuzzy: searchParams.fuzzy || {
maxEdits: 2,
prefixLength: 1
}
}
});
// Boost title matches
searchStage.$search.compound.should.push({
text: {
query: searchParams.query,
path: 'title',
score: { boost: { value: 3.0 } }
}
});
// Phrase matching
if (searchParams.phraseSearch) {
searchStage.$search.compound.should.push({
phrase: {
query: searchParams.query,
path: ['title', 'content'],
slop: 2,
score: { boost: { value: 2.0 } }
}
});
}
}
// Vector search for semantic similarity
if (searchParams.vectorQuery) {
searchStage.$search = {
knnBeta: {
vector: searchParams.vectorQuery,
path: "contentEmbedding",
k: searchParams.vectorK || 50,
score: {
boost: {
value: searchParams.vectorBoost || 1.5
}
}
}
};
}
// Filters
if (searchParams.filters) {
if (searchParams.filters.category) {
searchStage.$search.compound.filter.push({
text: {
query: searchParams.filters.category,
path: "category"
}
});
}
if (searchParams.filters.dateRange) {
searchStage.$search.compound.filter.push({
range: {
path: "publishedDate",
gte: new Date(searchParams.filters.dateRange.start),
lte: new Date(searchParams.filters.dateRange.end)
}
});
}
if (searchParams.filters.author) {
searchStage.$search.compound.filter.push({
text: {
query: searchParams.filters.author,
path: "author.name"
}
});
}
if (searchParams.filters.minViewCount) {
searchStage.$search.compound.filter.push({
range: {
path: "viewCount",
gte: searchParams.filters.minViewCount
}
});
}
}
// Highlighting
if (searchParams.highlight !== false) {
searchStage.$search.highlight = {
path: searchParams.highlightFields || ['title', 'content'],
maxCharsToExamine: 500000,
maxNumPassages: 5
};
}
// Count configuration
if (searchParams.count) {
searchStage.$search.count = {
type: searchParams.count.type || 'total',
threshold: searchParams.count.threshold || 1000
};
}
pipeline.push(searchStage);
// Add scoring and ranking
pipeline.push({
$addFields: {
searchScore: { $meta: "searchScore" },
searchHighlights: { $meta: "searchHighlights" },
// Custom relevance scoring
relevanceScore: {
$add: [
"$searchScore",
// Boost recent content
{
$multiply: [
{
$max: [
0,
{
$subtract: [
30,
{
$divide: [
{ $subtract: [new Date(), "$publishedDate"] },
86400000
]
}
]
}
]
},
0.1
]
},
// Boost popular content
{
$multiply: [
{ $log10: { $max: [1, "$viewCount"] } },
0.2
]
},
// Boost quality content
{
$multiply: [
{ $min: [{ $divide: [{ $strLenCP: "$content" }, 1000] }, 3] },
0.15
]
}
]
}
}
});
// Faceted search results
if (searchParams.facets) {
pipeline.push({
$facet: {
results: [
{ $sort: { relevanceScore: -1 } },
{ $skip: searchParams.skip || 0 },
{ $limit: searchParams.limit || 20 },
{
$project: {
_id: 1,
title: 1,
author: 1,
category: 1,
tags: 1,
publishedDate: 1,
viewCount: 1,
likeCount: 1,
searchScore: 1,
relevanceScore: 1,
searchHighlights: 1,
snippet: { $substr: ["$content", 0, 250] },
readingTime: 1
}
}
],
facets: this.buildFacetPipeline(searchParams.facets),
totalCount: [
{ $count: "total" }
]
}
});
} else {
// Simple results without faceting
pipeline.push(
{ $sort: { relevanceScore: -1 } },
{ $skip: searchParams.skip || 0 },
{ $limit: searchParams.limit || 20 }
);
}
// Execute search and track analytics
const startTime = Date.now();
const results = await this.db.collection(collection).aggregate(pipeline).toArray();
const executionTime = Date.now() - startTime;
// Log search analytics
await this.logSearchAnalytics(searchParams, results, executionTime);
return results;
}
buildFacetPipeline(facetConfig) {
const facetPipeline = {};
if (facetConfig.category) {
facetPipeline.categories = [
{
$group: {
_id: "$category",
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
},
{ $sort: { count: -1 } },
{ $limit: 20 }
];
}
if (facetConfig.author) {
facetPipeline.authors = [
{
$group: {
_id: "$author.name",
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" },
expertise: { $first: "$author.expertise" }
}
},
{ $sort: { count: -1 } },
{ $limit: 15 }
];
}
if (facetConfig.tags) {
facetPipeline.tags = [
{ $unwind: "$tags" },
{
$group: {
_id: "$tags",
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
},
{ $sort: { count: -1 } },
{ $limit: 25 }
];
}
if (facetConfig.dateRanges) {
facetPipeline.dateRanges = [
{
$bucket: {
groupBy: "$publishedDate",
boundaries: [
new Date("2020-01-01"),
new Date("2022-01-01"),
new Date("2023-01-01"),
new Date("2024-01-01"),
new Date("2025-01-01"),
new Date("2030-01-01")
],
default: "older",
output: {
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
}
}
];
}
if (facetConfig.viewRanges) {
facetPipeline.viewRanges = [
{
$bucket: {
groupBy: "$viewCount",
boundaries: [0, 100, 1000, 10000, 100000, 1000000],
default: "very_popular",
output: {
count: { $sum: 1 },
avgScore: { $avg: "$searchScore" }
}
}
}
];
}
return facetPipeline;
}
async performAutoComplete(collection, query, field, limit = 10) {
// Auto-completion search
const pipeline = [
{
$search: {
index: 'autocomplete_index',
autocomplete: {
query: query,
path: field,
tokenOrder: "sequential",
fuzzy: {
maxEdits: 1,
prefixLength: 1
}
}
}
},
{
$group: {
_id: `$${field}`,
score: { $max: { $meta: "searchScore" } },
count: { $sum: 1 }
}
},
{ $sort: { score: -1, count: -1 } },
{ $limit: limit },
{
$project: {
suggestion: "$_id",
score: 1,
frequency: "$count",
_id: 0
}
}
];
return await this.db.collection(collection).aggregate(pipeline).toArray();
}
async performSemanticSearch(collection, queryVector, filters = {}, limit = 20) {
// Vector-based semantic search
const pipeline = [
{
$vectorSearch: {
index: "vector_search_index",
path: "contentEmbedding",
queryVector: queryVector,
numCandidates: limit * 10,
limit: limit,
filter: filters
}
},
{
$addFields: {
vectorScore: { $meta: "vectorSearchScore" }
}
},
{
$project: {
title: 1,
content: { $substr: ["$content", 0, 200] },
author: 1,
category: 1,
publishedDate: 1,
vectorScore: 1,
similarity: { $multiply: ["$vectorScore", 100] }
}
}
];
return await this.db.collection(collection).aggregate(pipeline).toArray();
}
async createSearchSuggestions(collection, userQuery, suggestionTypes = ['spelling', 'query', 'category']) {
// Generate search suggestions and corrections
const suggestions = {
spelling: [],
queries: [],
categories: [],
authors: []
};
// Spelling suggestions using fuzzy search
if (suggestionTypes.includes('spelling')) {
const spellingPipeline = [
{
$search: {
index: 'default_search_index',
text: {
query: userQuery,
path: ['title', 'content'],
fuzzy: {
maxEdits: 2,
prefixLength: 0
}
}
}
},
{ $limit: 5 },
{
$project: {
title: 1,
score: { $meta: "searchScore" }
}
}
];
suggestions.spelling = await this.db.collection(collection).aggregate(spellingPipeline).toArray();
}
// Query suggestions from search history
if (suggestionTypes.includes('query')) {
suggestions.queries = await this.searchAnalytics.find({
query: new RegExp(userQuery, 'i'),
resultCount: { $gt: 0 }
})
.sort({ searchCount: -1 })
.limit(5)
.project({ query: 1, resultCount: 1 })
.toArray();
}
// Category suggestions
if (suggestionTypes.includes('category')) {
const categoryPipeline = [
{
$search: {
index: 'default_search_index',
text: {
query: userQuery,
path: 'category'
}
}
},
{
$group: {
_id: "$category",
count: { $sum: 1 },
score: { $max: { $meta: "searchScore" } }
}
},
{ $sort: { score: -1, count: -1 } },
{ $limit: 5 }
];
suggestions.categories = await this.db.collection(collection).aggregate(categoryPipeline).toArray();
}
return suggestions;
}
async logSearchAnalytics(searchParams, results, executionTime) {
// Track search analytics for optimization
const analyticsDoc = {
query: searchParams.query,
searchType: this.determineSearchType(searchParams),
filters: searchParams.filters || {},
resultCount: Array.isArray(results) ? results.length :
(results[0] && results[0].totalCount ? results[0].totalCount[0]?.total : 0),
executionTime: executionTime,
timestamp: new Date(),
// Search quality metrics
avgScore: this.calculateAverageScore(results),
scoreDistribution: this.analyzeScoreDistribution(results),
// User experience metrics
hasResults: (results && results.length > 0),
fastResponse: executionTime < 500,
// Technical metrics
index: searchParams.index,
facetsRequested: !!searchParams.facets,
highlightRequested: searchParams.highlight !== false
};
await this.searchAnalytics.insertOne(analyticsDoc);
// Update search frequency
await this.searchAnalytics.updateOne(
{
query: searchParams.query,
searchType: analyticsDoc.searchType
},
{
$inc: { searchCount: 1 },
$set: { lastSearched: new Date() }
},
{ upsert: true }
);
}
determineSearchType(searchParams) {
if (searchParams.vectorQuery) return 'vector';
if (searchParams.phraseSearch) return 'phrase';
if (searchParams.fuzzy) return 'fuzzy';
return 'text';
}
calculateAverageScore(results) {
if (!results || !results.length) return 0;
const scores = results.map(r => r.searchScore || r.relevanceScore || 0);
return scores.reduce((sum, score) => sum + score, 0) / scores.length;
}
analyzeScoreDistribution(results) {
if (!results || !results.length) return {};
const scores = results.map(r => r.searchScore || r.relevanceScore || 0);
const distribution = {
excellent: scores.filter(s => s >= 10).length,
good: scores.filter(s => s >= 5 && s < 10).length,
fair: scores.filter(s => s >= 2 && s < 5).length,
poor: scores.filter(s => s < 2).length
};
return distribution;
}
async getSearchAnalytics(dateRange = {}, groupBy = 'day') {
// Comprehensive search analytics
const matchStage = {
timestamp: {
$gte: dateRange.start || new Date(Date.now() - 30 * 24 * 60 * 60 * 1000),
$lte: dateRange.end || new Date()
}
};
const pipeline = [
{ $match: matchStage },
{
$group: {
_id: this.getGroupingExpression(groupBy),
totalSearches: { $sum: 1 },
uniqueQueries: { $addToSet: "$query" },
avgExecutionTime: { $avg: "$executionTime" },
avgResultCount: { $avg: "$resultCount" },
successfulSearches: {
$sum: { $cond: [{ $gt: ["$resultCount", 0] }, 1, 0] }
},
fastSearches: {
$sum: { $cond: [{ $lt: ["$executionTime", 500] }, 1, 0] }
},
searchTypes: { $push: "$searchType" },
popularQueries: { $push: "$query" }
}
},
{
$addFields: {
uniqueQueryCount: { $size: "$uniqueQueries" },
successRate: { $divide: ["$successfulSearches", "$totalSearches"] },
performanceRate: { $divide: ["$fastSearches", "$totalSearches"] },
topQueries: {
$slice: [
{
$sortArray: {
input: {
$reduce: {
input: "$popularQueries",
initialValue: [],
in: {
$concatArrays: [
"$$value",
[{ query: "$$this", count: 1 }]
]
}
}
},
sortBy: { count: -1 }
}
},
10
]
}
}
},
{ $sort: { _id: -1 } }
];
return await this.searchAnalytics.aggregate(pipeline).toArray();
}
getGroupingExpression(groupBy) {
const dateExpressions = {
hour: {
year: { $year: "$timestamp" },
month: { $month: "$timestamp" },
day: { $dayOfMonth: "$timestamp" },
hour: { $hour: "$timestamp" }
},
day: {
year: { $year: "$timestamp" },
month: { $month: "$timestamp" },
day: { $dayOfMonth: "$timestamp" }
},
week: {
year: { $year: "$timestamp" },
week: { $week: "$timestamp" }
},
month: {
year: { $year: "$timestamp" },
month: { $month: "$timestamp" }
}
};
return dateExpressions[groupBy] || dateExpressions.day;
}
async optimizeSearchPerformance(collection, analysisRange = 30) {
// Analyze and optimize search performance
const analysisDate = new Date(Date.now() - analysisRange * 24 * 60 * 60 * 1000);
const performanceAnalysis = await this.searchAnalytics.aggregate([
{ $match: { timestamp: { $gte: analysisDate } } },
{
$group: {
_id: null,
totalSearches: { $sum: 1 },
avgExecutionTime: { $avg: "$executionTime" },
slowSearches: {
$sum: { $cond: [{ $gt: ["$executionTime", 2000] }, 1, 0] }
},
emptyResults: {
$sum: { $cond: [{ $eq: ["$resultCount", 0] }, 1, 0] }
},
commonQueries: { $push: "$query" },
slowQueries: {
$push: {
$cond: [
{ $gt: ["$executionTime", 1000] },
{ query: "$query", executionTime: "$executionTime" },
null
]
}
}
}
}
]).toArray();
const analysis = performanceAnalysis[0];
const recommendations = [];
// Performance recommendations
if (analysis.avgExecutionTime > 1000) {
recommendations.push({
type: 'performance',
issue: 'High average execution time',
recommendation: 'Consider index optimization or query refinement',
priority: 'high'
});
}
if (analysis.slowSearches / analysis.totalSearches > 0.1) {
recommendations.push({
type: 'performance',
issue: 'High percentage of slow searches',
recommendation: 'Review index configuration and query complexity',
priority: 'high'
});
}
if (analysis.emptyResults / analysis.totalSearches > 0.3) {
recommendations.push({
type: 'relevance',
issue: 'High percentage of searches with no results',
recommendation: 'Improve fuzzy matching and synonyms configuration',
priority: 'medium'
});
}
return {
analysis: analysis,
recommendations: recommendations,
generatedAt: new Date()
};
}
}
SQL-Style Search Operations with QueryLeaf
QueryLeaf provides familiar SQL syntax for MongoDB Atlas Search operations:
-- QueryLeaf Atlas Search operations with SQL-familiar syntax
-- Create full-text search index
CREATE SEARCH INDEX articles_search_idx ON articles (
-- Text fields with different analyzers
title WITH (analyzer='lucene.english', boost=3.0),
content WITH (analyzer='content_analyzer', store=true),
-- Faceted fields
category AS FACET,
"author.name" AS FACET,
tags AS FACET,
-- Numeric and date fields
publishedDate AS DATE,
viewCount AS NUMBER,
likeCount AS NUMBER,
-- Auto-completion fields
title AS AUTOCOMPLETE WITH (maxGrams=15, minGrams=2),
-- Vector field for semantic search
contentEmbedding AS VECTOR WITH (dimensions=1536, similarity='cosine')
);
-- Advanced text search with ranking
SELECT
article_id,
title,
author,
category,
published_date,
view_count,
-- Search relevance scoring
SEARCH_SCORE() as search_score,
SEARCH_HIGHLIGHTS('title', 'content') as highlights,
-- Custom relevance calculation
(SEARCH_SCORE() +
LOG10(GREATEST(1, view_count)) * 0.2 +
CASE
WHEN published_date >= CURRENT_DATE - INTERVAL '30 days' THEN 2.0
WHEN published_date >= CURRENT_DATE - INTERVAL '90 days' THEN 1.0
ELSE 0
END) as final_score
FROM articles
WHERE SEARCH_TEXT('machine learning algorithms',
fields => ARRAY['title', 'content'],
fuzzy => JSON_BUILD_OBJECT('maxEdits', 2, 'prefixLength', 1),
boost => JSON_BUILD_OBJECT('title', 3.0, 'content', 1.0)
)
AND category IN ('technology', 'science', 'ai')
AND published_date >= '2023-01-01'
AND status != 'draft'
ORDER BY final_score DESC
LIMIT 20;
-- Faceted search with aggregations
WITH search_results AS (
SELECT *,
SEARCH_SCORE() as search_score,
SEARCH_HIGHLIGHTS('title', 'content') as highlights
FROM articles
WHERE SEARCH_TEXT('artificial intelligence',
fields => ARRAY['title', 'content'],
synonyms => 'tech_synonyms'
)
)
SELECT
-- Main results
json_build_object(
'results', json_agg(
json_build_object(
'article_id', article_id,
'title', title,
'author', author,
'category', category,
'search_score', search_score,
'highlights', highlights
) ORDER BY search_score DESC LIMIT 20
),
-- Category facets
'categoryFacets', (
SELECT json_agg(
json_build_object(
'category', category,
'count', COUNT(*),
'avgScore', AVG(search_score)
)
)
FROM (
SELECT category, search_score
FROM search_results
GROUP BY category, search_score
) cat_data
GROUP BY category
ORDER BY COUNT(*) DESC
),
-- Author facets
'authorFacets', (
SELECT json_agg(
json_build_object(
'author', author->>'name',
'count', COUNT(*),
'expertise', author->>'expertise'
)
)
FROM search_results
GROUP BY author->>'name', author->>'expertise'
ORDER BY COUNT(*) DESC
LIMIT 10
),
-- Search analytics
'analytics', json_build_object(
'totalResults', COUNT(*),
'avgScore', AVG(search_score),
'maxScore', MAX(search_score),
'scoreDistribution', json_build_object(
'excellent', COUNT(*) FILTER (WHERE search_score >= 10),
'good', COUNT(*) FILTER (WHERE search_score >= 5 AND search_score < 10),
'fair', COUNT(*) FILTER (WHERE search_score >= 2 AND search_score < 5),
'poor', COUNT(*) FILTER (WHERE search_score < 2)
)
)
)
FROM search_results;
-- Auto-completion search
SELECT
suggestion,
score,
frequency
FROM AUTOCOMPLETE_SEARCH('machine lear',
field => 'title',
limit => 10,
fuzzy => JSON_BUILD_OBJECT('maxEdits', 1)
)
ORDER BY score DESC, frequency DESC;
-- Semantic vector search
SELECT
article_id,
title,
author,
category,
published_date,
VECTOR_SCORE() as similarity_score,
ROUND(VECTOR_SCORE() * 100, 2) as similarity_percentage
FROM articles
WHERE VECTOR_SEARCH(@query_embedding,
field => 'contentEmbedding',
k => 20,
filter => JSON_BUILD_OBJECT('category', ARRAY['technology', 'ai'])
)
ORDER BY similarity_score DESC;
-- Combined text and vector search (hybrid search)
WITH text_search AS (
SELECT article_id, title, author, category, published_date,
SEARCH_SCORE() as text_score,
1 as search_type
FROM articles
WHERE SEARCH_TEXT('neural networks deep learning')
ORDER BY SEARCH_SCORE() DESC
LIMIT 50
),
vector_search AS (
SELECT article_id, title, author, category, published_date,
VECTOR_SCORE() as vector_score,
2 as search_type
FROM articles
WHERE VECTOR_SEARCH(@neural_networks_embedding, field => 'contentEmbedding', k => 50)
),
combined_results AS (
-- Combine and re-rank results
SELECT
COALESCE(t.article_id, v.article_id) as article_id,
COALESCE(t.title, v.title) as title,
COALESCE(t.author, v.author) as author,
COALESCE(t.category, v.category) as category,
COALESCE(t.published_date, v.published_date) as published_date,
-- Hybrid scoring
COALESCE(t.text_score, 0) * 0.6 + COALESCE(v.vector_score, 0) * 0.4 as hybrid_score,
CASE
WHEN t.article_id IS NOT NULL AND v.article_id IS NOT NULL THEN 'both'
WHEN t.article_id IS NOT NULL THEN 'text_only'
ELSE 'vector_only'
END as match_type
FROM text_search t
FULL OUTER JOIN vector_search v ON t.article_id = v.article_id
)
SELECT * FROM combined_results
ORDER BY hybrid_score DESC, match_type = 'both' DESC
LIMIT 20;
-- Search with custom scoring and boosting
SELECT
article_id,
title,
author,
category,
published_date,
view_count,
like_count,
-- Multi-factor scoring
(
SEARCH_SCORE() * 1.0 + -- Base search relevance
LOG10(GREATEST(1, view_count)) * 0.3 + -- Popularity boost
LOG10(GREATEST(1, like_count)) * 0.2 + -- Engagement boost
CASE
WHEN published_date >= CURRENT_DATE - INTERVAL '7 days' THEN 3.0
WHEN published_date >= CURRENT_DATE - INTERVAL '30 days' THEN 2.0
WHEN published_date >= CURRENT_DATE - INTERVAL '90 days' THEN 1.0
ELSE 0
END + -- Recency boost
CASE
WHEN LENGTH(content) >= 2000 THEN 1.5
WHEN LENGTH(content) >= 1000 THEN 1.0
ELSE 0.5
END -- Content quality boost
) as comprehensive_score
FROM articles
WHERE SEARCH_COMPOUND(
must => ARRAY[
SEARCH_TEXT('blockchain cryptocurrency', fields => ARRAY['title', 'content'])
],
should => ARRAY[
SEARCH_TEXT('blockchain', field => 'title', boost => 3.0),
SEARCH_PHRASE('blockchain technology', fields => ARRAY['title', 'content'], slop => 2)
],
filter => ARRAY[
SEARCH_RANGE('published_date', gte => '2022-01-01'),
SEARCH_TERMS('category', values => ARRAY['technology', 'finance'])
],
must_not => ARRAY[
SEARCH_TERM('status', value => 'draft')
]
)
ORDER BY comprehensive_score DESC;
-- Search analytics and performance monitoring
SELECT
DATE_TRUNC('day', search_timestamp) as search_date,
search_query,
COUNT(*) as search_count,
AVG(execution_time_ms) as avg_execution_time,
AVG(result_count) as avg_results,
-- Performance metrics
COUNT(*) FILTER (WHERE execution_time_ms < 500) as fast_searches,
COUNT(*) FILTER (WHERE result_count > 0) as successful_searches,
COUNT(*) FILTER (WHERE result_count = 0) as empty_searches,
-- Search quality metrics
AVG(CASE WHEN result_count > 0 THEN avg_search_score END) as avg_relevance,
-- User behavior indicators
COUNT(DISTINCT user_id) as unique_searchers,
AVG(click_through_rate) as avg_ctr
FROM search_analytics
WHERE search_timestamp >= CURRENT_DATE - INTERVAL '30 days'
AND search_query IS NOT NULL
GROUP BY DATE_TRUNC('day', search_timestamp), search_query
HAVING COUNT(*) >= 10 -- Only frequent searches
ORDER BY search_count DESC, avg_execution_time ASC;
-- Search optimization recommendations
WITH search_performance AS (
SELECT
search_query,
COUNT(*) as frequency,
AVG(execution_time_ms) as avg_time,
AVG(result_count) as avg_results,
STDDEV(execution_time_ms) as time_variance
FROM search_analytics
WHERE search_timestamp >= CURRENT_DATE - INTERVAL '7 days'
GROUP BY search_query
HAVING COUNT(*) >= 5
),
optimization_analysis AS (
SELECT *,
CASE
WHEN avg_time > 2000 THEN 'slow_query'
WHEN avg_results = 0 THEN 'no_results'
WHEN avg_results < 5 THEN 'few_results'
WHEN time_variance > avg_time THEN 'inconsistent_performance'
ELSE 'optimal'
END as performance_category,
CASE
WHEN avg_time > 2000 THEN 'Add more specific indexes or optimize query complexity'
WHEN avg_results = 0 THEN 'Improve fuzzy matching and synonym configuration'
WHEN avg_results < 5 THEN 'Review relevance scoring and boost popular content'
WHEN time_variance > avg_time THEN 'Investigate index fragmentation or resource contention'
ELSE 'Query performing well'
END as recommendation
)
SELECT
search_query,
frequency,
ROUND(avg_time, 2) as avg_execution_time_ms,
ROUND(avg_results, 1) as avg_result_count,
performance_category,
recommendation,
-- Priority scoring
CASE
WHEN performance_category = 'slow_query' AND frequency > 100 THEN 1
WHEN performance_category = 'no_results' AND frequency > 50 THEN 2
WHEN performance_category = 'inconsistent_performance' AND frequency > 75 THEN 3
ELSE 4
END as optimization_priority
FROM optimization_analysis
WHERE performance_category != 'optimal'
ORDER BY optimization_priority, frequency DESC;
-- QueryLeaf provides comprehensive Atlas Search capabilities:
-- 1. SQL-familiar search index creation and management
-- 2. Advanced text search with custom scoring and boosting
-- 3. Faceted search with aggregations and analytics
-- 4. Auto-completion and suggestion generation
-- 5. Vector search for semantic similarity
-- 6. Hybrid search combining text and vector approaches
-- 7. Search analytics and performance monitoring
-- 8. Automated optimization recommendations
-- 9. Real-time search index synchronization
-- 10. Integration with MongoDB's native Atlas Search features
Best Practices for Atlas Search Implementation
Search Index Optimization
Essential practices for optimal search performance:
- Index Design Strategy: Design indexes specifically for your search patterns and query types
- Field Analysis: Use appropriate analyzers for different content types and languages
- Relevance Tuning: Implement custom scoring with business logic and user behavior
- Performance Monitoring: Track search analytics and optimize based on real usage patterns
- Faceting Strategy: Design facets to support filtering and discovery workflows
- Auto-completion Design: Implement sophisticated suggestion systems for user experience
Search Quality and Relevance
Optimize search quality through comprehensive relevance engineering:
- Multi-factor Scoring: Combine text relevance with business metrics and user behavior
- Semantic Enhancement: Use synonyms and vector search for better understanding
- Query Understanding: Implement fuzzy matching and error correction
- Content Quality: Factor content quality metrics into relevance scoring
- Personalization: Incorporate user preferences and search history
- A/B Testing: Continuously test and optimize search relevance algorithms
Conclusion
MongoDB Atlas Search provides enterprise-grade search capabilities that eliminate the complexity of external search engines while delivering sophisticated full-text search, semantic understanding, and search analytics. The integration of advanced search features with familiar SQL syntax makes implementing modern search applications both powerful and accessible.
Key Atlas Search benefits include:
- Native Integration: Built-in search without external dependencies or synchronization
- Advanced Relevance: Sophisticated scoring with custom business logic
- Real-time Updates: Automatic search index synchronization with data changes
- Comprehensive Analytics: Built-in search performance and user behavior tracking
- Scalable Architecture: Enterprise-grade performance with horizontal scaling
- Developer Friendly: Familiar query syntax with powerful search capabilities
Whether you're building e-commerce search, content discovery platforms, knowledge bases, or applications requiring sophisticated text analysis, MongoDB Atlas Search with QueryLeaf's familiar SQL interface provides the foundation for modern search experiences. This combination enables you to implement advanced search capabilities while preserving familiar database interaction patterns.
QueryLeaf Integration: QueryLeaf automatically manages MongoDB Atlas Search operations while providing SQL-familiar search index creation, query syntax, and analytics. Advanced search features, relevance tuning, and performance optimization are seamlessly handled through familiar SQL patterns, making enterprise-grade search both powerful and accessible.
The integration of native search capabilities with SQL-style operations makes MongoDB an ideal platform for applications requiring both sophisticated search functionality and familiar database interaction patterns, ensuring your search solutions remain both effective and maintainable as they scale and evolve.