Skip to main content

AirLibrary/Indexing/Store/
QueryIndex.rs

1//! # QueryIndex
2//!
3//! ## File: Indexing/Store/QueryIndex.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides index query functionality for the File Indexer service,
8//! handling search operations across indexed files with multiple search modes.
9//!
10//! ## Primary Responsibility
11//!
12//! Query the file index to find symbols and content matching search criteria,
13//! supporting literal, regex, fuzzy, and exact search modes.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Multi-mode search (literal, regex, fuzzy, exact)
18//! - Case sensitivity and whole word matching
19//! - Path and language filtering
20//! - Result pagination and ranking
21//! - Search query sanitization
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `regex` - Regular expression search patterns
27//! - `tokio` - Async file I/O operations
28//!
29//! **Internal Modules:**
30//! - `crate::Result` - Error handling type
31//! - `crate::AirError` - Error types
32//! - `super::super::FileIndex` - Index structure definitions
33//!
34//! ## Dependents
35//!
36//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's search functionality in
41//! `src/vs/workbench/services/search/common/`
42//!
43//! ## Security Considerations
44//!
45//! - Search query sanitization prevents injection
46//! - Query length limits
47//! - Control character filtering
48//!
49//! ## Performance Considerations
50//!
51//! - Content index for fast token lookup
52//! - Result pagination to limit memory usage
53//! - Efficient string matching algorithms
54//! - Fuzzy search with configurable distance
55//!
56//! ## Error Handling Strategy
57//!
58//! Query operations return detailed error messages for invalid queries
59//! or search failures, treating individual file read errors as warnings.
60//!
61//! ## Thread Safety
62//!
63//! Query operations read from shared Arc<RwLock<>> state and
64//! return safe-ownership results for the caller.
65
66use std::path::PathBuf;
67
68use regex::Regex;
69
70use crate::{AirError, Result, dev_log};
71// Use the full paths to types in State::CreateState
72use crate::Indexing::State::CreateState::{FileIndex, FileMetadata};
73
74/// Maximum search results per query (pagination default)
75pub const MAX_SEARCH_RESULTS_DEFAULT:u32 = 100;
76
77/// Search query with multiple modes
78#[derive(Debug, Clone)]
79pub struct SearchQuery {
80	/// Search text
81	pub query:String,
82	/// Query mode (regex, literal, fuzzy)
83	pub mode:SearchMode,
84	/// Case sensitive search
85	pub case_sensitive:bool,
86	/// Exact word match
87	pub whole_word:bool,
88	/// Regex pattern (only for regex mode)
89	pub regex:Option<Regex>,
90	/// Maximum results per page
91	pub max_results:u32,
92	/// Page number for pagination
93	pub page:u32,
94}
95
96/// Search mode
97#[derive(Debug, Clone, PartialEq)]
98pub enum SearchMode {
99	/// Literal text search
100	Literal,
101	/// Regular expression search
102	Regex,
103	/// Fuzzy search with typo tolerance
104	Fuzzy,
105	/// Exact match
106	Exact,
107}
108
109/// Search result with relevance scoring
110#[derive(Debug, Clone)]
111pub struct SearchResult {
112	/// File path
113	pub path:String,
114	/// File name
115	pub file_name:String,
116	/// Matched lines with context
117	pub matches:Vec<SearchMatch>,
118	/// Relevance score (higher = more relevant)
119	pub relevance:f64,
120	/// Matched language (if applicable)
121	pub language:Option<String>,
122}
123
124/// Search match with full context
125#[derive(Debug, Clone)]
126pub struct SearchMatch {
127	/// Line number (1-indexed)
128	pub line_number:u32,
129	/// Line content
130	pub line_content:String,
131	/// Match start position
132	pub match_start:usize,
133	/// Match end position
134	pub match_end:usize,
135	/// Lines before match for context
136	pub context_before:Vec<String>,
137	/// Lines after match for context
138	pub context_after:Vec<String>,
139}
140
141/// Paginated search results
142#[derive(Debug, Clone)]
143pub struct PaginatedSearchResults {
144	/// Current page of results
145	pub results:Vec<SearchResult>,
146	/// Total number of results (across all pages)
147	pub total_count:u32,
148	/// Current page number (0-indexed)
149	pub page:u32,
150	/// Number of pages
151	pub total_pages:u32,
152	/// Results per page
153	pub page_size:u32,
154}
155
156impl IntoIterator for PaginatedSearchResults {
157	type Item = SearchResult;
158	type IntoIter = std::vec::IntoIter<SearchResult>;
159
160	fn into_iter(self) -> Self::IntoIter { self.results.into_iter() }
161}
162
163impl<'a> IntoIterator for &'a PaginatedSearchResults {
164	type Item = &'a SearchResult;
165	type IntoIter = std::slice::Iter<'a, SearchResult>;
166
167	fn into_iter(self) -> Self::IntoIter { self.results.iter() }
168}
169
170/// Search files with multiple modes and comprehensive query handling
171///
172/// Features:
173/// - Sanitized search query
174/// - Multiple search modes (literal, regex, fuzzy, exact)
175/// - Case sensitivity option
176/// - Whole word matching
177/// - Path filtering
178/// - Result pagination
179/// - Relevance-based ranking
180/// - Language filtering
181pub async fn QueryIndexSearch(
182	index:&FileIndex,
183	query:SearchQuery,
184	path_filter:Option<String>,
185	language_filter:Option<String>,
186) -> Result<PaginatedSearchResults> {
187	dev_log!(
188		"indexing",
189		"[QueryIndex] Searching for: '{}' (mode: {:?})",
190		query.query,
191		query.mode
192	);
193	// Sanitize search query
194	let sanitized_query = SanitizeSearchQuery(&query.query)?;
195
196	// Build search parameters
197	let case_sensitive = query.case_sensitive;
198	let whole_word = query.whole_word;
199	let max_results = if query.max_results == 0 {
200		MAX_SEARCH_RESULTS_DEFAULT
201	} else {
202		query.max_results.min(1000) // Cap at 1000 results
203	};
204
205	let mut all_results = Vec::new();
206
207	// Search based on mode
208	match query.mode {
209		SearchMode::Literal => {
210			QueryIndexLiteral(
211				&sanitized_query,
212				case_sensitive,
213				whole_word,
214				path_filter.as_deref(),
215				language_filter.as_deref(),
216				index,
217				&mut all_results,
218			)
219			.await;
220		},
221		SearchMode::Regex => {
222			if let Some(regex) = &query.regex {
223				QueryIndexRegex(
224					regex,
225					path_filter.as_deref(),
226					language_filter.as_deref(),
227					index,
228					&mut all_results,
229				)
230				.await;
231			} else {
232				// Try to compile regex from query
233				if let Ok(regex) = Regex::new(&sanitized_query) {
234					QueryIndexRegex(
235						&regex,
236						path_filter.as_deref(),
237						language_filter.as_deref(),
238						index,
239						&mut all_results,
240					)
241					.await;
242				}
243			}
244		},
245		SearchMode::Fuzzy => {
246			QueryIndexFuzzy(
247				&sanitized_query,
248				case_sensitive,
249				path_filter.as_deref(),
250				language_filter.as_deref(),
251				index,
252				&mut all_results,
253			)
254			.await;
255		},
256		SearchMode::Exact => {
257			QueryIndexExact(
258				&sanitized_query,
259				case_sensitive,
260				path_filter.as_deref(),
261				language_filter.as_deref(),
262				index,
263				&mut all_results,
264			)
265			.await;
266		},
267	}
268
269	// Rank results by relevance
270	all_results.sort_by(|a, b| b.relevance.partial_cmp(&a.relevance).unwrap());
271
272	// Calculate pagination
273	let total_count = all_results.len() as u32;
274	let total_pages = if max_results == 0 { 0 } else { total_count.div_ceil(max_results) };
275	let page = query.page.min(total_pages.saturating_sub(1));
276
277	// Extract current page
278	let start = (page * max_results) as usize;
279	let end = ((page + 1) * max_results).min(total_count) as usize;
280	let page_results = all_results[start..end].to_vec();
281
282	dev_log!(
283		"indexing",
284		"[QueryIndex] Search completed: {} total results, page {} of {}",
285		total_count,
286		page + 1,
287		total_pages
288	);
289
290	Ok(PaginatedSearchResults { results:page_results, total_count, page, total_pages, page_size:max_results })
291}
292
293/// Sanitize search query to prevent injection and invalid patterns
294pub fn SanitizeSearchQuery(query:&str) -> Result<String> {
295	// Remove null bytes and control characters
296	let sanitized:String = query.chars().filter(|c| *c != '\0' && !c.is_control()).collect();
297
298	// Limit query length
299	if sanitized.len() > 1000 {
300		return Err(AirError::Validation(
301			"Search query exceeds maximum length of 1000 characters".to_string(),
302		));
303	}
304
305	Ok(sanitized)
306}
307
308/// Literal search (default mode)
309async fn QueryIndexLiteral(
310	query:&str,
311	case_sensitive:bool,
312	whole_word:bool,
313	path_filter:Option<&str>,
314	language_filter:Option<&str>,
315	index:&FileIndex,
316	results:&mut Vec<SearchResult>,
317) {
318	let search_query = if case_sensitive { query.to_string() } else { query.to_lowercase() };
319
320	// Search in content index first (faster)
321	if let Some(file_paths) = index.content_index.get(&search_query.to_lowercase()) {
322		for file_path in file_paths {
323			if let Some(metadata) = index.files.get(file_path) {
324				if MatchesFilters(file_path, metadata, path_filter, language_filter) {
325					if let Ok(search_result) =
326						FindMatchesInFile(file_path, &search_query, case_sensitive, whole_word, index).await
327					{
328						if !search_result.matches.is_empty() {
329							results.push(search_result);
330						}
331					}
332				}
333			}
334		}
335	}
336
337	// Also search in file names
338	for (file_path, metadata) in &index.files {
339		if results.len() >= 1000 {
340			break;
341		}
342
343		let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
344
345		let name_to_search = if case_sensitive { file_name.clone() } else { file_name.to_lowercase() };
346
347		if name_to_search.contains(&search_query) {
348			if MatchesFilters(file_path, metadata, path_filter, language_filter) {
349				// Filename match has lower relevance than content match
350				results.push(SearchResult {
351					path:file_path.to_string_lossy().to_string(),
352					file_name,
353					matches:Vec::new(),
354					relevance:0.3,
355					language:metadata.language.clone(),
356				});
357			}
358		}
359	}
360}
361
362/// Regex search mode
363async fn QueryIndexRegex(
364	regex:&Regex,
365	path_filter:Option<&str>,
366	language_filter:Option<&str>,
367	index:&FileIndex,
368	results:&mut Vec<SearchResult>,
369) {
370	for (file_path, metadata) in &index.files {
371		if results.len() >= 1000 {
372			break;
373		}
374
375		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
376			continue;
377		}
378
379		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
380			let matches = FindRegexMatches(&content, regex);
381
382			if !matches.is_empty() {
383				let relevance = CalculateRelevance(&matches, metadata);
384
385				results.push(SearchResult {
386					path:file_path.to_string_lossy().to_string(),
387					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
388					matches,
389					relevance,
390					language:metadata.language.clone(),
391				});
392			}
393		}
394	}
395}
396
397/// Fuzzy search with typo tolerance using Levenshtein distance
398async fn QueryIndexFuzzy(
399	query:&str,
400	case_sensitive:bool,
401	path_filter:Option<&str>,
402	language_filter:Option<&str>,
403	index:&FileIndex,
404	results:&mut Vec<SearchResult>,
405) {
406	let query_lower = query.to_lowercase();
407
408	for (file_path, metadata) in &index.files {
409		if results.len() >= 1000 {
410			break;
411		}
412
413		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
414			continue;
415		}
416
417		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
418			const MAX_FUZZY_DISTANCE:usize = 2;
419			let matches = FindFuzzyMatches(&content, &query_lower, case_sensitive, MAX_FUZZY_DISTANCE);
420
421			if !matches.is_empty() {
422				let relevance = CalculateRelevance(&matches, metadata) * 0.8; // Fuzzy matches have lower relevance
423
424				results.push(SearchResult {
425					path:file_path.to_string_lossy().to_string(),
426					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
427					matches,
428					relevance,
429					language:metadata.language.clone(),
430				});
431			}
432		}
433	}
434}
435
436/// Exact match search (whole word, case-sensitive)
437async fn QueryIndexExact(
438	query:&str,
439	_case_sensitive:bool,
440	path_filter:Option<&str>,
441	language_filter:Option<&str>,
442	index:&FileIndex,
443	results:&mut Vec<SearchResult>,
444) {
445	for (file_path, metadata) in &index.files {
446		if results.len() >= 1000 {
447			break;
448		}
449
450		if !MatchesFilters(file_path, metadata, path_filter, language_filter) {
451			continue;
452		}
453
454		if let Ok(content) = tokio::fs::read_to_string(file_path).await {
455			let matches = FindExactMatches(&content, query);
456
457			if !matches.is_empty() {
458				let relevance = CalculateRelevance(&matches, metadata) * 1.1; // Exact matches have higher relevance
459
460				results.push(SearchResult {
461					path:file_path.to_string_lossy().to_string(),
462					file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
463					matches,
464					relevance,
465					language:metadata.language.clone(),
466				});
467			}
468		}
469	}
470}
471
472/// Find matches in a single file with context
473async fn FindMatchesInFile(
474	file_path:&PathBuf,
475	query:&str,
476	case_sensitive:bool,
477	whole_word:bool,
478	index:&FileIndex,
479) -> Result<SearchResult> {
480	let content = tokio::fs::read_to_string(file_path)
481		.await
482		.map_err(|e| AirError::FileSystem(format!("Failed to read file: {}", e)))?;
483
484	let metadata = index
485		.files
486		.get(file_path)
487		.ok_or_else(|| AirError::Internal("File metadata not found in index".to_string()))?;
488
489	let matches = FindMatchesWithContext(&content, query, case_sensitive, whole_word);
490	let relevance = CalculateRelevance(&matches, metadata);
491
492	Ok(SearchResult {
493		path:file_path.to_string_lossy().to_string(),
494		file_name:file_path.file_name().unwrap_or_default().to_string_lossy().to_string(),
495		matches,
496		relevance,
497		language:metadata.language.clone(),
498	})
499}
500
501/// Find matches in content with surrounding context
502fn FindMatchesWithContext(content:&str, query:&str, case_sensitive:bool, whole_word:bool) -> Vec<SearchMatch> {
503	let mut matches = Vec::new();
504	let lines:Vec<&str> = content.lines().collect();
505
506	let search_in = |line:&str| -> Option<(usize, usize)> {
507		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
508
509		let query_to_find = if case_sensitive { query.to_string() } else { query.to_lowercase() };
510
511		let start = if whole_word {
512			FindWholeWordMatch(&line_to_search, &query_to_find)
513		} else {
514			line_to_search.find(&query_to_find)
515		};
516
517		start.map(|s| (s, s + query.len()))
518	};
519
520	for (line_idx, line) in lines.iter().enumerate() {
521		let line_number = line_idx as u32 + 1;
522
523		if let Some((match_start, match_end)) = search_in(line) {
524			// Get context lines (2 before, 2 after)
525			let context_start = line_idx.saturating_sub(2);
526			let context_end = (line_idx + 3).min(lines.len());
527
528			let context_before = lines[context_start..line_idx].iter().map(|s| s.to_string()).collect();
529
530			let context_after = lines[line_idx + 1..context_end].iter().map(|s| s.to_string()).collect();
531
532			matches.push(SearchMatch {
533				line_number,
534				line_content:line.to_string(),
535				match_start,
536				match_end,
537				context_before,
538				context_after,
539			});
540		}
541	}
542
543	matches
544}
545
546/// Find whole word match with word boundary detection
547fn FindWholeWordMatch(line:&str, word:&str) -> Option<usize> {
548	let mut start = 0;
549
550	while let Some(pos) = line[start..].find(word) {
551		let actual_pos = start + pos;
552
553		// Check word boundary before
554		let valid_before = actual_pos == 0
555			|| line
556				.chars()
557				.nth(actual_pos - 1)
558				.map_or(true, |c| !c.is_alphanumeric() && c != '_');
559
560		// Check word boundary after
561		let match_end = actual_pos + word.len();
562		let valid_after =
563			match_end == line.len() || line.chars().nth(match_end).map_or(true, |c| !c.is_alphanumeric() && c != '_');
564
565		if valid_before && valid_after {
566			return Some(actual_pos);
567		}
568
569		start = actual_pos + 1;
570	}
571
572	None
573}
574
575/// Find regex matches in content
576fn FindRegexMatches(content:&str, regex:&Regex) -> Vec<SearchMatch> {
577	let mut matches = Vec::new();
578	let lines:Vec<&str> = content.lines().collect();
579
580	for (line_idx, line) in lines.iter().enumerate() {
581		let line_number = line_idx as u32 + 1;
582
583		for mat in regex.find_iter(line) {
584			matches.push(SearchMatch {
585				line_number,
586				line_content:line.to_string(),
587				match_start:mat.start(),
588				match_end:mat.end(),
589				context_before:Vec::new(),
590				context_after:Vec::new(),
591			});
592		}
593	}
594
595	matches
596}
597
598/// Find fuzzy matches using Levenshtein distance algorithm
599fn FindFuzzyMatches(content:&str, query:&str, case_sensitive:bool, max_distance:usize) -> Vec<SearchMatch> {
600	let mut matches = Vec::new();
601	let lines:Vec<&str> = content.lines().collect();
602
603	for (line_idx, line) in lines.iter().enumerate() {
604		let line_number = line_idx as u32 + 1;
605		let line_to_search = if case_sensitive { line.to_string() } else { line.to_lowercase() };
606
607		// Calculate Levenshtein distance for fuzzy matching
608		if let Some(pos) = line_to_search.find(query) {
609			// Check if the match is within the MaxDistance threshold
610			let distance = CalculateLevenshteinDistance(&line_to_search[pos..pos.saturating_add(query.len())], query);
611
612			if distance <= max_distance {
613				matches.push(SearchMatch {
614					line_number,
615					line_content:line.to_string(),
616					match_start:pos,
617					match_end:pos + query.len(),
618					context_before:Vec::new(),
619					context_after:Vec::new(),
620				});
621			}
622		}
623	}
624
625	matches
626}
627
628/// Find exact matches (word boundary and case-sensitive)
629fn FindExactMatches(content:&str, query:&str) -> Vec<SearchMatch> { FindMatchesWithContext(content, query, true, true) }
630
631/// Calculate Levenshtein distance between two strings
632fn CalculateLevenshteinDistance(s1:&str, s2:&str) -> usize {
633	let s1_chars:Vec<char> = s1.chars().collect();
634	let s2_chars:Vec<char> = s2.chars().collect();
635	let len1 = s1_chars.len();
636	let len2 = s2_chars.len();
637
638	// Create a 2D matrix to store distances
639	let mut dp = vec![vec![0usize; len2 + 1]; len1 + 1];
640
641	// Initialize the matrix
642	for i in 0..=len1 {
643		dp[i][0] = i;
644	}
645	for j in 0..=len2 {
646		dp[0][j] = j;
647	}
648
649	// Calculate distances
650	for i in 1..=len1 {
651		for j in 1..=len2 {
652			if s1_chars[i - 1] == s2_chars[j - 1] {
653				dp[i][j] = dp[i - 1][j - 1];
654			} else {
655				dp[i][j] = 1 + [
656					dp[i - 1][j],     // deletion
657					dp[i][j - 1],     // insertion
658					dp[i - 1][j - 1], // substitution
659				]
660				.into_iter()
661				.min()
662				.unwrap();
663			}
664		}
665	}
666
667	dp[len1][len2]
668}
669
670/// Calculate relevance score for search results
671fn CalculateRelevance(matches:&[SearchMatch], metadata:&FileMetadata) -> f64 {
672	let match_count = matches.len();
673	let line_count = metadata.line_count.unwrap_or(1) as f64;
674
675	// Base relevance: ratio of matching lines to total lines
676	let mut relevance = (match_count as f64 / line_count) * 10.0;
677
678	// Bonus for more matches
679	if match_count > 0 {
680		relevance += (match_count as f64).log10() * 0.5;
681	}
682
683	// Bonus for recently modified files
684	let days_old = (chrono::Utc::now() - metadata.modified).num_days() as f64;
685	relevance += 1.0 / (days_old + 1.0).max(1.0);
686
687	relevance.min(10.0).max(0.0)
688}
689
690/// Check if file matches filters
691pub fn MatchesFilters(
692	file_path:&PathBuf,
693	metadata:&FileMetadata,
694	path_filter:Option<&str>,
695	language_filter:Option<&str>,
696) -> bool {
697	// Check path filter
698	if let Some(search_path) = path_filter {
699		if !file_path.to_string_lossy().contains(search_path) {
700			return false;
701		}
702	}
703
704	// Check language filter
705	if let Some(lang) = language_filter {
706		if metadata.language.as_deref() != Some(lang) {
707			return false;
708		}
709	}
710
711	true
712}