Skip to main content

AirLibrary/Indexing/Scan/
ScanDirectory.rs

1//! # ScanDirectory
2//!
3//! ## File: Indexing/Scan/ScanDirectory.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides directory scanning functionality for the File Indexer service,
8//! handling recursive traversal of directories to discover files for indexing.
9//!
10//! ## Primary Responsibility
11//!
12//! Scan directories recursively to discover files matching include patterns
13//! while respecting exclude patterns and filesystem limits.
14//!
15//! ## Secondary Responsibilities
16//!
17//! - Validate directory permissions before scanning
18//! - Parallel file enumeration for performance
19//! - Skip directories like node_modules, target, .git
20//! - Collect files with metadata for batch processing
21//!
22//! ## Dependencies
23//!
24//! **External Crates:**
25//! - `ignore` - .gitignore-aware directory walking
26//! - `tokio` - Async runtime for I/O operations
27//!
28//! **Internal Modules:**
29//! - `crate::Result` - Error handling type
30//! - `crate::AirError` - Error types
31//! - `crate::Configuration::IndexingConfig` - Indexing configuration
32//!
33//! ## Dependents
34//!
35//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
36//! - `Indexing::Background::StartWatcher` - Background task scanning
37//!
38//! ## VSCode Pattern Reference
39//!
40//! Inspired by VSCode's file system scanning in
41//! `src/vs/base/common/files/`
42//!
43//! ## Security Considerations
44//!
45//! - Path traversal protection through canonicalization
46//! - Symbolic link following disabled by default
47//! - Depth limits prevent infinite recursion
48//! - Permission checking before access
49//!
50//! ## Performance Considerations
51//!
52//! - Parallel directory scanning with limited concurrency
53//! - Batch collection of files for processing
54//! - Lazy evaluation with ignore crate
55//! - Early filtering by file patterns
56//!
57//! ## Error Handling Strategy
58//!
//! Scan operations log a warning for each per-file or per-entry error, count it,
//! and continue; an `Err` is returned only when the top-level operation fails
//! (for example, the directory is missing or cannot be read).
61//!
62//! ## Thread Safety
63//!
64//! Scan operations are designed to be called from async tasks and
65//! return collectable results for parallel processing.
66
67use std::{path::Path, sync::Arc};
68
69use tokio::sync::Semaphore;
70
71use crate::{
72	AirError,
73	Configuration::IndexingConfig,
74	Indexing::{Scan::ScanFile::ValidateFileAccess, State::CreateState::FileIndex},
75	Result,
76	dev_log,
77};
78
/// Counters describing the outcome of a directory scan.
///
/// All counters start at zero and only ever grow during a scan; the parallel
/// scanner adds the per-directory values together field by field.
#[derive(Clone, Debug)]
pub struct ScanDirectoryResult {
	/// Files accepted for indexing (matched a pattern and passed the access
	/// check).
	pub files_found:u32,
	/// Files passed over: symlinked paths, files above the size limit, and
	/// files that matched none of the include patterns.
	pub files_skipped:u32,
	/// Failures that were logged/counted instead of aborting the scan.
	pub errors:u32,
	/// Sum of the sizes, in bytes, of the files counted in `files_found`.
	pub total_size:u64,
}
91
92/// Scan a directory recursively and collect matching files
93///
94/// Features:
95/// - Path traversal protection
96/// - Symbolic link handling (disabled by default)
97/// - File size validation
98/// - Permission error handling
99/// - Include/exclude pattern support
100/// - Parallel scanning with semaphore limits
101pub async fn ScanDirectory(
102	path:&str,
103	patterns:Vec<String>,
104	config:&IndexingConfig,
105	_max_parallel:usize,
106) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
107	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
108
109	// Validate directory exists and is accessible
110	if !directory_path.exists() {
111		return Err(AirError::FileSystem(format!("Directory does not exist: {}", path)));
112	}
113
114	if !directory_path.is_dir() {
115		return Err(AirError::FileSystem(format!("Path is not a directory: {}", path)));
116	}
117
118	// Check directory permissions
119	CheckDirectoryPermissions(&directory_path).await?;
120
121	// Build file patterns
122	let include_patterns = if patterns.is_empty() { config.FileTypes.clone() } else { patterns };
123
124	// Walk directory with .gitignore support
125	let walker = ignore::WalkBuilder::new(&directory_path)
126		.max_depth(Some(10)) // Prevent infinite recursion
127		.hidden(false)
128		.follow_links(false) // Don't follow symlinks by default
129		.build();
130
131	let mut files_to_scan:Vec<std::path::PathBuf> = Vec::new();
132	let mut files_found = 0u32;
133	let mut files_skipped = 0u32;
134	let mut errors = 0u32;
135	let mut total_size = 0u64;
136
137	// Collect all files first
138	for result in walker {
139		match result {
140			Ok(entry) => {
141				// Only index regular files (not directories or symlinks)
142				if entry.file_type().map(|ft| ft.is_file()).unwrap_or(false) {
143					let file_path = entry.path().to_path_buf();
144
145					// Check if file is a symbolic link
146					if entry.path_is_symlink() {
147						dev_log!("indexing", "[ScanDirectory] Skipping symlink: {}", file_path.display());
148						files_skipped += 1;
149						continue;
150					}
151
152					// Check file size limit
153					if let Ok(metadata) = entry.metadata() {
154						let file_size = metadata.len();
155
156						if file_size > config.MaxFileSizeMb as u64 * 1024 * 1024 {
157							dev_log!(
158								"indexing",
159								"warn: [ScanDirectory] Skipping oversized file: {} ({} bytes)",
160								file_path.display(),
161								file_size
162							);
163							files_skipped += 1;
164							continue;
165						}
166
167						// Check file pattern
168						if MatchesPatterns(&file_path, &include_patterns) {
169							// Try to get file access to validate permissions
170							if ValidateFileAccess(&file_path).await {
171								files_to_scan.push(file_path);
172								files_found += 1;
173								total_size += file_size;
174							} else {
175								dev_log!(
176									"indexing",
177									"warn: [ScanDirectory] Cannot access file (permission denied): {}",
178									file_path.display()
179								);
180								errors += 1;
181							}
182						} else {
183							files_skipped += 1;
184						}
185					} else {
186						errors += 1;
187					}
188				}
189			},
190			Err(e) => {
191				dev_log!("indexing", "warn: [ScanDirectory] Error walking directory: {}", e);
192				errors += 1;
193			},
194		}
195	}
196
197	dev_log!(
198		"indexing",
199		"[ScanDirectory] Directory scan completed: {} files, {} skipped, {} errors, {} bytes",
200		files_found,
201		files_skipped,
202		errors,
203		total_size
204	);
205
206	Ok((
207		files_to_scan,
208		ScanDirectoryResult { files_found, files_skipped, errors, total_size },
209	))
210}
211
212/// Scan a directory and remove deleted files from index
213pub async fn ScanAndRemoveDeleted(index:&mut FileIndex, directory_path:&Path) -> Result<u32> {
214	let mut paths_to_remove = Vec::new();
215	let all_paths:Vec<_> = index.files.keys().cloned().collect();
216
217	for path in all_paths {
218		if !path.exists() && path.starts_with(directory_path) {
219			paths_to_remove.push(path.clone());
220		}
221	}
222
223	let removed_count = paths_to_remove.len();
224	for path in paths_to_remove {
225		index.files.remove(&path);
226		index.file_symbols.remove(&path);
227
228		// Remove from symbol index
229		for (_, locations) in index.symbol_index.iter_mut() {
230			locations.retain(|loc| loc.file_path != path);
231		}
232
233		// Remove from content index
234		for (_, files) in index.content_index.iter_mut() {
235			files.retain(|p| p != &path);
236		}
237	}
238
239	Ok(removed_count as u32)
240}
241
242/// Check directory read permissions
243async fn CheckDirectoryPermissions(path:&Path) -> Result<()> {
244	tokio::task::spawn_blocking({
245		let path = path.to_path_buf();
246		move || {
247			std::fs::read_dir(&path)
248				.map_err(|e| AirError::FileSystem(format!("Cannot read directory {}: {}", path.display(), e)))?;
249			Ok(())
250		}
251	})
252	.await?
253}
254
255/// Check if file path matches any of the provided patterns
256pub fn MatchesPatterns(file_path:&std::path::Path, patterns:&[String]) -> bool {
257	if patterns.is_empty() {
258		return true;
259	}
260
261	let file_name = file_path.file_name().unwrap_or_default().to_string_lossy().to_string();
262
263	for pattern in patterns {
264		if MatchesPattern(&file_name, pattern) {
265			return true;
266		}
267	}
268
269	false
270}
271
/// Test a file name against a single include pattern.
///
/// Two pattern forms are understood:
///
/// - `*.ext` — matches any name that ends with `.ext`. The dot is part of the
///   comparison, so `*.rs` matches `main.rs` but not `cars` or `foo.xrs`.
///   Multi-part extensions such as `*.tar.gz` work the same way.
/// - anything else — must equal the file name exactly.
///
/// Comparisons are case-sensitive.
pub fn MatchesPattern(filename:&str, pattern:&str) -> bool {
	match pattern.strip_prefix('*') {
		// `suffix` still carries its leading dot, which anchors the match to a
		// real extension boundary instead of any trailing letters.
		Some(suffix) if suffix.starts_with('.') => filename.ends_with(suffix),
		_ => filename == pattern,
	}
}
281
/// Build the built-in list of names and patterns to exclude from scanning.
///
/// The list is returned as freshly allocated `String`s in a fixed order, so
/// callers may extend or modify it freely.
pub fn GetDefaultExcludePatterns() -> Vec<String> {
	const DEFAULTS:[&str; 20] = [
		"node_modules",
		"target",
		".git",
		".svn",
		".hg",
		".bzr",
		"dist",
		"build",
		".next",
		".nuxt",
		"__pycache__",
		"*.pyc",
		".venv",
		"venv",
		"env",
		".env",
		".idea",
		".vscode",
		".DS_Store",
		"Thumbs.db",
	];

	DEFAULTS.iter().map(|entry| entry.to_string()).collect()
}
307
308/// Parallel scan of multiple directories
309pub async fn ScanDirectoriesParallel(
310	directories:Vec<String>,
311	patterns:Vec<String>,
312	config:&IndexingConfig,
313	max_parallel:usize,
314) -> Result<(Vec<std::path::PathBuf>, ScanDirectoryResult)> {
315	let semaphore = Arc::new(Semaphore::new(max_parallel));
316	let mut all_files = Vec::new();
317	let mut total_result = ScanDirectoryResult { files_found:0, files_skipped:0, errors:0, total_size:0 };
318
319	let mut scan_tasks = Vec::new();
320
321	for directory in directories {
322		let permit = semaphore.clone().acquire_owned().await.unwrap();
323		let config_clone = config.clone();
324		let patterns_clone = patterns.clone();
325
326		let task = tokio::spawn(async move {
327			let _permit = permit;
328			ScanDirectory(&directory, patterns_clone, &config_clone, max_parallel).await
329		});
330
331		scan_tasks.push(task);
332	}
333
334	// Collect results
335	for task in scan_tasks {
336		match task.await {
337			Ok(Ok((files, result))) => {
338				all_files.extend(files);
339				total_result.files_found += result.files_found;
340				total_result.files_skipped += result.files_skipped;
341				total_result.errors += result.errors;
342				total_result.total_size += result.total_size;
343			},
344			Ok(Err(e)) => {
345				dev_log!("indexing", "error: [ScanDirectory] Parallel scan failed: {}", e);
346				total_result.errors += 1;
347			},
348			Err(e) => {
349				dev_log!("indexing", "error: [ScanDirectory] Parallel task panicked: {}", e);
350				total_result.errors += 1;
351			},
352		}
353	}
354
355	Ok((all_files, total_result))
356}
357
358/// Get file count statistics for a directory without full scan
359pub async fn GetDirectoryStatistics(path:&str, max_depth:Option<usize>) -> Result<DirectoryStatistics> {
360	let directory_path = crate::Configuration::ConfigurationManager::ExpandPath(path)?;
361
362	if !directory_path.exists() || !directory_path.is_dir() {
363		return Err(AirError::FileSystem(format!("Invalid directory: {}", path)));
364	}
365
366	let mut file_count = 0u64;
367	let mut total_size = 0u64;
368	let mut directory_count = 0u64;
369	let mut hidden_count = 0u64;
370
371	let walker = ignore::WalkBuilder::new(&directory_path)
372		.max_depth(max_depth)
373		.hidden(true)
374		.follow_links(false)
375		.build();
376
377	for entry in walker.flatten() {
378		let file_type = entry.file_type().expect("Failed to get file type");
379
380		if file_type.is_file() {
381			file_count += 1;
382			if let Ok(metadata) = entry.metadata() {
383				total_size += metadata.len();
384			}
385		} else if file_type.is_dir() {
386			directory_count += 1;
387		}
388
389		if entry.depth() > 0
390			&& entry
391				.path()
392				.components()
393				.any(|c| c.as_os_str().to_string_lossy().starts_with('.'))
394		{
395			hidden_count += 1;
396		}
397	}
398
399	Ok(DirectoryStatistics { file_count, directory_count, hidden_count, total_size })
400}
401
/// Summary counters for a directory tree, as produced by
/// `GetDirectoryStatistics`.
#[derive(Clone, Debug)]
pub struct DirectoryStatistics {
	/// Number of regular files seen during the walk.
	pub file_count:u64,
	/// Number of directories seen during the walk.
	pub directory_count:u64,
	/// Number of entries whose path contains a component starting with `.`.
	pub hidden_count:u64,
	/// Sum of the sizes, in bytes, of the counted files whose metadata was
	/// readable.
	pub total_size:u64,
}