Skip to main content

AirLibrary/Indexing/State/
CreateState.rs

1//! # CreateState
2//!
3//! ## File: Indexing/State/CreateState.rs
4//!
5//! ## Role in Air Architecture
6//!
7//! Provides state creation functions for the File Indexer service, including
8//! the construction of index entries, symbols, and related data structures
9//! used throughout the indexing system.
10//!
11//! ## Primary Responsibility
12//!
13//! Create and initialize index state structures including FileIndex,
14//! FileMetadata, SymbolInfo, and related types.
15//!
16//! ## Secondary Responsibilities
17//!
18//! - Generate index version strings
19//! - Calculate index checksums for integrity verification
20//! - Create new empty indexes
//! - Back up corrupted indexes
22//!
23//! ## Dependencies
24//!
25//! **External Crates:**
26//! - `chrono` - Timestamp generation for index metadata
27//! - `sha2` - Checksum calculation for index integrity
28//! - `serde` - Serialization/deserialization of index structures
29//!
30//! **Internal Modules:**
31//! - `crate::Result` - Error handling type
32//! - `crate::AirError` - Error types
33//!
34//! ## Dependents
35//!
36//! - `Indexing::Store::StoreEntry` - Creates entries for index storage
37//! - `Indexing::Store::UpdateIndex` - Updates index state
38//! - `Indexing::mod::FileIndexer` - Main file indexer implementation
39//!
40//! ## VSCode Pattern Reference
41//!
42//! Inspired by VSCode's indexer state creation in
43//! `src/vs/workbench/services/search/common/`
44//!
45//! ## Security Considerations
46//!
47//! - Checksums prevent tampering with index data
48//! - Version tracking enables corruption detection
49//! - Path traversal protection applied during validation
50//!
51//! ## Performance Considerations
52//!
53//! - Lightweight state creation operations
54//! - Hash calculations are amortized across index operations
55//! - Memory-efficient data structures for large indexes
56//!
57//! ## Error Handling Strategy
58//!
59//! State creation operations use result types and propagate errors up
60//! with clear messages about what failed during creation or validation.
61//!
62//! ## Thread Safety
63//!
//! State structures are designed to be moved into `Arc<RwLock<T>>` for
65//! thread-safe shared access across indexing and search operations.
66
67use std::{collections::HashMap, path::PathBuf};
68#[cfg(unix)]
69use std::os::unix::fs::PermissionsExt;
70
71use serde::{Deserialize, Serialize};
72use sha2::{Digest, Sha256};
73
74use crate::{AirError, Result};
75
/// Maximum file size allowed for indexing: 100 MiB (104,857,600 bytes).
///
/// `ValidateFileSize` rejects any size strictly greater than this value; a
/// file of exactly this size is still accepted.
pub const MAX_FILE_SIZE_BYTES:u64 = 100 * 1024 * 1024;
78
/// Symbol information extracted from files for VSCode Outline View
///
/// One record per symbol. Instances are stored per file in
/// `FileIndex::file_symbols` and are embedded in `SymbolLocation`.
/// `CreateSymbolInfo` is the constructor helper for this type.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolInfo {
	/// Symbol name (function, class, variable, etc.)
	pub name:String,
	/// Symbol kind (function, class, struct, interface, etc.)
	pub kind:SymbolKind,
	/// Line number where symbol is defined
	///
	/// NOTE(review): whether this is zero- or one-based is not established in
	/// this module — confirm against the code that extracts symbols.
	pub line:u32,
	/// Column number (the same base caveat as `line` applies)
	pub column:u32,
	/// Fully qualified path of the symbol
	pub full_path:String,
}
93
/// Symbol kind for VSCode compatibility
///
/// Every variant carries an explicit discriminant, running contiguously from
/// `File = 0` to `TypeParameter = 25`, so the numeric value can be obtained
/// with an `as` cast (for example `SymbolKind::Class as u8 == 4`).
///
/// NOTE(review): the discriminants are presumably chosen to mirror the VSCode
/// `SymbolKind` numbering — confirm before adding, removing or reordering
/// variants, since that would shift the numeric values.
#[derive(Debug, Clone, Serialize, Deserialize, Hash, Eq, PartialEq)]
pub enum SymbolKind {
	File = 0,
	Module = 1,
	Namespace = 2,
	Package = 3,
	Class = 4,
	Method = 5,
	Property = 6,
	Field = 7,
	Constructor = 8,
	Enum = 9,
	Interface = 10,
	Function = 11,
	Variable = 12,
	Constant = 13,
	String = 14,
	Number = 15,
	Boolean = 16,
	Array = 17,
	Object = 18,
	Key = 19,
	Null = 20,
	EnumMember = 21,
	Struct = 22,
	Event = 23,
	Operator = 24,
	TypeParameter = 25,
}
124
/// Symbol location for cross-referencing
///
/// Pairs a symbol with the file that contains it. Values of this type are the
/// entries of `FileIndex::symbol_index`, keyed there by symbol name.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SymbolLocation {
	/// File containing the symbol
	pub file_path:PathBuf,
	/// Line number
	///
	/// NOTE(review): `symbol` carries its own `line` field as well; nothing in
	/// this module keeps the two values in sync — confirm which one consumers
	/// are expected to read.
	pub line:u32,
	/// Symbol information
	pub symbol:SymbolInfo,
}
135
/// File metadata with comprehensive information
///
/// One record per indexed file; stored as the values of `FileIndex::files`.
/// `CreateFileMetadata` builds a record from caller-supplied values and sets
/// `indexed_at` itself.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMetadata {
	/// File path
	pub path:PathBuf,
	/// File size in bytes
	pub size:u64,
	/// Last modification timestamp (UTC)
	pub modified:chrono::DateTime<chrono::Utc>,
	/// MIME type
	pub mime_type:String,
	/// Detected programming language, if any was detected
	pub language:Option<String>,
	/// Line count for text files; `None` when no count is available
	pub line_count:Option<u32>,
	/// SHA-256 checksum for change detection
	///
	/// Supplied by the caller; this module does not compute per-file checksums.
	pub checksum:String,
	/// Whether file is a symbolic link
	pub is_symlink:bool,
	/// File permissions (format: "rwxrwxrwx")
	///
	/// `GetPermissionsString` produces a string in this layout from
	/// `std::fs::Metadata`.
	pub permissions:String,
	/// File encoding (UTF-8, ASCII, etc.)
	pub encoding:Option<String>,
	/// Last indexed timestamp (UTC)
	pub indexed_at:chrono::DateTime<chrono::Utc>,
	/// Number of symbols extracted
	pub symbol_count:u32,
}
164
/// File index structure with comprehensive metadata
///
/// Top-level container for everything the indexer knows: per-file metadata,
/// the token and symbol lookup tables, and bookkeeping fields (timestamp,
/// version, checksum). `CreateNewIndex` returns an empty instance.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileIndex {
	/// Indexed files with complete metadata
	pub files:HashMap<PathBuf, FileMetadata>,
	/// Content index for fast text search
	/// Maps words/tokens to file paths where they appear
	pub content_index:HashMap<String, Vec<PathBuf>>,
	/// Symbol index for VSCode Outline View and Go to Symbol
	/// Maps symbol names to their definitions
	pub symbol_index:HashMap<String, Vec<SymbolLocation>>,
	/// Reverse symbol index for cross-referencing
	/// Maps each file path to the symbols recorded for that file
	pub file_symbols:HashMap<PathBuf, Vec<SymbolInfo>>,
	/// Last update timestamp for all indexes
	pub last_updated:chrono::DateTime<chrono::Utc>,
	/// Index version for corruption detection
	///
	/// `GenerateIndexVersion` produces values of the form
	/// `<crate version>-<unix seconds>`.
	pub index_version:String,
	/// Index checksum for integrity verification
	///
	/// Empty in an index returned by `CreateNewIndex`; `CalculateIndexChecksum`
	/// computes the value for a given index.
	pub index_checksum:String,
}
185
186/// Create a new empty file index
187pub fn CreateNewIndex() -> FileIndex {
188	FileIndex {
189		files:HashMap::new(),
190		content_index:HashMap::new(),
191		symbol_index:HashMap::new(),
192		file_symbols:HashMap::new(),
193		last_updated:chrono::Utc::now(),
194		index_version:GenerateIndexVersion(),
195		index_checksum:String::new(),
196	}
197}
198
199/// Generate index version string
200pub fn GenerateIndexVersion() -> String { format!("{}-{}", env!("CARGO_PKG_VERSION"), chrono::Utc::now().timestamp()) }
201
202/// Calculate index checksum for integrity verification
203pub fn CalculateIndexChecksum(index:&FileIndex) -> Result<String> {
204	let checksum_input = format!(
205		"{}:{}:{}:{}",
206		index.files.len(),
207		index.content_index.len(),
208		index.symbol_index.len(),
209		index.last_updated.timestamp()
210	);
211
212	let mut hasher = Sha256::new();
213	hasher.update(checksum_input.as_bytes());
214	Ok(format!("{:x}", hasher.finalize()))
215}
216
217/// Create file metadata from raw information
218pub fn CreateFileMetadata(
219	path:PathBuf,
220	size:u64,
221	modified:chrono::DateTime<chrono::Utc>,
222	mime_type:String,
223	language:Option<String>,
224	line_count:Option<u32>,
225	checksum:String,
226	is_symlink:bool,
227	permissions:String,
228	encoding:Option<String>,
229	symbol_count:u32,
230) -> FileMetadata {
231	FileMetadata {
232		path,
233		size,
234		modified,
235		mime_type,
236		language,
237		line_count,
238		checksum,
239		is_symlink,
240		permissions,
241		encoding,
242		indexed_at:chrono::Utc::now(),
243		symbol_count,
244	}
245}
246
247/// Create symbol info with validation
248pub fn CreateSymbolInfo(name:String, kind:SymbolKind, line:u32, column:u32, full_path:String) -> SymbolInfo {
249	SymbolInfo { name, kind, line, column, full_path }
250}
251
252/// Create symbol location for cross-referencing
253pub fn CreateSymbolLocation(file_path:PathBuf, line:u32, symbol:SymbolInfo) -> SymbolLocation {
254	SymbolLocation { file_path, line, symbol }
255}
256
/// Render the Unix permission bits of `metadata` as a nine-character string.
///
/// The result has the `rwxrwxrwx` layout: owner, group and other, each as
/// read / write / execute. A bit that is not set is shown as `-`. Only the
/// nine permission bits are inspected; any other bits in the mode are ignored.
#[cfg(unix)]
pub fn GetPermissionsString(metadata:&std::fs::Metadata) -> String {
	// (mask, flag) pairs in output order.
	const PERMISSION_BITS:[(u32, char); 9] = [
		// Owner
		(0o400, 'r'),
		(0o200, 'w'),
		(0o100, 'x'),
		// Group
		(0o040, 'r'),
		(0o020, 'w'),
		(0o010, 'x'),
		// Other
		(0o004, 'r'),
		(0o002, 'w'),
		(0o001, 'x'),
	];

	let mode_bits = metadata.permissions().mode();

	PERMISSION_BITS
		.iter()
		.map(|&(mask, flag)| if mode_bits & mask != 0 { flag } else { '-' })
		.collect()
}
278
/// Get file permissions as string for non-Unix systems
///
/// Mode bits are not read on these targets, so `_metadata` is ignored and
/// every slot is reported as unset. The placeholder is nine characters wide so
/// that it has the same width as the `rwxrwxrwx` form used for
/// `FileMetadata::permissions`.
#[cfg(not(unix))]
pub fn GetPermissionsString(_metadata:&std::fs::Metadata) -> String { "-".repeat(9) }
282
283/// Validate file size against maximum allowed
284pub fn ValidateFileSize(size:u64) -> Result<()> {
285	if size > MAX_FILE_SIZE_BYTES {
286		return Err(AirError::FileSystem(format!(
287			"File size {} exceeds maximum allowed size of {} bytes",
288			size, MAX_FILE_SIZE_BYTES
289		)));
290	}
291	Ok(())
292}
293
294/// Check if index size is within sane limits
295pub fn ValidateIndexSize(index:&FileIndex) -> Result<()> {
296	const MAX_INDEXED_FILES:usize = 1_000_000;
297	const MAX_SYMBOLS:usize = 10_000_000;
298
299	if index.files.len() > MAX_INDEXED_FILES {
300		return Err(AirError::Internal(format!(
301			"Index exceeds maximum file count: {} > {}",
302			index.files.len(),
303			MAX_INDEXED_FILES
304		)));
305	}
306
307	let total_symbols:usize = index.file_symbols.values().map(|v| v.len()).sum();
308	if total_symbols > MAX_SYMBOLS {
309		return Err(AirError::Internal(format!(
310			"Index exceeds maximum symbol count: {} > {}",
311			total_symbols, MAX_SYMBOLS
312		)));
313	}
314
315	Ok(())
316}