Skip to main content

Mountain/ApplicationState/Internal/Recovery/
RecoverState.rs

1//! # RecoverState Module (Internal)
2//!
3//! ## RESPONSIBILITIES
4//! Provides state recovery utilities including validation, timeout handling,
5//! and exponential backoff for recovery operations.
6//!
7//! ## ARCHITECTURAL ROLE
8//! RecoverState is part of the **Internal::Recovery** module, providing
9//! recovery utilities for corrupted or invalid state.
10//!
11//! ## KEY COMPONENTS
12//! - validate_and_clean_state: Filters state by validator function
13//! - safe_state_operation_with_timeout: Executes operation with timeout
14//! - recover_state_with_backoff: Retries with exponential backoff
15//!
16//! ## ERROR HANDLING
17//! - Validates state before operations
18//! - Timeout protection for operations
19//! - Exponential backoff for retries
20//!
21//! ## LOGGING
22//! Operations are logged at appropriate levels (error, warn).
23//!
24//! ## PERFORMANCE CONSIDERATIONS
25//! - Efficient validation with retain
26//! - Timeout prevents hanging operations
27//! - Exponential backoff prevents overwhelming system
28//!
29//! ## TODO
30//! - [ ] Add state validation rules
31//! - [ ] Implement checkpoint recovery
32//! - [ ] Add recovery metrics collection
33
34use std::collections::HashMap;
35
36use CommonLibrary::Error::CommonError::CommonError;
37
38use crate::dev_log;
39
40/// Validates and cleans up state data by removing entries that don't pass
41/// validation.
42///
43/// # Arguments
44/// * `state_data` - The state data to validate and clean
45/// * `validator` - Function that returns true for valid entries
46///
47/// # Type Parameters
48/// * `T` - The type of values in the state map
49///
50/// # Behavior
51/// - Retains only entries where validator returns true
52/// - In-place modification of the HashMap
53pub fn validate_and_clean_state<T>(state_data:&mut HashMap<String, T>, validator:impl Fn(&T) -> bool) {
54	let original_len = state_data.len();
55	state_data.retain(|_, value| validator(value));
56	let removed_count = original_len - state_data.len();
57
58	if removed_count > 0 {
59		dev_log!(
60			"lifecycle",
61			"warn: [RecoverState] Removed {} invalid state entries ({} remaining)",
62			removed_count,
63			state_data.len()
64		);
65	}
66}
67
68/// Safe state operation with timeout protection.
69///
70/// # Arguments
71/// * `operation` - The operation to execute
72/// * `timeout_ms` - Timeout in milliseconds
73/// * `operation_name` - Name of the operation for logging
74///
75/// # Type Parameters
76/// * `T` - The return type of the operation
77/// * `F` - The operation function type
78///
79/// # Returns
80/// Result containing the operation result or CommonError
81///
82/// # Behavior
83/// - Executes operation in a separate thread
84/// - Waits for result or timeout
85/// - Returns error if timeout occurs
86pub fn safe_state_operation_with_timeout<T, F>(
87	operation:F,
88	timeout_ms:u64,
89	operation_name:&str,
90) -> Result<T, CommonError>
91where
92	F: FnOnce() -> Result<T, CommonError> + Send + 'static,
93	T: Send + 'static, {
94	let (sender, receiver) = std::sync::mpsc::channel();
95
96	std::thread::spawn(move || {
97		let result = operation();
98		let _ = sender.send(result);
99	});
100
101	match receiver.recv_timeout(std::time::Duration::from_millis(timeout_ms)) {
102		Ok(result) => result,
103		Err(_) => {
104			dev_log!(
105				"lifecycle",
106				"error: [RecoverState] Operation '{}' timed out after {}ms",
107				operation_name,
108				timeout_ms
109			);
110			Err(CommonError::Unknown { Description:format!("Operation '{}' timed out", operation_name) })
111		},
112	}
113}
114
115/// Attempt state recovery with exponential backoff.
116///
117/// # Arguments
118/// * `operation` - The operation to retry
119/// * `max_attempts` - Maximum number of retry attempts
120/// * `operation_name` - Name of the operation for logging
121///
122/// # Type Parameters
123/// * `F` - The operation function type
124/// * `T` - The return type of the operation
125///
126/// # Returns
127/// Result containing the operation result or CommonError
128///
129/// # Behavior
130/// - Retries operation up to max_attempts times
131/// - Uses exponential backoff (doubles delay after each failure)
132/// - Starts with 100ms delay
133/// - Logs each attempt and failure
134pub async fn recover_state_with_backoff<F, T>(
135	operation:F,
136	max_attempts:u32,
137	operation_name:&str,
138) -> Result<T, CommonError>
139where
140	F: Fn() -> Result<T, CommonError> + Send, {
141	let mut attempt = 0;
142	let mut delay_ms = 100;
143
144	while attempt < max_attempts {
145		match operation() {
146			Ok(result) => return Ok(result),
147			Err(error) => {
148				attempt += 1;
149				if attempt == max_attempts {
150					return Err(error);
151				}
152
153				dev_log!(
154					"lifecycle",
155					"warn: [RecoverState] Attempt {} failed for '{}': {}. Retrying in {}ms...",
156					attempt,
157					operation_name,
158					error,
159					delay_ms
160				);
161
162				tokio::time::sleep(tokio::time::Duration::from_millis(delay_ms)).await;
163
164				// Apply exponential backoff by doubling the delay after each failure
165				// to prevent overwhelming the system during recovery attempts.
166				delay_ms *= 2;
167			},
168		}
169	}
170
171	Err(CommonError::Unknown {
172		Description:format!(
173			"Failed to recover state for '{}' after {} attempts",
174			operation_name, max_attempts
175		),
176	})
177}