Mountain/ApplicationState/Internal/Recovery/RecoverState.rs
1//! # RecoverState Module (Internal)
2//!
3//! ## RESPONSIBILITIES
4//! Provides state recovery utilities including validation, timeout handling,
5//! and exponential backoff for recovery operations.
6//!
7//! ## ARCHITECTURAL ROLE
8//! RecoverState is part of the **Internal::Recovery** module, providing
9//! recovery utilities for corrupted or invalid state.
10//!
11//! ## KEY COMPONENTS
12//! - validate_and_clean_state: Filters state by validator function
13//! - safe_state_operation_with_timeout: Executes operation with timeout
14//! - recover_state_with_backoff: Retries with exponential backoff
15//!
16//! ## ERROR HANDLING
17//! - Validates state before operations
18//! - Timeout protection for operations
19//! - Exponential backoff for retries
20//!
21//! ## LOGGING
22//! Operations are logged at appropriate levels (error, warn).
23//!
24//! ## PERFORMANCE CONSIDERATIONS
25//! - Efficient validation with retain
26//! - Timeout prevents hanging operations
27//! - Exponential backoff prevents overwhelming system
28//!
29//! ## TODO
30//! - [ ] Add state validation rules
31//! - [ ] Implement checkpoint recovery
32//! - [ ] Add recovery metrics collection
33
34use std::collections::HashMap;
35
36use CommonLibrary::Error::CommonError::CommonError;
37
38use crate::dev_log;
39
40/// Validates and cleans up state data by removing entries that don't pass
41/// validation.
42///
43/// # Arguments
44/// * `state_data` - The state data to validate and clean
45/// * `validator` - Function that returns true for valid entries
46///
47/// # Type Parameters
48/// * `T` - The type of values in the state map
49///
50/// # Behavior
51/// - Retains only entries where validator returns true
52/// - In-place modification of the HashMap
53pub fn validate_and_clean_state<T>(state_data:&mut HashMap<String, T>, validator:impl Fn(&T) -> bool) {
54 let original_len = state_data.len();
55 state_data.retain(|_, value| validator(value));
56 let removed_count = original_len - state_data.len();
57
58 if removed_count > 0 {
59 dev_log!(
60 "lifecycle",
61 "warn: [RecoverState] Removed {} invalid state entries ({} remaining)",
62 removed_count,
63 state_data.len()
64 );
65 }
66}
67
68/// Safe state operation with timeout protection.
69///
70/// # Arguments
71/// * `operation` - The operation to execute
72/// * `timeout_ms` - Timeout in milliseconds
73/// * `operation_name` - Name of the operation for logging
74///
75/// # Type Parameters
76/// * `T` - The return type of the operation
77/// * `F` - The operation function type
78///
79/// # Returns
80/// Result containing the operation result or CommonError
81///
82/// # Behavior
83/// - Executes operation in a separate thread
84/// - Waits for result or timeout
85/// - Returns error if timeout occurs
86pub fn safe_state_operation_with_timeout<T, F>(
87 operation:F,
88 timeout_ms:u64,
89 operation_name:&str,
90) -> Result<T, CommonError>
91where
92 F: FnOnce() -> Result<T, CommonError> + Send + 'static,
93 T: Send + 'static, {
94 let (sender, receiver) = std::sync::mpsc::channel();
95
96 std::thread::spawn(move || {
97 let result = operation();
98 let _ = sender.send(result);
99 });
100
101 match receiver.recv_timeout(std::time::Duration::from_millis(timeout_ms)) {
102 Ok(result) => result,
103 Err(_) => {
104 dev_log!(
105 "lifecycle",
106 "error: [RecoverState] Operation '{}' timed out after {}ms",
107 operation_name,
108 timeout_ms
109 );
110 Err(CommonError::Unknown { Description:format!("Operation '{}' timed out", operation_name) })
111 },
112 }
113}
114
115/// Attempt state recovery with exponential backoff.
116///
117/// # Arguments
118/// * `operation` - The operation to retry
119/// * `max_attempts` - Maximum number of retry attempts
120/// * `operation_name` - Name of the operation for logging
121///
122/// # Type Parameters
123/// * `F` - The operation function type
124/// * `T` - The return type of the operation
125///
126/// # Returns
127/// Result containing the operation result or CommonError
128///
129/// # Behavior
130/// - Retries operation up to max_attempts times
131/// - Uses exponential backoff (doubles delay after each failure)
132/// - Starts with 100ms delay
133/// - Logs each attempt and failure
134pub async fn recover_state_with_backoff<F, T>(
135 operation:F,
136 max_attempts:u32,
137 operation_name:&str,
138) -> Result<T, CommonError>
139where
140 F: Fn() -> Result<T, CommonError> + Send, {
141 let mut attempt = 0;
142 let mut delay_ms = 100;
143
144 while attempt < max_attempts {
145 match operation() {
146 Ok(result) => return Ok(result),
147 Err(error) => {
148 attempt += 1;
149 if attempt == max_attempts {
150 return Err(error);
151 }
152
153 dev_log!(
154 "lifecycle",
155 "warn: [RecoverState] Attempt {} failed for '{}': {}. Retrying in {}ms...",
156 attempt,
157 operation_name,
158 error,
159 delay_ms
160 );
161
162 tokio::time::sleep(tokio::time::Duration::from_millis(delay_ms)).await;
163
164 // Apply exponential backoff by doubling the delay after each failure
165 // to prevent overwhelming the system during recovery attempts.
166 delay_ms *= 2;
167 },
168 }
169 }
170
171 Err(CommonError::Unknown {
172 Description:format!(
173 "Failed to recover state for '{}' after {} attempts",
174 operation_name, max_attempts
175 ),
176 })
177}