dmsc/core/
health.rs

1//! Copyright © 2025-2026 Wenze Wei. All Rights Reserved.
2//!
3//! This file is part of DMSC.
4//! The DMSC project belongs to the Dunimd Team.
5//!
6//! Licensed under the Apache License, Version 2.0 (the "License");
7//! You may not use this file except in compliance with the License.
8//! You may obtain a copy of the License at
9//!
10//!     http://www.apache.org/licenses/LICENSE-2.0
11//!
12//! Unless required by applicable law or agreed to in writing, software
13//! distributed under the License is distributed on an "AS IS" BASIS,
14//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15//! See the License for the specific language governing permissions and
16//! limitations under the License.
17
18#![allow(non_snake_case)]
19
20//! # Health Check System
21//!
22//! This module provides comprehensive health checking functionality for DMSC modules and services.
23//! It supports both active health checks (proactive monitoring) and passive health indicators
24//! (reactive status reporting).
25//!
26//! ## Key Components
27//!
//! - **DMSCHealthStatus**: Enum representing the health state of a component
//! - **HealthCheck**: Trait for implementing custom health checks
//! - **DMSCHealthChecker**: Service for managing and executing health checks
//! - **DMSCHealthReport**: Comprehensive health status report
32//!
33//! ## Design Principles
34//!
35//! 1. **Non-Intrusive**: Health checks can be added without modifying existing code
36//! 2. **Configurable**: Check intervals, timeouts, and thresholds are configurable
37//! 3. **Comprehensive**: Supports multiple health indicators and aggregation
38//! 4. **Performance-Aware**: Minimal impact on system performance
39//! 5. **Extensible**: Easy to add new health check types
40
41use crate::core::DMSCResult;
42use serde::{Deserialize, Serialize};
43use std::collections::HashMap;
44use std::time::{Duration, SystemTime};
45
/// Health status enumeration representing the state of a component or service.
///
/// All variants are field-less, so the type derives `Copy` and can be compared with
/// `==`. It also derives the serde `Serialize`/`Deserialize` traits, and when the
/// `pyo3` feature is enabled it is annotated with pyo3's `pyclass` attribute.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
pub enum DMSCHealthStatus {
    /// Component is functioning normally
    Healthy,
    /// Component is experiencing issues but still operational
    Degraded,
    /// Component is not functioning and requires attention
    Unhealthy,
    /// Health status is unknown (check failed or not performed)
    Unknown,
}
59
60impl DMSCHealthStatus {
61    /// Returns true if the status is considered healthy (Healthy or Degraded).
62    pub fn is_healthy(&self) -> bool {
63        matches!(self, DMSCHealthStatus::Healthy | DMSCHealthStatus::Degraded)
64    }
65
66    /// Returns true if the status requires immediate attention.
67    pub fn requires_attention(&self) -> bool {
68        matches!(self, DMSCHealthStatus::Unhealthy)
69    }
70
71    /// Merges multiple health statuses into a single status.
72    /// The most severe status takes precedence: Unhealthy > Degraded > Unknown > Healthy
73    pub fn merge(statuses: &[DMSCHealthStatus]) -> DMSCHealthStatus {
74        if statuses.is_empty() {
75            return DMSCHealthStatus::Unknown;
76        }
77
78        let mut has_unhealthy = false;
79        let mut has_degraded = false;
80        let mut has_unknown = false;
81
82        for status in statuses {
83            match status {
84                DMSCHealthStatus::Unhealthy => has_unhealthy = true,
85                DMSCHealthStatus::Degraded => has_degraded = true,
86                DMSCHealthStatus::Unknown => has_unknown = true,
87                DMSCHealthStatus::Healthy => {}
88            }
89        }
90
91        if has_unhealthy {
92            DMSCHealthStatus::Unhealthy
93        } else if has_degraded {
94            DMSCHealthStatus::Degraded
95        } else if has_unknown {
96            DMSCHealthStatus::Unknown
97        } else {
98            DMSCHealthStatus::Healthy
99        }
100    }
101}
102
103impl std::fmt::Display for DMSCHealthStatus {
104    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
105        match self {
106            DMSCHealthStatus::Healthy => write!(f, "healthy"),
107            DMSCHealthStatus::Degraded => write!(f, "degraded"),
108            DMSCHealthStatus::Unhealthy => write!(f, "unhealthy"),
109            DMSCHealthStatus::Unknown => write!(f, "unknown"),
110        }
111    }
112}
113
/// Python-facing methods for `DMSCHealthStatus` (compiled only with the `pyo3` feature).
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthStatus {
    /// Python `str()`: the lowercase `Display` form, e.g. `healthy`.
    fn __str__(&self) -> String {
        self.to_string()
    }

    /// Python `repr()`: the qualified variant name, e.g. `DMSCHealthStatus::Healthy`.
    ///
    /// Uses the `Debug` form so the text names the real variant, in line with the
    /// other `__repr__` implementations in this file that render the status with `{:?}`.
    fn __repr__(&self) -> String {
        format!("DMSCHealthStatus::{:?}", self)
    }

    /// Static wrapper around `DMSCHealthStatus::merge` that accepts an owned list.
    #[staticmethod]
    fn merge_statuses(statuses: Vec<DMSCHealthStatus>) -> Self {
        DMSCHealthStatus::merge(&statuses)
    }
}
130
/// Result of a health check execution.
///
/// The constructor helpers on this type stamp `timestamp` with the current system
/// time and start `duration` at zero; `DMSCHealthChecker::check_all` overwrites
/// `duration` with the measured run time before storing the result in a report.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
pub struct DMSCHealthCheckResult {
    /// Name of the health check (used as the key in `DMSCHealthReport::components`)
    pub name: String,
    /// Health status
    pub status: DMSCHealthStatus,
    /// Optional message providing additional context
    pub message: Option<String>,
    /// Timestamp when the check was performed
    pub timestamp: SystemTime,
    /// Duration of the health check execution
    pub duration: Duration,
}
146
147impl DMSCHealthCheckResult {
148    /// Creates a new successful health check result.
149    pub fn healthy(name: String, message: Option<String>) -> Self {
150        Self {
151            name,
152            status: DMSCHealthStatus::Healthy,
153            message,
154            timestamp: SystemTime::now(),
155            duration: Duration::ZERO,
156        }
157    }
158
159    /// Creates a new degraded health check result.
160    pub fn degraded(name: String, message: Option<String>) -> Self {
161        Self {
162            name,
163            status: DMSCHealthStatus::Degraded,
164            message,
165            timestamp: SystemTime::now(),
166            duration: Duration::ZERO,
167        }
168    }
169
170    /// Creates a new unhealthy health check result.
171    pub fn unhealthy(name: String, message: Option<String>) -> Self {
172        Self {
173            name,
174            status: DMSCHealthStatus::Unhealthy,
175            message,
176            timestamp: SystemTime::now(),
177            duration: Duration::ZERO,
178        }
179    }
180
181    /// Creates a new unknown health check result.
182    pub fn unknown(name: String, message: Option<String>) -> Self {
183        Self {
184            name,
185            status: DMSCHealthStatus::Unknown,
186            message,
187            timestamp: SystemTime::now(),
188            duration: Duration::ZERO,
189        }
190    }
191}
192
/// Python-facing methods for `DMSCHealthCheckResult` (compiled only with the `pyo3` feature).
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthCheckResult {
    /// Python constructor: builds a result with an explicit status, stamped with the
    /// current system time and a zero duration.
    #[new]
    fn new_py(name: String, status: DMSCHealthStatus, message: Option<String>) -> Self {
        let timestamp = SystemTime::now();
        let duration = Duration::ZERO;
        Self {
            name,
            status,
            message,
            timestamp,
            duration,
        }
    }

    /// Static factory forwarding to `DMSCHealthCheckResult::healthy`.
    #[staticmethod]
    fn create_healthy(name: String, message: Option<String>) -> Self {
        DMSCHealthCheckResult::healthy(name, message)
    }

    /// Static factory forwarding to `DMSCHealthCheckResult::degraded`.
    #[staticmethod]
    fn create_degraded(name: String, message: Option<String>) -> Self {
        DMSCHealthCheckResult::degraded(name, message)
    }

    /// Static factory forwarding to `DMSCHealthCheckResult::unhealthy`.
    #[staticmethod]
    fn create_unhealthy(name: String, message: Option<String>) -> Self {
        DMSCHealthCheckResult::unhealthy(name, message)
    }

    /// Static factory forwarding to `DMSCHealthCheckResult::unknown`.
    #[staticmethod]
    fn create_unknown(name: String, message: Option<String>) -> Self {
        DMSCHealthCheckResult::unknown(name, message)
    }

    /// Returns a copy of the check name.
    #[getter]
    fn name(&self) -> String {
        String::clone(&self.name)
    }

    /// Returns the status (copied; the enum is `Copy`).
    #[getter]
    fn status(&self) -> DMSCHealthStatus {
        let Self { status, .. } = self;
        *status
    }

    /// Returns a copy of the optional message.
    #[getter]
    fn message(&self) -> Option<String> {
        self.message.clone()
    }

    /// Python `str()`: `<name>: <status>` using the status' `Display` form.
    fn __str__(&self) -> String {
        let Self { name, status, .. } = self;
        format!("{name}: {status}")
    }

    /// Python `repr()`: struct-like text showing the name and status in `Debug` form.
    fn __repr__(&self) -> String {
        let Self { name, status, .. } = self;
        format!("DMSCHealthCheckResult {{ name: {name:?}, status: {status:?} }}")
    }
}
250
/// Configuration for health checks.
///
/// NOTE(review): in the code visible in this file, `DMSCHealthChecker::check_all`
/// consults only `enabled` and `timeout` (read from each check's own config);
/// `check_interval` and the two thresholds are stored but not acted upon here.
/// Confirm whether a scheduler elsewhere uses them.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
pub struct DMSCHealthCheckConfig {
    /// Interval between health checks
    pub check_interval: Duration,
    /// Timeout for individual health checks
    pub timeout: Duration,
    /// Number of consecutive failures before marking as unhealthy
    pub failure_threshold: u32,
    /// Number of consecutive successes before marking as healthy
    pub success_threshold: u32,
    /// Whether the health check is enabled
    pub enabled: bool,
}
266
267impl Default for DMSCHealthCheckConfig {
268    fn default() -> Self {
269        Self {
270            check_interval: Duration::from_secs(30),
271            timeout: Duration::from_secs(5),
272            failure_threshold: 3,
273            success_threshold: 2,
274            enabled: true,
275        }
276    }
277}
278
/// Python-facing methods for `DMSCHealthCheckConfig` (compiled only with the `pyo3` feature).
///
/// Durations cross the boundary as whole seconds (`u64`); the getters use
/// `Duration::as_secs`, which drops any sub-second part.
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthCheckConfig {
    /// Python constructor; `check_interval` and `timeout` are given in whole seconds.
    #[new]
    fn new_py(check_interval: u64, timeout: u64, failure_threshold: u32, success_threshold: u32, enabled: bool) -> Self {
        let check_interval = Duration::from_secs(check_interval);
        let timeout = Duration::from_secs(timeout);
        Self {
            check_interval,
            timeout,
            failure_threshold,
            success_threshold,
            enabled,
        }
    }

    /// Static factory returning the `Default` configuration.
    #[staticmethod]
    fn default_config() -> Self {
        <Self as Default>::default()
    }

    /// Check interval in whole seconds.
    #[getter]
    fn check_interval(&self) -> u64 {
        let interval = &self.check_interval;
        interval.as_secs()
    }

    /// Sets the check interval from a number of whole seconds.
    #[setter]
    fn set_check_interval(&mut self, value: u64) {
        let interval = Duration::from_secs(value);
        self.check_interval = interval;
    }

    /// Timeout in whole seconds.
    #[getter]
    fn timeout(&self) -> u64 {
        let timeout = &self.timeout;
        timeout.as_secs()
    }

    /// Sets the timeout from a number of whole seconds.
    #[setter]
    fn set_timeout(&mut self, value: u64) {
        let timeout = Duration::from_secs(value);
        self.timeout = timeout;
    }

    /// Stored failure threshold.
    #[getter]
    fn failure_threshold(&self) -> u32 {
        let Self { failure_threshold, .. } = self;
        *failure_threshold
    }

    /// Stored success threshold.
    #[getter]
    fn success_threshold(&self) -> u32 {
        let Self { success_threshold, .. } = self;
        *success_threshold
    }

    /// Whether the check is enabled.
    #[getter]
    fn enabled(&self) -> bool {
        let Self { enabled, .. } = self;
        *enabled
    }

    /// Python `repr()`: struct-like text with both durations shown in whole seconds.
    fn __repr__(&self) -> String {
        let interval_secs = self.check_interval.as_secs();
        let timeout_secs = self.timeout.as_secs();
        let Self {
            failure_threshold,
            success_threshold,
            enabled,
            ..
        } = self;
        format!("DMSCHealthCheckConfig {{ check_interval: {interval_secs}, timeout: {timeout_secs}, failure_threshold: {failure_threshold}, success_threshold: {success_threshold}, enabled: {enabled} }}")
    }
}
338
/// Trait for implementing custom health checks.
///
/// Implementors must be `Send + Sync`. `DMSCHealthChecker::check_all` stores them as
/// boxed trait objects and, for each one, reads `config()` for the `enabled` flag and
/// the `timeout`, awaits `check()`, and uses `name()` to label the result it
/// synthesizes when the check errors or times out.
#[async_trait::async_trait]
pub trait HealthCheck: Send + Sync {
    /// Performs the health check and returns the result.
    ///
    /// Returning `Err` is reported by `DMSCHealthChecker::check_all` as an `Unknown`
    /// result whose message starts with "Check failed:".
    async fn check(&self) -> DMSCResult<DMSCHealthCheckResult>;

    /// Returns the name of this health check.
    fn name(&self) -> &str;

    /// Returns the configuration for this health check.
    fn config(&self) -> &DMSCHealthCheckConfig;
}
351
/// Comprehensive health report containing status of all components.
///
/// `add_result` maintains the counters and recomputes `overall_status` by merging the
/// statuses of everything in `components`. All fields are public, so code that
/// mutates them directly is responsible for keeping the counters in step.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
pub struct DMSCHealthReport {
    /// Overall system health status
    pub overall_status: DMSCHealthStatus,
    /// Individual component health results, keyed by each result's `name`
    pub components: HashMap<String, DMSCHealthCheckResult>,
    /// Timestamp when the report was generated
    pub timestamp: SystemTime,
    /// Total number of components checked
    pub total_components: usize,
    /// Number of healthy components
    pub healthy_count: usize,
    /// Number of degraded components
    pub degraded_count: usize,
    /// Number of unhealthy components
    pub unhealthy_count: usize,
    /// Number of unknown components
    pub unknown_count: usize,
}
373
374impl DMSCHealthReport {
375    /// Creates a new empty health report.
376    pub fn new() -> Self {
377        Self {
378            overall_status: DMSCHealthStatus::Unknown,
379            components: HashMap::new(),
380            timestamp: SystemTime::now(),
381            total_components: 0,
382            healthy_count: 0,
383            degraded_count: 0,
384            unhealthy_count: 0,
385            unknown_count: 0,
386        }
387    }
388
389    /// Adds a health check result to the report.
390    pub fn add_result(&mut self, result: DMSCHealthCheckResult) {
391        match result.status {
392            DMSCHealthStatus::Healthy => self.healthy_count += 1,
393            DMSCHealthStatus::Degraded => self.degraded_count += 1,
394            DMSCHealthStatus::Unhealthy => self.unhealthy_count += 1,
395            DMSCHealthStatus::Unknown => self.unknown_count += 1,
396        }
397        self.total_components += 1;
398        self.components.insert(result.name.clone(), result);
399        self.update_overall_status();
400    }
401
402    /// Updates the overall health status based on component statuses.
403    fn update_overall_status(&mut self) {
404        let statuses: Vec<DMSCHealthStatus> = self.components.values().map(|r| r.status).collect();
405        self.overall_status = DMSCHealthStatus::merge(&statuses);
406    }
407}
408
409impl Default for DMSCHealthReport {
410    fn default() -> Self {
411        Self::new()
412    }
413}
414
/// Python-facing methods for `DMSCHealthReport` (compiled only with the `pyo3` feature).
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthReport {
    /// Python constructor: an empty report.
    #[new]
    fn new_py() -> Self {
        DMSCHealthReport::new()
    }

    /// Static factory: an empty report.
    #[staticmethod]
    fn create() -> Self {
        DMSCHealthReport::new()
    }

    /// Static factory: builds a report by adding each result in order.
    #[staticmethod]
    fn from_results(results: Vec<DMSCHealthCheckResult>) -> Self {
        results.into_iter().fold(Self::new(), |mut report, result| {
            report.add_result(result);
            report
        })
    }

    /// Overall status of the report.
    #[getter]
    fn overall_status(&self) -> DMSCHealthStatus {
        let Self { overall_status, .. } = self;
        *overall_status
    }

    /// Value of the `total_components` counter.
    #[getter]
    fn total_components(&self) -> usize {
        let Self { total_components, .. } = self;
        *total_components
    }

    /// Value of the `healthy_count` counter.
    #[getter]
    fn healthy_count(&self) -> usize {
        let Self { healthy_count, .. } = self;
        *healthy_count
    }

    /// Value of the `degraded_count` counter.
    #[getter]
    fn degraded_count(&self) -> usize {
        let Self { degraded_count, .. } = self;
        *degraded_count
    }

    /// Value of the `unhealthy_count` counter.
    #[getter]
    fn unhealthy_count(&self) -> usize {
        let Self { unhealthy_count, .. } = self;
        *unhealthy_count
    }

    /// Value of the `unknown_count` counter.
    #[getter]
    fn unknown_count(&self) -> usize {
        let Self { unknown_count, .. } = self;
        *unknown_count
    }

    /// Python `str()`: one-line summary with the overall status and every counter.
    fn __str__(&self) -> String {
        let Self {
            overall_status,
            total_components,
            healthy_count,
            degraded_count,
            unhealthy_count,
            unknown_count,
            ..
        } = self;
        format!("DMSCHealthReport: {overall_status} ({healthy_count}/{total_components} healthy, {degraded_count} degraded, {unhealthy_count} unhealthy, {unknown_count} unknown)")
    }

    /// Python `repr()`: struct-like text with the overall status (`Debug` form) and total.
    fn __repr__(&self) -> String {
        let Self {
            overall_status,
            total_components,
            ..
        } = self;
        format!("DMSCHealthReport {{ overall_status: {overall_status:?}, total_components: {total_components} }}")
    }
}
477
/// Health checker service that manages and executes health checks.
///
/// Checks are stored as boxed `HealthCheck` trait objects in registration order.
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
pub struct DMSCHealthChecker {
    /// Registered health checks
    checks: Vec<Box<dyn HealthCheck>>,
    /// Global configuration
    // NOTE(review): stored by the constructors but not read anywhere in the code
    // visible in this file (hence the leading underscore) — confirm intended use.
    _config: DMSCHealthCheckConfig,
}
486
487impl DMSCHealthChecker {
488    /// Creates a new health checker with default configuration.
489    pub fn new() -> Self {
490        Self {
491            checks: Vec::new(),
492            _config: DMSCHealthCheckConfig::default(),
493        }
494    }
495
496    /// Creates a new health checker with custom configuration.
497    pub fn with_config(config: DMSCHealthCheckConfig) -> Self {
498        Self {
499            checks: Vec::new(),
500            _config: config,
501        }
502    }
503
504    /// Registers a health check.
505    pub fn register_check(&mut self, check: Box<dyn HealthCheck>) {
506        self.checks.push(check);
507    }
508
509    /// Performs all health checks and returns a comprehensive report.
510    pub async fn check_all(&self) -> DMSCHealthReport {
511        let mut report = DMSCHealthReport::new();
512
513        for check in &self.checks {
514            if !check.config().enabled {
515                continue;
516            }
517
518            let start_time = SystemTime::now();
519            let result = match tokio::time::timeout(check.config().timeout, check.check()).await {
520                Ok(Ok(result)) => result,
521                Ok(Err(err)) => DMSCHealthCheckResult::unknown(
522                    check.name().to_string(),
523                    Some(format!("Check failed: {err}")),
524                ),
525                Err(_) => DMSCHealthCheckResult::unknown(
526                    check.name().to_string(),
527                    Some("Check timed out".to_string()),
528                ),
529            };
530
531            let duration = SystemTime::now()
532                .duration_since(start_time)
533                .unwrap_or(Duration::ZERO);
534
535            let mut result_with_duration = result;
536            result_with_duration.duration = duration;
537            report.add_result(result_with_duration);
538        }
539
540        report
541    }
542
543    /// Gets the number of registered health checks.
544    pub fn check_count(&self) -> usize {
545        self.checks.len()
546    }
547}
548
549impl Default for DMSCHealthChecker {
550    fn default() -> Self {
551        Self::new()
552    }
553}
554
555