dmsc/service_mesh/
health_check.rs

1//! Copyright © 2025-2026 Wenze Wei. All Rights Reserved.
2//!
3//! This file is part of DMSC.
4//! The DMSC project belongs to the Dunimd Team.
5//!
6//! Licensed under the Apache License, Version 2.0 (the "License");
7//! You may not use this file except in compliance with the License.
8//! You may obtain a copy of the License at
9//!
10//!     http://www.apache.org/licenses/LICENSE-2.0
11//!
12//! Unless required by applicable law or agreed to in writing, software
13//! distributed under the License is distributed on an "AS IS" BASIS,
14//! WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15//! See the License for the specific language governing permissions and
16//! limitations under the License.
17
18//! # Health Check Module
19//! 
20//! This module provides health checking functionality for the DMSC service mesh. It allows
21//! monitoring the health of services using various protocols and provides comprehensive
22//! health status information.
23//! 
24//! ## Key Components
25//! 
26//! - **DMSCHealthCheckConfig**: Configuration for health checks
27//! - **DMSCHealthCheckResult**: Result of a health check
28//! - **DMSCHealthCheckType**: Supported health check types
29//! - **DMSCHealthCheckProvider**: Trait for implementing health check providers
30//! - **DMSCHttpHealthCheckProvider**: HTTP health check implementation
31//! - **DMSCTcpHealthCheckProvider**: TCP health check implementation
32//! - **DMSCHealthChecker**: Main health checking service
33//! - **DMSCHealthStatus**: Health status enum
34//! - **DMSCHealthSummary**: Summary of health check results
35//! 
36//! ## Design Principles
37//! 
38//! 1. **Protocol Agnostic**: Supports multiple health check protocols (HTTP, TCP, gRPC, custom)
39//! 2. **Async-First**: All health check operations are asynchronous
40//! 3. **Extensible**: Easy to implement new health check providers
41//! 4. **Configurable**: Highly configurable health check parameters
42//! 5. **Real-time Monitoring**: Background tasks for continuous health monitoring
43//! 6. **Comprehensive Results**: Detailed health check results with response times and error messages
44//! 7. **Health Summary**: Aggregated health status with success rates and average response times
45//! 8. **Thread-safe**: Uses Arc and RwLock for safe concurrent access
46//! 9. **Graceful Shutdown**: Proper cleanup of background tasks
47//! 10. **Error Handling**: Comprehensive error handling with DMSCResult
48//! 
49//! ## Usage
50//! 
51//! ```rust
52//! use dmsc::prelude::*;
53//! use std::time::Duration;
54//! 
55//! async fn example() -> DMSCResult<()> {
56//!     // Create a health checker with 30-second intervals
57//!     let health_checker = DMSCHealthChecker::new(Duration::from_secs(30));
58//!     
59//!     // Register a health check for a service
60//!     let config = DMSCHealthCheckConfig {
61//!         endpoint: "/health".to_string(),
62//!         method: "GET".to_string(),
63//!         timeout: Duration::from_secs(5),
64//!         expected_status_code: 200,
65//!         expected_response_body: None,
66//!         headers: HashMap::new(),
67//!     };
68//!     
69//!     health_checker.register_health_check(
70//!         "example-service",
71//!         "http://localhost:8080",
72//!         DMSCHealthCheckType::Http,
73//!         config
74//!     ).await?;
75//!     
76//!     // Start background health checks
77//!     health_checker.start_health_check("example-service", "http://localhost:8080").await?;
78//!     
79//!     // Get health summary
80//!     let summary = health_checker.get_service_health_summary("example-service").await?;
81//!     println!("Service health: {:?}", summary.overall_status);
82//!     println!("Success rate: {:.2}%", summary.success_rate);
83//!     
84//!     Ok(())
85//! }
86//! ```
87
88use async_trait::async_trait;
89use serde::{Deserialize, Serialize};
90use std::collections::HashMap;
91use std::sync::Arc;
92use std::time::{Duration, SystemTime};
93use tokio::sync::RwLock;
94use tokio::task::JoinHandle;
95
96#[cfg(feature = "pyo3")]
97use pyo3::PyResult;
98#[cfg(feature = "service_mesh")]
99use hyper;
100
101use crate::core::{DMSCResult, DMSCError};
102use crate::observability::{DMSCTracer, DMSCSpanKind, DMSCSpanStatus};
103
104/// Configuration for health checks.
105///
106/// This struct defines the parameters for performing health checks, including
107/// endpoint, HTTP method, timeout, expected status code, and custom headers.
108#[derive(Debug, Clone, Serialize, Deserialize)]
109pub struct DMSCHealthCheckConfig {
110    /// Health check endpoint path
111    pub endpoint: String,
112    /// HTTP method to use for health checks
113    pub method: String,
114    /// Timeout for health check requests
115    pub timeout: Duration,
116    /// Expected HTTP status code for a healthy service
117    pub expected_status_code: u16,
118    /// Optional expected response body for validation
119    pub expected_response_body: Option<String>,
120    /// Custom headers to include in health check requests
121    pub headers: HashMap<String, String>,
122}
123
124impl Default for DMSCHealthCheckConfig {
125    /// Creates a default health check configuration.
126    ///
127    /// # Returns
128    ///
129    /// A `DMSCHealthCheckConfig` instance with default values
130    fn default() -> Self {
131        Self {
132            endpoint: "/health".to_string(),
133            method: "GET".to_string(),
134            timeout: Duration::from_secs(5),
135            expected_status_code: 200,
136            expected_response_body: None,
137            headers: HashMap::new(),
138        }
139    }
140}
141
/// Outcome of one health probe.
///
/// Records which service and endpoint were probed, whether the probe counted
/// as healthy, how long it took, and any error that was observed.
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
#[derive(Clone, Debug)]
pub struct DMSCHealthCheckResult {
    /// Name of the probed service.
    pub service_name: String,
    /// Endpoint the probe was sent to.
    pub endpoint: String,
    /// `true` when the probe considered the service healthy.
    pub is_healthy: bool,
    /// HTTP status code of the response, when the probe produced one.
    pub status_code: Option<u16>,
    /// Wall time the probe took.
    pub response_time: Duration,
    /// Description of the failure, if the probe did not succeed.
    pub error_message: Option<String>,
    /// Moment at which the result was recorded.
    pub timestamp: SystemTime,
}
164
/// Python-facing accessors, compiled only when the `pyo3` feature is enabled.
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthCheckResult {
    /// Returns a copy of the probed service's name.
    fn get_service_name(&self) -> String {
        self.service_name.to_owned()
    }

    /// Returns a copy of the endpoint the probe was sent to.
    fn get_endpoint(&self) -> String {
        self.endpoint.to_owned()
    }

    /// Returns whether the probe counted as healthy.
    fn get_is_healthy(&self) -> bool {
        self.is_healthy
    }

    /// Returns the HTTP status code, if the probe produced one.
    fn get_status_code(&self) -> Option<u16> {
        self.status_code
    }

    /// Returns the probe duration in whole milliseconds.
    fn get_response_time_ms(&self) -> u64 {
        let millis = self.response_time.as_millis();
        millis as u64
    }

    /// Returns a copy of the failure description, if any.
    fn get_error_message(&self) -> Option<String> {
        self.error_message.as_ref().cloned()
    }
}
192
193/// Types of health checks supported.
194///
195/// This enum defines the different protocols that can be used for health checking.
196#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
197#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq, Hash)]
198pub enum DMSCHealthCheckType {
199    /// HTTP health check
200    Http,
201    /// TCP health check
202    Tcp,
203    /// gRPC health check
204    Grpc,
205    /// Custom health check implementation
206    Custom,
207}
208
209/// Trait for implementing health check providers.
210///
211/// This trait defines the interface for health check providers, allowing for
212/// different health check implementations based on protocol.
213#[async_trait]
214pub trait DMSCHealthCheckProvider: Send + Sync {
215    /// Performs a health check on the specified endpoint.
216    ///
217    /// # Parameters
218    ///
219    /// - `endpoint`: The endpoint to check
220    /// - `config`: Health check configuration
221    ///
222    /// # Returns
223    ///
224    /// A `DMSCResult<DMSCHealthCheckResult>` containing the health check result
225    async fn check_health(&self, endpoint: &str, config: &DMSCHealthCheckConfig) -> DMSCResult<DMSCHealthCheckResult>;
226}
227
228/// HTTP health check provider.
229///
230/// This struct implements the `DMSCHealthCheckProvider` trait for HTTP health checks.
231pub struct DMSCHttpHealthCheckProvider;
232
233#[async_trait]
234impl DMSCHealthCheckProvider for DMSCHttpHealthCheckProvider {
235    /// Performs an HTTP health check on the specified endpoint.
236    ///
237    /// # Parameters
238    ///
239    /// - `endpoint`: The HTTP endpoint to check
240    /// - `config`: Health check configuration
241    ///
242    /// # Returns
243    ///
244    /// A `DMSCResult<DMSCHealthCheckResult>` containing the health check result
245    #[cfg(feature = "service_mesh")]
246    async fn check_health(&self, endpoint: &str, _config: &DMSCHealthCheckConfig) -> DMSCResult<DMSCHealthCheckResult> {
247        let start_time = SystemTime::now();
248        
249        let client = hyper::Client::new();
250
251        let uri: hyper::Uri = endpoint.parse()
252            .map_err(|e| DMSCError::ServiceMesh(format!("Invalid URI: {e}")))?;
253
254        let req = hyper::Request::builder()
255            .method(_config.method.as_str())
256            .uri(uri)
257            .body(hyper::body::Body::empty())
258            .map_err(|e| DMSCError::ServiceMesh(format!("Failed to build request: {e}")))?;
259
260        match client.request(req).await {
261            Ok(response) => {
262                let status_code = response.status().as_u16();
263                let is_healthy = status_code == _config.expected_status_code;
264                let response_time = SystemTime::now().duration_since(start_time)
265                    .unwrap_or(Duration::from_secs(0));
266
267                let error_message = if !is_healthy {
268                    Some(format!("Expected status code {}, got {}", _config.expected_status_code, status_code))
269                } else {
270                    None
271                };
272
273                Ok(DMSCHealthCheckResult {
274                    service_name: "unknown".to_string(),
275                    endpoint: endpoint.to_string(),
276                    is_healthy,
277                    status_code: Some(status_code),
278                    response_time,
279                    error_message,
280                    timestamp: SystemTime::now(),
281                })
282            }
283            Err(e) => {
284                let response_time = SystemTime::now().duration_since(start_time)
285                    .unwrap_or(Duration::from_secs(0));
286
287                Ok(DMSCHealthCheckResult {
288                    service_name: "unknown".to_string(),
289                    endpoint: endpoint.to_string(),
290                    is_healthy: false,
291                    status_code: None,
292                    response_time,
293                    error_message: Some(e.to_string()),
294                    timestamp: SystemTime::now(),
295                })
296            }
297        }
298    }
299    
300    #[cfg(not(feature = "service_mesh"))]
301    async fn check_health(&self, endpoint: &str, _config: &DMSCHealthCheckConfig) -> DMSCResult<DMSCHealthCheckResult> {
302        // If service_mesh feature is not enabled, assume all endpoints are healthy
303        Ok(DMSCHealthCheckResult {
304            service_name: "unknown".to_string(),
305            endpoint: endpoint.to_string(),
306            is_healthy: true,
307            status_code: Some(_config.expected_status_code),
308            response_time: Duration::from_secs(0),
309            error_message: None,
310            timestamp: SystemTime::now(),
311        })
312    }
313}
314
315/// TCP health check provider.
316///
317/// This struct implements the `DMSCHealthCheckProvider` trait for TCP health checks.
318pub struct DMSCTcpHealthCheckProvider;
319
320#[async_trait]
321impl DMSCHealthCheckProvider for DMSCTcpHealthCheckProvider {
322    /// Performs a TCP health check on the specified endpoint.
323    ///
324    /// # Parameters
325    ///
326    /// - `endpoint`: The TCP endpoint to check (format: "host:port")
327    /// - `config`: Health check configuration
328    ///
329    /// # Returns
330    ///
331    /// A `DMSCResult<DMSCHealthCheckResult>` containing the health check result
332    async fn check_health(&self, endpoint: &str, _config: &DMSCHealthCheckConfig) -> DMSCResult<DMSCHealthCheckResult> {
333        let start_time = SystemTime::now();
334        
335        match tokio::net::TcpStream::connect(endpoint).await {
336            Ok(_) => {
337                let response_time = SystemTime::now().duration_since(start_time)
338                    .unwrap_or(Duration::from_secs(0));
339
340                Ok(DMSCHealthCheckResult {
341                    service_name: "unknown".to_string(),
342                    endpoint: endpoint.to_string(),
343                    is_healthy: true,
344                    status_code: None,
345                    response_time,
346                    error_message: None,
347                    timestamp: SystemTime::now(),
348                })
349            }
350            Err(e) => {
351                let response_time = SystemTime::now().duration_since(start_time)
352                    .unwrap_or(Duration::from_secs(0));
353
354                Ok(DMSCHealthCheckResult {
355                    service_name: "unknown".to_string(),
356                    endpoint: endpoint.to_string(),
357                    is_healthy: false,
358                    status_code: None,
359                    response_time,
360                    error_message: Some(e.to_string()),
361                    timestamp: SystemTime::now(),
362                })
363            }
364        }
365    }
366}
367
368/// gRPC health check provider.
369///
370/// This struct implements the `DMSCHealthCheckProvider` trait for gRPC health checks.
371pub struct DMSCGrpcHealthCheckProvider;
372
373#[async_trait]
374impl DMSCHealthCheckProvider for DMSCGrpcHealthCheckProvider {
375    /// Performs a gRPC health check on the specified endpoint.
376    ///
377    /// # Parameters
378    ///
379    /// - `endpoint`: The gRPC endpoint to check (format: "host:port")
380    /// - `config`: Health check configuration
381    ///
382    /// # Returns
383    ///
384    /// A `DMSCResult<DMSCHealthCheckResult>` containing the health check result
385    async fn check_health(&self, endpoint: &str, _config: &DMSCHealthCheckConfig) -> DMSCResult<DMSCHealthCheckResult> {
386        let start_time = SystemTime::now();
387        
388        // Simple gRPC health check implementation using TCP connection
389        // In a full implementation, this would use the gRPC health check service
390        match tokio::net::TcpStream::connect(endpoint).await {
391            Ok(_) => {
392                let response_time = SystemTime::now().duration_since(start_time)
393                    .unwrap_or(Duration::from_secs(0));
394
395                Ok(DMSCHealthCheckResult {
396                    service_name: "unknown".to_string(),
397                    endpoint: endpoint.to_string(),
398                    is_healthy: true,
399                    status_code: None,
400                    response_time,
401                    error_message: None,
402                    timestamp: SystemTime::now(),
403                })
404            }
405            Err(e) => {
406                let response_time = SystemTime::now().duration_since(start_time)
407                    .unwrap_or(Duration::from_secs(0));
408
409                Ok(DMSCHealthCheckResult {
410                    service_name: "unknown".to_string(),
411                    endpoint: endpoint.to_string(),
412                    is_healthy: false,
413                    status_code: None,
414                    response_time,
415                    error_message: Some(e.to_string()),
416                    timestamp: SystemTime::now(),
417                })
418            }
419        }
420    }
421}
422
423/// Main health checker service.
424///
425/// This struct provides the core functionality for managing health checks, including
426/// registering health checks, starting background monitoring, and retrieving health status.
427#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
428pub struct DMSCHealthChecker {
429    check_interval: Duration,
430    providers: Arc<RwLock<HashMap<DMSCHealthCheckType, Box<dyn DMSCHealthCheckProvider>>>>,
431    check_results: Arc<RwLock<HashMap<String, Vec<DMSCHealthCheckResult>>>>,
432    background_tasks: Arc<RwLock<Vec<JoinHandle<()>>>>,
433    tracer: Option<Arc<DMSCTracer>>,
434}
435
436impl DMSCHealthChecker {
437    pub fn new(check_interval: Duration) -> Self {
438        let mut providers: HashMap<DMSCHealthCheckType, Box<dyn DMSCHealthCheckProvider>> = HashMap::new();
439        providers.insert(DMSCHealthCheckType::Http, Box::new(DMSCHttpHealthCheckProvider));
440        providers.insert(DMSCHealthCheckType::Tcp, Box::new(DMSCTcpHealthCheckProvider));
441        providers.insert(DMSCHealthCheckType::Grpc, Box::new(DMSCGrpcHealthCheckProvider));
442
443        Self {
444            check_interval,
445            providers: Arc::new(RwLock::new(providers)),
446            check_results: Arc::new(RwLock::new(HashMap::new())),
447            background_tasks: Arc::new(RwLock::new(Vec::new())),
448            tracer: None,
449        }
450    }
451    
452    pub fn with_tracer(mut self, tracer: Arc<DMSCTracer>) -> Self {
453        self.tracer = Some(tracer);
454        self
455    }
456    
457    pub fn set_tracer(&mut self, tracer: Arc<DMSCTracer>) {
458        self.tracer = Some(tracer);
459    }
460    
461
462
463    /// Registers a health check for a service.
464    ///
465    /// This method registers a health check for a service and performs an immediate check.
466    ///
467    /// # Parameters
468    ///
469    /// - `service_name`: Name of the service to check
470    /// - `endpoint`: Endpoint URL for health checks
471    /// - `check_type`: Type of health check to perform
472    /// - `config`: Health check configuration
473    ///
474    /// # Returns
475    ///
476    /// A `DMSCResult<()>` indicating success or failure
477    pub async fn register_health_check(
478        &self,
479        service_name: &str,
480        endpoint: &str,
481        check_type: DMSCHealthCheckType,
482        config: DMSCHealthCheckConfig,
483    ) -> DMSCResult<()> {
484        let span_id = if let Some(tracer) = &self.tracer {
485            let span_id = tracer.start_span_from_context(
486                format!("health_check:{}", service_name),
487                DMSCSpanKind::Internal,
488            );
489            if let Some(ref sid) = span_id {
490                let _ = tracer.span_mut(sid, |span| {
491                    span.set_attribute("service_name".to_string(), service_name.to_string());
492                    span.set_attribute("endpoint".to_string(), endpoint.to_string());
493                    span.set_attribute("check_type".to_string(), format!("{:?}", check_type));
494                });
495            }
496            span_id
497        } else {
498            None
499        };
500
501        let result = self.register_health_check_internal(service_name, endpoint, check_type, config).await;
502
503        if let (Some(tracer), Some(sid)) = (&self.tracer, span_id) {
504            let status = match &result {
505                Ok(_) => DMSCSpanStatus::Ok,
506                Err(e) => DMSCSpanStatus::Error(e.to_string()),
507            };
508            let _ = tracer.end_span(&sid, status);
509        }
510
511        result
512    }
513    
514    async fn register_health_check_internal(
515        &self,
516        service_name: &str,
517        endpoint: &str,
518        check_type: DMSCHealthCheckType,
519        config: DMSCHealthCheckConfig,
520    ) -> DMSCResult<()> {
521        let providers = self.providers.read().await;
522        let provider = providers.get(&check_type)
523            .ok_or_else(|| DMSCError::ServiceMesh(format!("Health check provider for {check_type:?} not found")))?;
524
525        let result = provider.check_health(endpoint, &config).await?;
526        
527        let mut check_results = self.check_results.write().await;
528        let service_results = check_results.entry(service_name.to_string())
529            .or_insert_with(Vec::new);
530        service_results.push(result);
531
532        Ok(())
533    }
534
535    /// Starts background health checks for a service.
536    /// 
537    /// This method creates a background task that periodically checks the health of a service.
538    /// 
539    /// # Parameters
540    /// 
541    /// - `service_name`: Name of the service to check
542    /// - `endpoint`: Endpoint URL for health checks
543    /// 
544    /// # Returns
545    /// 
546    /// A `DMSCResult<()>` indicating success or failure
547    pub async fn start_health_check(&self, service_name: &str, endpoint: &str) -> DMSCResult<()> {
548        let mut tasks = self.background_tasks.write().await;
549        
550        let service_name_clone = service_name.to_string();
551        let endpoint_clone = endpoint.to_string();
552        let check_interval = self.check_interval;
553        let providers = Arc::clone(&self.providers);
554        let check_results = Arc::clone(&self.check_results);
555
556        // Determine health check type based on endpoint URL scheme
557        let check_type = if endpoint.starts_with("grpc://") || endpoint.starts_with("grpcs://") {
558            DMSCHealthCheckType::Grpc
559        } else if endpoint.starts_with("http://") || endpoint.starts_with("https://") {
560            DMSCHealthCheckType::Http
561        } else {
562            // Assume TCP for other protocols
563            DMSCHealthCheckType::Tcp
564        };
565
566        let task = tokio::spawn(async move {
567            let mut interval = tokio::time::interval(check_interval);
568            let config = DMSCHealthCheckConfig::default();
569            
570            loop {
571                interval.tick().await;
572                
573                let providers_guard = providers.read().await;
574                if let Some(provider) = providers_guard.get(&check_type) {
575                    match provider.check_health(&endpoint_clone, &config).await {
576                        Ok(result) => {
577                            let mut results = check_results.write().await;
578                            let service_results = results.entry(service_name_clone.clone())
579                                .or_insert_with(Vec::new);
580                            
581                            // Add new result to the end
582                            service_results.push(result);
583                            
584                            // Keep only the most recent 100 results per service to avoid memory issues
585                            if service_results.len() > 100 {
586                                service_results.drain(0..service_results.len() - 100);
587                            }
588                        }
589                        Err(e) => {
590                            log::warn!("Health check failed for {endpoint_clone}: {e}");
591                        }
592                    }
593                }
594            }
595        });
596
597        tasks.push(task);
598        Ok(())
599    }
600    
601    /// Stops health checks for a specific service endpoint.
602    /// 
603    /// This method clears the health check results for the specified service.
604    /// The background task will continue running but will no longer record results.
605    /// 
606    /// # Parameters
607    /// 
608    /// - `service_name`: Name of the service
609    /// - `endpoint`: Endpoint URL
610    /// 
611    /// # Returns
612    /// 
613    /// A `DMSCResult<()>` indicating success or failure
614    pub async fn stop_health_check(&self, service_name: &str, _endpoint: &str) -> DMSCResult<()> {
615        let mut results = self.check_results.write().await;
616        results.remove(service_name);
617        Ok(())
618    }
619    
620    /// Starts background health checks for a service with a specific health check type.
621    /// 
622    /// This method creates a background task that periodically checks the health of a service
623    /// using the specified health check type.
624    /// 
625    /// # Parameters
626    /// 
627    /// - `service_name`: Name of the service to check
628    /// - `endpoint`: Endpoint URL for health checks
629    /// - `check_type`: Type of health check to perform
630    /// 
631    /// # Returns
632    /// 
633    /// A `DMSCResult<()>` indicating success or failure
634    pub async fn start_health_check_with_type(
635        &self, 
636        service_name: &str, 
637        endpoint: &str,
638        check_type: DMSCHealthCheckType
639    ) -> DMSCResult<()> {
640        let mut tasks = self.background_tasks.write().await;
641        
642        let service_name_clone = service_name.to_string();
643        let endpoint_clone = endpoint.to_string();
644        let check_interval = self.check_interval;
645        let providers = Arc::clone(&self.providers);
646        let check_results = Arc::clone(&self.check_results);
647        let check_type_clone = check_type;
648
649        let task = tokio::spawn(async move {
650            let mut interval = tokio::time::interval(check_interval);
651            let config = DMSCHealthCheckConfig::default();
652            
653            loop {
654                interval.tick().await;
655                
656                let providers_guard = providers.read().await;
657                if let Some(provider) = providers_guard.get(&check_type_clone) {
658                    match provider.check_health(&endpoint_clone, &config).await {
659                        Ok(result) => {
660                            let mut results = check_results.write().await;
661                            let service_results = results.entry(service_name_clone.clone())
662                                .or_insert_with(Vec::new);
663                            
664                            // Add new result to the end
665                            service_results.push(result);
666                            
667                            // Keep only the most recent 100 results per service to avoid memory issues
668                            if service_results.len() > 100 {
669                                service_results.drain(0..service_results.len() - 100);
670                            }
671                        }
672                        Err(e) => {
673                            log::warn!("Health check failed for {endpoint_clone}: {e}");
674                        }
675                    }
676                }
677            }
678        });
679
680        tasks.push(task);
681        Ok(())
682    }
683
684    /// Gets the health check results for a service.
685    ///
686    /// # Parameters
687    ///
688    /// - `service_name`: Name of the service to get results for
689    ///
690    /// # Returns
691    ///
692    /// A `DMSCResult<Vec<DMSCHealthCheckResult>>` containing the health check results
693    pub async fn get_health_status(&self, service_name: &str) -> DMSCResult<Vec<DMSCHealthCheckResult>> {
694        let check_results = self.check_results.read().await;
695        let results = check_results.get(service_name)
696            .cloned()
697            .unwrap_or_default();
698
699        Ok(results)
700    }
701    
702    /// Gets the latest health check result for a service.
703    ///
704    /// # Parameters
705    ///
706    /// - `service_name`: Name of the service to get the latest result for
707    ///
708    /// # Returns
709    ///
710    /// A `DMSCResult<Option<DMSCHealthCheckResult>>` containing the latest health check result if available
711    pub async fn get_latest_health_status(&self, service_name: &str) -> DMSCResult<Option<DMSCHealthCheckResult>> {
712        let check_results = self.check_results.read().await;
713        let latest_result = check_results.get(service_name)
714            .and_then(|results| results.last().cloned());
715
716        Ok(latest_result)
717    }
718    
719    /// Gets the health check results for a service within a specified time window.
720    ///
721    /// # Parameters
722    ///
723    /// - `service_name`: Name of the service to get results for
724    /// - `time_window`: Time window to filter results by
725    ///
726    /// # Returns
727    ///
728    /// A `DMSCResult<Vec<DMSCHealthCheckResult>>` containing the filtered health check results
729    pub async fn get_health_status_within(&self, service_name: &str, time_window: Duration) -> DMSCResult<Vec<DMSCHealthCheckResult>> {
730        let check_results = self.check_results.read().await;
731        let now = SystemTime::now();
732        
733        let results = check_results.get(service_name)
734            .map(|results| {
735                results.iter()
736                    .filter(|r| {
737                        if let Ok(elapsed) = now.duration_since(r.timestamp) {
738                            elapsed <= time_window
739                        } else {
740                            false
741                        }
742                    })
743                    .cloned()
744                    .collect()
745            })
746            .unwrap_or_default();
747
748        Ok(results)
749    }
750
751    /// Gets a health summary for a service.
752    ///
753    /// This method aggregates health check results to provide a summary of the service's health,
754    /// including success rate, average response time, and overall status.
755    ///
756    /// # Parameters
757    ///
758    /// - `service_name`: Name of the service to get a summary for
759    ///
760    /// # Returns
761    ///
762    /// A `DMSCResult<DMSCHealthSummary>` containing the health summary
763    pub async fn get_service_health_summary(&self, service_name: &str) -> DMSCResult<DMSCHealthSummary> {
764        let results = self.get_health_status(service_name).await?;
765        
766        if results.is_empty() {
767            return Ok(DMSCHealthSummary {
768                service_name: service_name.to_string(),
769                total_checks: 0,
770                healthy_checks: 0,
771                unhealthy_checks: 0,
772                success_rate: 0.0,
773                average_response_time: Duration::from_secs(0),
774                last_check_time: None,
775                overall_status: DMSCHealthStatus::Unknown,
776            });
777        }
778
779        let total_checks = results.len();
780        let healthy_checks = results.iter().filter(|r| r.is_healthy).count();
781        let unhealthy_checks = total_checks - healthy_checks;
782        let success_rate = (healthy_checks as f64) / (total_checks as f64) * 100.0;
783
784        let total_response_time: Duration = results.iter()
785            .map(|r| r.response_time)
786            .sum();
787        let average_response_time = total_response_time / total_checks as u32;
788
789        let last_check_time = results.last().map(|r| r.timestamp);
790
791        let overall_status = if success_rate >= 80.0 {
792            DMSCHealthStatus::Healthy
793        } else if success_rate >= 50.0 {
794            DMSCHealthStatus::Degraded
795        } else {
796            DMSCHealthStatus::Unhealthy
797        };
798
799        Ok(DMSCHealthSummary {
800            service_name: service_name.to_string(),
801            total_checks,
802            healthy_checks,
803            unhealthy_checks,
804            success_rate,
805            average_response_time,
806            last_check_time,
807            overall_status,
808        })
809    }
810
811    /// Starts background health check tasks.
812    ///
813    /// This method initializes and starts all background health monitoring tasks,
814    /// including periodic health checks for registered services and cleanup tasks.
815    ///
816    /// # Returns
817    ///
818    /// A `DMSCResult<()>` indicating success or failure
819    pub async fn start_background_tasks(&self) -> DMSCResult<()> {
820        // Start periodic cleanup task to remove old health check results
821        let check_results = Arc::clone(&self.check_results);
822        let cleanup_interval = self.check_interval * 10; // Cleanup every 10 check intervals
823        
824        let cleanup_task = tokio::spawn(async move {
825            let mut interval = tokio::time::interval(cleanup_interval);
826            
827            loop {
828                interval.tick().await;
829                
830                let mut results = check_results.write().await;
831                let now = SystemTime::now();
832                let max_age = Duration::from_secs(3600); // Keep results for 1 hour
833                
834                // Remove health check results older than max_age
835                for service_results in results.values_mut() {
836                    service_results.retain(|result| {
837                        now.duration_since(result.timestamp)
838                            .map(|age| age < max_age)
839                            .unwrap_or(false)
840                    });
841                }
842                
843                // Remove services with no recent results
844                results.retain(|_, results| !results.is_empty());
845            }
846        });
847        
848        // Store cleanup task
849        let mut tasks = self.background_tasks.write().await;
850        tasks.push(cleanup_task);
851        
852        log::info!("Background health check tasks started successfully");
853        Ok(())
854    }
855
856    /// Stops all background health check tasks.
857    ///
858    /// This method aborts all running background health check tasks and cleans up resources.
859    ///
860    /// # Returns
861    ///
862    /// A `DMSCResult<()>` indicating success or failure
863    pub async fn stop_background_tasks(&self) -> DMSCResult<()> {
864        let mut tasks = self.background_tasks.write().await;
865        for task in tasks.drain(..) {
866            task.abort();
867        }
868        Ok(())
869    }
870
871    /// Performs a health check on the health checker itself.
872    ///
873    /// # Returns
874    ///
875    /// A `DMSCResult<bool>` indicating whether the health checker is healthy
876    pub async fn health_check(&self) -> DMSCResult<bool> {
877        Ok(true)
878    }
879}
880
/// Returns the Tokio runtime shared by the Python-facing `DMSCHealthChecker` methods.
///
/// The runtime is created on first use and then reused for every later call,
/// instead of building and tearing down a whole runtime per Python call.
/// NOTE(review): a shared runtime also keeps alive any task spawned while a
/// binding call is running; with a per-call runtime such tasks would be
/// cancelled when that runtime is dropped. Confirm whether the wrapped async
/// methods spawn tasks.
///
/// # Errors
///
/// Returns a Python `RuntimeError` if the runtime cannot be created.
#[cfg(feature = "pyo3")]
fn dmsc_health_checker_py_runtime() -> PyResult<&'static tokio::runtime::Runtime> {
    static RUNTIME: std::sync::OnceLock<tokio::runtime::Runtime> = std::sync::OnceLock::new();

    if let Some(runtime) = RUNTIME.get() {
        return Ok(runtime);
    }

    // Build the runtime outside `get_or_init` so a creation failure can be
    // reported to Python instead of panicking inside the initializer.
    let runtime = tokio::runtime::Runtime::new().map_err(|e| {
        pyo3::exceptions::PyRuntimeError::new_err(format!("Failed to create runtime: {}", e))
    })?;

    // If another thread initialized the cell first, its runtime is kept and
    // the one built here is simply dropped.
    Ok(RUNTIME.get_or_init(move || runtime))
}

#[cfg(feature = "pyo3")]
/// Python bindings for DMSCHealthChecker
#[pyo3::prelude::pymethods]
impl DMSCHealthChecker {
    /// Creates a health checker from Python; `check_interval` is in seconds.
    #[new]
    fn py_new(check_interval: u64) -> PyResult<Self> {
        Ok(Self::new(Duration::from_secs(check_interval)))
    }

    /// Get service health summary from Python
    ///
    /// Blocks the calling thread until the async call completes; failures are
    /// raised as a Python `RuntimeError`.
    #[pyo3(name = "get_service_health_summary")]
    fn get_service_health_summary_impl(&self, service_name: String) -> PyResult<DMSCHealthSummary> {
        dmsc_health_checker_py_runtime()?.block_on(async {
            self.get_service_health_summary(&service_name)
                .await
                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Failed to get health summary: {e}")))
        })
    }

    /// Start health check from Python
    ///
    /// Blocks the calling thread until the async call completes; failures are
    /// raised as a Python `RuntimeError`.
    #[pyo3(name = "start_health_check")]
    fn start_health_check_impl(&self, service_name: String, endpoint: String) -> PyResult<()> {
        dmsc_health_checker_py_runtime()?.block_on(async {
            self.start_health_check(&service_name, &endpoint)
                .await
                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Failed to start health check: {e}")))
        })
    }

    /// Stop health check from Python
    ///
    /// Blocks the calling thread until the async call completes; failures are
    /// raised as a Python `RuntimeError`.
    #[pyo3(name = "stop_health_check")]
    fn stop_health_check_impl(&self, service_name: String, endpoint: String) -> PyResult<()> {
        dmsc_health_checker_py_runtime()?.block_on(async {
            self.stop_health_check(&service_name, &endpoint)
                .await
                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Failed to stop health check: {e}")))
        })
    }

    /// Get health status from Python
    ///
    /// Blocks the calling thread until the async call completes; failures are
    /// raised as a Python `RuntimeError`.
    #[pyo3(name = "get_health_status")]
    fn get_health_status_impl(&self, service_name: String) -> PyResult<Vec<DMSCHealthCheckResult>> {
        dmsc_health_checker_py_runtime()?.block_on(async {
            self.get_health_status(&service_name)
                .await
                .map_err(|e| pyo3::exceptions::PyRuntimeError::new_err(format!("Failed to get health status: {e}")))
        })
    }
}
946
/// Health status enum.
///
/// This enum represents the overall health status of a service. Values can be
/// compared with `==` / `!=`, which lets callers and tests check a status
/// without pattern matching.
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum DMSCHealthStatus {
    /// Service is healthy
    Healthy,
    /// Service is degraded but still functional
    Degraded,
    /// Service is unhealthy
    Unhealthy,
    /// Health status is unknown
    Unknown,
}
962
/// Summary of health check results.
///
/// This struct provides an aggregated view of a service's health, including
/// total checks, success rate, average response time, and overall status.
/// The field notes below describe the values as populated by
/// `DMSCHealthChecker::get_service_health_summary`.
#[cfg_attr(feature = "pyo3", pyo3::prelude::pyclass)]
#[derive(Debug, Clone)]
pub struct DMSCHealthSummary {
    /// Name of the service the summary describes
    pub service_name: String,
    /// Total number of health check results that were aggregated
    pub total_checks: usize,
    /// Number of successful health checks
    pub healthy_checks: usize,
    /// Number of failed health checks (`total_checks - healthy_checks`)
    pub unhealthy_checks: usize,
    /// Success rate percentage (0.0 to 100.0); 0.0 when there are no results
    pub success_rate: f64,
    /// Average response time for health checks; zero when there are no results
    pub average_response_time: Duration,
    /// Timestamp of the last health check, or `None` when there are no results
    pub last_check_time: Option<SystemTime>,
    /// Overall health status; `Unknown` when there are no results, otherwise
    /// derived from `success_rate`
    pub overall_status: DMSCHealthStatus,
}
987
#[cfg(feature = "pyo3")]
#[pyo3::prelude::pymethods]
impl DMSCHealthSummary {
    /// Returns a copy of the service name.
    fn get_service_name(&self) -> String {
        String::from(self.service_name.as_str())
    }

    /// Returns the total number of aggregated checks.
    fn get_total_checks(&self) -> usize {
        let Self { total_checks, .. } = self;
        *total_checks
    }

    /// Returns the number of successful checks.
    fn get_healthy_checks(&self) -> usize {
        let Self { healthy_checks, .. } = self;
        *healthy_checks
    }

    /// Returns the number of failed checks.
    fn get_unhealthy_checks(&self) -> usize {
        let Self { unhealthy_checks, .. } = self;
        *unhealthy_checks
    }

    /// Returns the success rate as a percentage.
    fn get_success_rate(&self) -> f64 {
        let Self { success_rate, .. } = self;
        *success_rate
    }

    /// Returns the average response time in whole milliseconds.
    fn get_average_response_time_ms(&self) -> u64 {
        let millis = self.average_response_time.as_millis();
        millis as u64
    }

    /// Returns the overall status as its variant name.
    fn get_overall_status(&self) -> String {
        let label = match self.overall_status {
            DMSCHealthStatus::Healthy => "Healthy",
            DMSCHealthStatus::Degraded => "Degraded",
            DMSCHealthStatus::Unhealthy => "Unhealthy",
            DMSCHealthStatus::Unknown => "Unknown",
        };
        label.to_string()
    }
}