"""Comprehensive health monitoring tools.

This module provides tools for comprehensive health checks of the Unraid MCP
server and the underlying Unraid system, including performance metrics, system
status, notifications, Docker services, and API responsiveness.
"""

import datetime
import time
from typing import Any, Dict

from fastmcp import FastMCP

from ..config.logging import logger
from ..config.settings import (
    UNRAID_API_URL,
    UNRAID_MCP_HOST,
    UNRAID_MCP_PORT,
    UNRAID_MCP_TRANSPORT,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError

# Severity ranking used to make sure a later, milder finding never downgrades
# a status that an earlier check already escalated.
_STATUS_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}

# Array states that are considered normal operation.
_HEALTHY_ARRAY_STATES = ("STARTED", "STOPPED")

# API latency thresholds, in milliseconds.
_HIGH_LATENCY_MS = 5000
_VERY_HIGH_LATENCY_MS = 10000

# Single round-trip query covering every subsystem the health check inspects.
_COMPREHENSIVE_QUERY = """
query ComprehensiveHealthCheck {
  info {
    machineId
    time
    versions { unraid }
    os { uptime }
  }
  array {
    state
  }
  notifications {
    overview {
      unread { alert warning total }
    }
  }
  docker {
    containers(skipCache: true) {
      id
      state
      status
    }
  }
}
"""


def _worst_status(current: str, candidate: str) -> str:
    """Return whichever of the two statuses is more severe."""
    if _STATUS_SEVERITY[candidate] > _STATUS_SEVERITY[current]:
        return candidate
    return current


def _utc_timestamp() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Uses an aware ``now()`` (``utcnow()`` is deprecated since Python 3.12) and
    strips the tzinfo so the emitted format stays identical to what existing
    consumers already parse.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(tzinfo=None).isoformat()


def _elapsed_ms(started: float) -> float:
    """Return milliseconds elapsed since a ``time.monotonic()`` reading."""
    return round((time.monotonic() - started) * 1000, 2)


def _server_info() -> Dict[str, Any]:
    """Return the static description of this MCP server."""
    return {
        "name": "Unraid MCP Server",
        "version": "0.1.0",
        "transport": UNRAID_MCP_TRANSPORT,
        "host": UNRAID_MCP_HOST,
        "port": UNRAID_MCP_PORT,
    }


def register_health_tools(mcp: FastMCP) -> None:
    """Register all health tools with the FastMCP instance.

    Args:
        mcp: FastMCP instance to register tools with
    """
    # Registration happens during server start-up, so this is the closest
    # proxy for process start that is available inside this module.
    registered_at = time.monotonic()

    @mcp.tool()
    async def health_check() -> Dict[str, Any]:
        """Returns comprehensive health status of the Unraid MCP server and system for monitoring purposes."""
        # NOTE: the docstring above is kept verbatim because it may be
        # surfaced to clients as the tool description.
        #
        # The result always contains "status", "timestamp", "api_latency_ms"
        # and "server"; per-subsystem sections, "issues" and "performance" are
        # added when available. Failures are reported in the payload rather
        # than raised, so monitoring callers always get a structured answer.
        start_time = time.monotonic()
        health_status = "healthy"
        issues = []

        def report(severity: str, message: str) -> None:
            """Record an issue and escalate (never downgrade) the status."""
            nonlocal health_status
            health_status = _worst_status(health_status, severity)
            issues.append(message)

        try:
            response_data = await make_graphql_request(_COMPREHENSIVE_QUERY)
            api_latency = _elapsed_ms(start_time)

            # Base health info
            server = _server_info()
            server["process_uptime_seconds"] = time.monotonic() - registered_at
            health_info = {
                "status": health_status,
                "timestamp": _utc_timestamp(),
                "api_latency_ms": api_latency,
                "server": server,
            }

            if not response_data:
                report("unhealthy", "No response from Unraid API")
                health_info["status"] = health_status
                health_info["issues"] = issues
                return health_info

            # GraphQL may return an explicit null for any field, which
            # dict.get(key, default) does not guard against; hence "or {}".

            # System info analysis
            info = response_data.get("info") or {}
            if info:
                health_info["unraid_system"] = {
                    "status": "connected",
                    "url": UNRAID_API_URL,
                    "machine_id": info.get("machineId"),
                    "time": info.get("time"),
                    "version": (info.get("versions") or {}).get("unraid"),
                    "uptime": (info.get("os") or {}).get("uptime"),
                }
            else:
                report("degraded", "Unable to retrieve system info")

            # Array health analysis
            array_info = response_data.get("array") or {}
            if array_info:
                array_state = array_info.get("state") or "unknown"
                array_ok = array_state in _HEALTHY_ARRAY_STATES
                health_info["array_status"] = {
                    "state": array_state,
                    "healthy": array_ok,
                }
                if not array_ok:
                    report("warning", f"Array in unexpected state: {array_state}")
            else:
                report("warning", "Unable to retrieve array status")

            # Notifications analysis
            overview = (response_data.get("notifications") or {}).get("overview")
            if overview:
                unread = overview.get("unread") or {}
                alert_count = unread.get("alert") or 0
                warning_count = unread.get("warning") or 0
                total_unread = unread.get("total") or 0
                health_info["notifications"] = {
                    "unread_total": total_unread,
                    "unread_alerts": alert_count,
                    "unread_warnings": warning_count,
                    "has_critical_notifications": alert_count > 0,
                }
                if alert_count > 0:
                    report("warning", f"{alert_count} unread alert notification(s)")

            # Docker services analysis
            containers = (response_data.get("docker") or {}).get("containers")
            if containers:
                # Compared case-insensitively so both "running" and an
                # upper-case enum spelling are counted.
                # NOTE(review): confirm the exact casing the API returns.
                states = [str(c.get("state") or "").lower() for c in containers]
                health_info["docker_services"] = {
                    "total_containers": len(containers),
                    "running_containers": states.count("running"),
                    "stopped_containers": states.count("exited"),
                    "containers_healthy": sum(
                        1
                        for c in containers
                        if (c.get("status") or "").startswith("Up")
                    ),
                }

            # API performance assessment: test the stricter threshold first,
            # otherwise the "very high" branch can never be reached.
            if api_latency > _VERY_HIGH_LATENCY_MS:  # > 10 seconds
                report("degraded", f"Very high API latency: {api_latency}ms")
            elif api_latency > _HIGH_LATENCY_MS:  # > 5 seconds
                report("warning", f"High API latency: {api_latency}ms")

            # Final status determination
            health_info["status"] = health_status
            if issues:
                health_info["issues"] = issues

            # Add performance metrics
            health_info["performance"] = {
                "api_response_time_ms": api_latency,
                "health_check_duration_ms": _elapsed_ms(start_time),
            }

            return health_info

        except Exception as e:
            # Deliberately broad: this is the tool boundary, and a health
            # probe should describe the failure instead of propagating it.
            logger.error(f"Health check failed: {e}")
            return {
                "status": "unhealthy",
                "timestamp": _utc_timestamp(),
                "error": str(e),
                "api_latency_ms": _elapsed_ms(start_time),
                "server": _server_info(),
            }

    logger.info("Health tools registered successfully")