"""Health monitoring and diagnostics. Provides the `unraid_health` tool with 3 actions for system health checks, connection testing, and subscription diagnostics. """ import datetime import time from typing import Any, Literal, get_args from fastmcp import FastMCP from ..config.logging import logger from ..config.settings import ( UNRAID_API_URL, UNRAID_MCP_HOST, UNRAID_MCP_PORT, UNRAID_MCP_TRANSPORT, VERSION, ) from ..core.client import make_graphql_request from ..core.exceptions import ToolError, tool_error_handler from ..core.utils import safe_display_url ALL_ACTIONS = {"check", "test_connection", "diagnose"} HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"] if set(get_args(HEALTH_ACTIONS)) != ALL_ACTIONS: _missing = ALL_ACTIONS - set(get_args(HEALTH_ACTIONS)) _extra = set(get_args(HEALTH_ACTIONS)) - ALL_ACTIONS raise RuntimeError( "HEALTH_ACTIONS and ALL_ACTIONS are out of sync. " f"Missing in HEALTH_ACTIONS: {_missing}; extra in HEALTH_ACTIONS: {_extra}" ) # Severity ordering: only upgrade, never downgrade _SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3} def _server_info() -> dict[str, Any]: """Return the standard server info block used in health responses.""" return { "name": "Unraid MCP Server", "version": VERSION, "transport": UNRAID_MCP_TRANSPORT, "host": UNRAID_MCP_HOST, "port": UNRAID_MCP_PORT, } def register_health_tool(mcp: FastMCP) -> None: """Register the unraid_health tool with the FastMCP instance.""" @mcp.tool() async def unraid_health( action: HEALTH_ACTIONS, ) -> dict[str, Any]: """Monitor Unraid MCP server and system health. Actions: check - Comprehensive health check (API latency, array, notifications, Docker) test_connection - Quick connectivity test (just checks { online }) diagnose - Subscription system diagnostics """ if action not in ALL_ACTIONS: raise ToolError(f"Invalid action '{action}'. Must be one of: {sorted(ALL_ACTIONS)}") with tool_error_handler("health", action, logger): logger.info(f"Executing unraid_health action={action}") if action == "test_connection": start = time.time() data = await make_graphql_request("query { online }") latency = round((time.time() - start) * 1000, 2) return { "status": "connected", "online": data.get("online"), "latency_ms": latency, } if action == "check": return await _comprehensive_check() if action == "diagnose": return await _diagnose_subscriptions() raise ToolError(f"Unhandled action '{action}' — this is a bug") logger.info("Health tool registered successfully") async def _comprehensive_check() -> dict[str, Any]: """Run comprehensive health check against the Unraid system.""" start_time = time.time() health_severity = 0 # Track as int to prevent downgrade issues: list[str] = [] def _escalate(level: str) -> None: nonlocal health_severity health_severity = max(health_severity, _SEVERITY.get(level, 0)) try: query = """ query ComprehensiveHealthCheck { info { machineId time versions { unraid } os { uptime } } array { state } notifications { overview { unread { alert warning total } } } docker { containers { id state status } } } """ data = await make_graphql_request(query) api_latency = round((time.time() - start_time) * 1000, 2) health_info: dict[str, Any] = { "status": "healthy", "timestamp": datetime.datetime.now(datetime.UTC).isoformat(), "api_latency_ms": api_latency, "server": _server_info(), } if not data: health_info["status"] = "unhealthy" health_info["issues"] = ["No response from Unraid API"] return health_info # System info info = data.get("info", {}) if info: health_info["unraid_system"] = { "status": "connected", "url": safe_display_url(UNRAID_API_URL), "machine_id": info.get("machineId"), "version": info.get("versions", {}).get("unraid"), "uptime": info.get("os", {}).get("uptime"), } else: _escalate("degraded") issues.append("Unable to retrieve system info") # Array array_info = data.get("array", {}) if array_info: state = array_info.get("state", "unknown") health_info["array_status"] = { "state": state, "healthy": state in ("STARTED", "STOPPED"), } if state not in ("STARTED", "STOPPED"): _escalate("warning") issues.append(f"Array in unexpected state: {state}") else: _escalate("warning") issues.append("Unable to retrieve array status") # Notifications notifications = data.get("notifications", {}) if notifications and notifications.get("overview"): unread = notifications["overview"].get("unread", {}) alerts = unread.get("alert", 0) health_info["notifications"] = { "unread_total": unread.get("total", 0), "unread_alerts": alerts, "unread_warnings": unread.get("warning", 0), } if alerts > 0: _escalate("warning") issues.append(f"{alerts} unread alert(s)") # Docker docker = data.get("docker", {}) if docker and docker.get("containers"): containers = docker["containers"] health_info["docker_services"] = { "total": len(containers), "running": len([c for c in containers if c.get("state") == "running"]), "stopped": len([c for c in containers if c.get("state") == "exited"]), } # Latency assessment if api_latency > 10000: _escalate("degraded") issues.append(f"Very high API latency: {api_latency}ms") elif api_latency > 5000: _escalate("warning") issues.append(f"High API latency: {api_latency}ms") # Resolve final status from severity level severity_to_status = {v: k for k, v in _SEVERITY.items()} health_info["status"] = severity_to_status.get(health_severity, "healthy") if issues: health_info["issues"] = issues health_info["performance"] = { "api_response_time_ms": api_latency, "check_duration_ms": round((time.time() - start_time) * 1000, 2), } return health_info except Exception as e: # Intentionally broad: health checks must always return a result, # even on unexpected failures, so callers never get an unhandled exception. logger.error(f"Health check failed: {e}", exc_info=True) return { "status": "unhealthy", "timestamp": datetime.datetime.now(datetime.UTC).isoformat(), "error": str(e), "server": _server_info(), } def _analyze_subscription_status( status: dict[str, Any], ) -> tuple[int, list[dict[str, Any]]]: """Analyze subscription status dict, returning error count and connection issues. This is the canonical implementation of subscription status analysis. TODO: subscriptions/diagnostics.py has a similar status-analysis pattern in diagnose_subscriptions(). That module could import and call this helper directly to avoid divergence. See Code-H05. Args: status: Dict of subscription name -> status info from get_subscription_status(). Returns: Tuple of (error_count, connection_issues_list). """ error_count = 0 connection_issues: list[dict[str, Any]] = [] for sub_name, sub_status in status.items(): runtime = sub_status.get("runtime", {}) conn_state = runtime.get("connection_state", "unknown") if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"): error_count += 1 if runtime.get("last_error"): connection_issues.append( { "subscription": sub_name, "state": conn_state, "error": runtime["last_error"], } ) return error_count, connection_issues async def _diagnose_subscriptions() -> dict[str, Any]: """Import and run subscription diagnostics.""" try: from ..subscriptions.manager import subscription_manager from ..subscriptions.resources import ensure_subscriptions_started await ensure_subscriptions_started() status = await subscription_manager.get_subscription_status() error_count, connection_issues = _analyze_subscription_status(status) return { "timestamp": datetime.datetime.now(datetime.UTC).isoformat(), "environment": { "auto_start_enabled": subscription_manager.auto_start_enabled, "max_reconnect_attempts": subscription_manager.max_reconnect_attempts, "api_url_configured": bool(UNRAID_API_URL), }, "subscriptions": status, "summary": { "total_configured": len(subscription_manager.subscription_configs), "active_count": len(subscription_manager.active_subscriptions), "with_data": len(subscription_manager.resource_data), "in_error_state": error_count, "connection_issues": connection_issues, }, } except ImportError as e: raise ToolError("Subscription modules not available") from e except Exception as e: raise ToolError(f"Failed to generate diagnostics: {e!s}") from e