Files
unraid-mcp/unraid_mcp/tools/health.py
Jacob Magar 493a376640 lintfree
2025-08-12 11:35:00 -04:00

187 lines
7.0 KiB
Python

"""Comprehensive health monitoring tools.
This module provides tools for comprehensive health checks of the Unraid MCP server
and the underlying Unraid system, including performance metrics, system status,
notifications, Docker services, and API responsiveness.
"""
import datetime
import time
from typing import Any
from fastmcp import FastMCP
from ..config.logging import logger
from ..config.settings import UNRAID_API_URL, UNRAID_MCP_HOST, UNRAID_MCP_PORT, UNRAID_MCP_TRANSPORT
from ..core.client import make_graphql_request
# Health states ordered from least to most severe. Used so that a later, milder
# finding never downgrades a more severe status recorded earlier in the check.
_HEALTH_SEVERITY = ("healthy", "warning", "degraded", "unhealthy")

# Array states that are considered normal operation.
_EXPECTED_ARRAY_STATES = ("STARTED", "STOPPED")

# API latency thresholds, in milliseconds.
_LATENCY_WARNING_MS = 5000
_LATENCY_DEGRADED_MS = 10000


def _escalate_status(current: str, candidate: str) -> str:
    """Return whichever of the two health statuses is more severe."""
    return max(current, candidate, key=_HEALTH_SEVERITY.index)


def _utc_timestamp() -> str:
    """Return the current UTC time as a naive ISO-8601 string.

    Emits the same format as ``datetime.utcnow().isoformat()`` without relying
    on the deprecated ``utcnow`` call.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    return now.replace(tzinfo=None).isoformat()


def register_health_tools(mcp: FastMCP) -> None:
    """Register all health tools with the FastMCP instance.

    Args:
        mcp: FastMCP instance to register tools with
    """

    # NOTE(review): FastMCP presumably exposes the tool's docstring to clients
    # as its description, so the docstring below is kept unchanged - confirm.
    @mcp.tool()
    async def health_check() -> dict[str, Any]:
        """Returns comprehensive health status of the Unraid MCP server and system for monitoring purposes."""
        start_time = time.time()
        health_status = "healthy"
        issues: list[str] = []

        # Shared by the success and failure payloads.
        server_info: dict[str, Any] = {
            "name": "Unraid MCP Server",
            "version": "0.1.0",
            "transport": UNRAID_MCP_TRANSPORT,
            "host": UNRAID_MCP_HOST,
            "port": UNRAID_MCP_PORT,
        }

        try:
            # One query covering every component that is inspected below.
            comprehensive_query = """
            query ComprehensiveHealthCheck {
              info {
                machineId
                time
                versions { unraid }
                os { uptime }
              }
              array {
                state
              }
              notifications {
                overview {
                  unread { alert warning total }
                }
              }
              docker {
                containers(skipCache: true) {
                  id
                  state
                  status
                }
              }
            }
            """
            response_data = await make_graphql_request(comprehensive_query)
            api_latency = round((time.time() - start_time) * 1000, 2)  # ms

            # Base health info
            health_info: dict[str, Any] = {
                "status": health_status,
                "timestamp": _utc_timestamp(),
                "api_latency_ms": api_latency,
                "server": {
                    **server_info,
                    # Rough estimate: this is the time elapsed since this check
                    # started, not the real uptime of the server process.
                    "process_uptime_seconds": time.time() - start_time,
                },
            }

            if not response_data:
                issues.append("No response from Unraid API")
                health_info["status"] = "unhealthy"
                health_info["issues"] = issues
                return health_info

            # GraphQL may return an explicit null for any field, so every
            # nested lookup below falls back with ``or`` instead of relying on
            # ``dict.get`` defaults (which only apply to *missing* keys).

            # System info analysis
            info = response_data.get("info") or {}
            if info:
                health_info["unraid_system"] = {
                    "status": "connected",
                    "url": UNRAID_API_URL,
                    "machine_id": info.get("machineId"),
                    "time": info.get("time"),
                    "version": (info.get("versions") or {}).get("unraid"),
                    "uptime": (info.get("os") or {}).get("uptime"),
                }
            else:
                health_status = _escalate_status(health_status, "degraded")
                issues.append("Unable to retrieve system info")

            # Array health analysis
            array_info = response_data.get("array") or {}
            if array_info:
                array_state = array_info.get("state") or "unknown"
                array_ok = array_state in _EXPECTED_ARRAY_STATES
                health_info["array_status"] = {
                    "state": array_state,
                    "healthy": array_ok,
                }
                if not array_ok:
                    health_status = _escalate_status(health_status, "warning")
                    issues.append(f"Array in unexpected state: {array_state}")
            else:
                health_status = _escalate_status(health_status, "warning")
                issues.append("Unable to retrieve array status")

            # Notifications analysis
            overview = (response_data.get("notifications") or {}).get("overview")
            if overview:
                unread = overview.get("unread") or {}
                alert_count = unread.get("alert") or 0
                warning_count = unread.get("warning") or 0
                total_unread = unread.get("total") or 0
                health_info["notifications"] = {
                    "unread_total": total_unread,
                    "unread_alerts": alert_count,
                    "unread_warnings": warning_count,
                    "has_critical_notifications": alert_count > 0,
                }
                if alert_count > 0:
                    health_status = _escalate_status(health_status, "warning")
                    issues.append(f"{alert_count} unread alert notification(s)")

            # Docker services analysis (informational only; never affects status)
            containers = (response_data.get("docker") or {}).get("containers")
            if containers:
                health_info["docker_services"] = {
                    "total_containers": len(containers),
                    "running_containers": sum(
                        1 for c in containers if c.get("state") == "running"
                    ),
                    "stopped_containers": sum(
                        1 for c in containers if c.get("state") == "exited"
                    ),
                    "containers_healthy": sum(
                        1 for c in containers if (c.get("status") or "").startswith("Up")
                    ),
                }

            # API performance assessment. The stricter threshold must be tested
            # first, otherwise the "very high" branch can never be reached.
            if api_latency > _LATENCY_DEGRADED_MS:
                health_status = _escalate_status(health_status, "degraded")
                issues.append(f"Very high API latency: {api_latency}ms")
            elif api_latency > _LATENCY_WARNING_MS:
                health_status = _escalate_status(health_status, "warning")
                issues.append(f"High API latency: {api_latency}ms")

            # Final status determination
            health_info["status"] = health_status
            if issues:
                health_info["issues"] = issues

            # Add performance metrics
            health_info["performance"] = {
                "api_response_time_ms": api_latency,
                "health_check_duration_ms": round((time.time() - start_time) * 1000, 2),
            }
            return health_info

        except Exception as e:
            # Top-level boundary: a monitoring tool should report failure as
            # data rather than raise to the caller.
            logger.error(f"Health check failed: {e}")
            return {
                "status": "unhealthy",
                "timestamp": _utc_timestamp(),
                "error": str(e),
                "api_latency_ms": round((time.time() - start_time) * 1000, 2),
                "server": server_info,
            }

    logger.info("Health tools registered successfully")