feat: consolidate 26 tools into 10 tools with 90 actions

Refactor the entire tool layer to use the consolidated action pattern
(action: Literal[...] with QUERIES/MUTATIONS dicts). This reduces LLM
context from ~12k to ~5k tokens while adding ~60 new API capabilities.

New tools: unraid_info (19 actions), unraid_array (12), unraid_notifications (9),
unraid_users (8), unraid_keys (5). Rewritten: unraid_docker (15), unraid_vm (9),
unraid_storage (6), unraid_rclone (4), unraid_health (3).

Includes 129 tests across 10 test files, code review fixes for 16 issues
(severity ordering, PrefixedID regex, sensitive var redaction, etc.).

Removes tools/system.py (replaced by tools/info.py). Version bumped to 0.2.0.
This commit is contained in:
Jacob Magar
2026-02-08 08:49:47 -05:00
parent 67b775a9bc
commit 523b3edc76
33 changed files with 3538 additions and 1583 deletions

View File

@@ -1,186 +1,264 @@
"""Comprehensive health monitoring tools.
"""Health monitoring and diagnostics.
This module provides tools for comprehensive health checks of the Unraid MCP server
and the underlying Unraid system, including performance metrics, system status,
notifications, Docker services, and API responsiveness.
Provides the `unraid_health` tool with 3 actions for system health checks,
connection testing, and subscription diagnostics.
"""
import datetime
import time
from typing import Any
from typing import Any, Literal
from fastmcp import FastMCP
from ..config.logging import logger
from ..config.settings import UNRAID_API_URL, UNRAID_MCP_HOST, UNRAID_MCP_PORT, UNRAID_MCP_TRANSPORT
from ..config.settings import (
UNRAID_API_URL,
UNRAID_MCP_HOST,
UNRAID_MCP_PORT,
UNRAID_MCP_TRANSPORT,
VERSION,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]
# Severity ordering: only upgrade, never downgrade
_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}
def register_health_tools(mcp: FastMCP) -> None:
"""Register all health tools with the FastMCP instance.
Args:
mcp: FastMCP instance to register tools with
"""
def register_health_tool(mcp: FastMCP) -> None:
"""Register the unraid_health tool with the FastMCP instance."""
@mcp.tool()
async def health_check() -> dict[str, Any]:
"""Returns comprehensive health status of the Unraid MCP server and system for monitoring purposes."""
start_time = time.time()
health_status = "healthy"
issues = []
async def unraid_health(
action: HEALTH_ACTIONS,
) -> dict[str, Any]:
"""Monitor Unraid MCP server and system health.
Actions:
check - Comprehensive health check (API latency, array, notifications, Docker)
test_connection - Quick connectivity test (just checks { online })
diagnose - Subscription system diagnostics
"""
if action not in ("check", "test_connection", "diagnose"):
raise ToolError(
f"Invalid action '{action}'. Must be one of: check, test_connection, diagnose"
)
try:
# Enhanced health check with multiple system components
comprehensive_query = """
query ComprehensiveHealthCheck {
info {
machineId
time
versions { unraid }
os { uptime }
}
array {
state
}
notifications {
overview {
unread { alert warning total }
}
}
docker {
containers(skipCache: true) {
id
state
status
}
}
}
"""
logger.info(f"Executing unraid_health action={action}")
response_data = await make_graphql_request(comprehensive_query)
api_latency = round((time.time() - start_time) * 1000, 2) # ms
# Base health info
health_info = {
"status": health_status,
"timestamp": datetime.datetime.utcnow().isoformat(),
"api_latency_ms": api_latency,
"server": {
"name": "Unraid MCP Server",
"version": "0.1.0",
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT,
"process_uptime_seconds": time.time() - start_time # Rough estimate
}
}
if not response_data:
health_status = "unhealthy"
issues.append("No response from Unraid API")
health_info["status"] = health_status
health_info["issues"] = issues
return health_info
# System info analysis
info = response_data.get("info", {})
if info:
health_info["unraid_system"] = {
if action == "test_connection":
start = time.time()
data = await make_graphql_request("query { online }")
latency = round((time.time() - start) * 1000, 2)
return {
"status": "connected",
"url": UNRAID_API_URL,
"machine_id": info.get("machineId"),
"time": info.get("time"),
"version": info.get("versions", {}).get("unraid"),
"uptime": info.get("os", {}).get("uptime")
}
else:
health_status = "degraded"
issues.append("Unable to retrieve system info")
# Array health analysis
array_info = response_data.get("array", {})
if array_info:
array_state = array_info.get("state", "unknown")
health_info["array_status"] = {
"state": array_state,
"healthy": array_state in ["STARTED", "STOPPED"]
}
if array_state not in ["STARTED", "STOPPED"]:
health_status = "warning"
issues.append(f"Array in unexpected state: {array_state}")
else:
health_status = "warning"
issues.append("Unable to retrieve array status")
# Notifications analysis
notifications = response_data.get("notifications", {})
if notifications and notifications.get("overview"):
unread = notifications["overview"].get("unread", {})
alert_count = unread.get("alert", 0)
warning_count = unread.get("warning", 0)
total_unread = unread.get("total", 0)
health_info["notifications"] = {
"unread_total": total_unread,
"unread_alerts": alert_count,
"unread_warnings": warning_count,
"has_critical_notifications": alert_count > 0
"online": data.get("online"),
"latency_ms": latency,
}
if alert_count > 0:
health_status = "warning"
issues.append(f"{alert_count} unread alert notification(s)")
if action == "check":
return await _comprehensive_check()
# Docker services analysis
docker_info = response_data.get("docker", {})
if docker_info and docker_info.get("containers"):
containers = docker_info["containers"]
running_containers = [c for c in containers if c.get("state") == "running"]
stopped_containers = [c for c in containers if c.get("state") == "exited"]
if action == "diagnose":
return await _diagnose_subscriptions()
health_info["docker_services"] = {
"total_containers": len(containers),
"running_containers": len(running_containers),
"stopped_containers": len(stopped_containers),
"containers_healthy": len([c for c in containers if c.get("status", "").startswith("Up")])
}
return {}
# API performance assessment
if api_latency > 5000: # > 5 seconds
health_status = "warning"
issues.append(f"High API latency: {api_latency}ms")
elif api_latency > 10000: # > 10 seconds
health_status = "degraded"
issues.append(f"Very high API latency: {api_latency}ms")
except ToolError:
raise
except Exception as e:
logger.error(f"Error in unraid_health action={action}: {e}", exc_info=True)
raise ToolError(f"Failed to execute health/{action}: {str(e)}") from e
# Final status determination
health_info["status"] = health_status
if issues:
health_info["issues"] = issues
logger.info("Health tool registered successfully")
# Add performance metrics
health_info["performance"] = {
"api_response_time_ms": api_latency,
"health_check_duration_ms": round((time.time() - start_time) * 1000, 2)
}
async def _comprehensive_check() -> dict[str, Any]:
"""Run comprehensive health check against the Unraid system."""
start_time = time.time()
health_severity = 0 # Track as int to prevent downgrade
issues: list[str] = []
def _escalate(level: str) -> None:
nonlocal health_severity
health_severity = max(health_severity, _SEVERITY.get(level, 0))
try:
query = """
query ComprehensiveHealthCheck {
info {
machineId time
versions { unraid }
os { uptime }
}
array { state }
notifications {
overview { unread { alert warning total } }
}
docker {
containers(skipCache: true) { id state status }
}
}
"""
data = await make_graphql_request(query)
api_latency = round((time.time() - start_time) * 1000, 2)
health_info: dict[str, Any] = {
"status": "healthy",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"api_latency_ms": api_latency,
"server": {
"name": "Unraid MCP Server",
"version": VERSION,
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT,
},
}
if not data:
health_info["status"] = "unhealthy"
health_info["issues"] = ["No response from Unraid API"]
return health_info
except Exception as e:
logger.error(f"Health check failed: {e}")
return {
"status": "unhealthy",
"timestamp": datetime.datetime.utcnow().isoformat(),
"error": str(e),
"api_latency_ms": round((time.time() - start_time) * 1000, 2) if 'start_time' in locals() else None,
"server": {
"name": "Unraid MCP Server",
"version": "0.1.0",
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT
}
# System info
info = data.get("info", {})
if info:
health_info["unraid_system"] = {
"status": "connected",
"url": UNRAID_API_URL,
"machine_id": info.get("machineId"),
"version": info.get("versions", {}).get("unraid"),
"uptime": info.get("os", {}).get("uptime"),
}
else:
_escalate("degraded")
issues.append("Unable to retrieve system info")
# Array
array_info = data.get("array", {})
if array_info:
state = array_info.get("state", "unknown")
health_info["array_status"] = {
"state": state,
"healthy": state in ("STARTED", "STOPPED"),
}
if state not in ("STARTED", "STOPPED"):
_escalate("warning")
issues.append(f"Array in unexpected state: {state}")
else:
_escalate("warning")
issues.append("Unable to retrieve array status")
# Notifications
notifications = data.get("notifications", {})
if notifications and notifications.get("overview"):
unread = notifications["overview"].get("unread", {})
alerts = unread.get("alert", 0)
health_info["notifications"] = {
"unread_total": unread.get("total", 0),
"unread_alerts": alerts,
"unread_warnings": unread.get("warning", 0),
}
if alerts > 0:
_escalate("warning")
issues.append(f"{alerts} unread alert(s)")
# Docker
docker = data.get("docker", {})
if docker and docker.get("containers"):
containers = docker["containers"]
health_info["docker_services"] = {
"total": len(containers),
"running": len([c for c in containers if c.get("state") == "running"]),
"stopped": len([c for c in containers if c.get("state") == "exited"]),
}
logger.info("Health tools registered successfully")
# Latency assessment
if api_latency > 10000:
_escalate("degraded")
issues.append(f"Very high API latency: {api_latency}ms")
elif api_latency > 5000:
_escalate("warning")
issues.append(f"High API latency: {api_latency}ms")
# Resolve final status from severity level
severity_to_status = {v: k for k, v in _SEVERITY.items()}
health_info["status"] = severity_to_status.get(health_severity, "healthy")
if issues:
health_info["issues"] = issues
health_info["performance"] = {
"api_response_time_ms": api_latency,
"check_duration_ms": round((time.time() - start_time) * 1000, 2),
}
return health_info
except Exception as e:
logger.error(f"Health check failed: {e}")
return {
"status": "unhealthy",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"error": str(e),
"server": {
"name": "Unraid MCP Server",
"version": VERSION,
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT,
},
}
async def _diagnose_subscriptions() -> dict[str, Any]:
"""Import and run subscription diagnostics."""
try:
from ..subscriptions.manager import subscription_manager
from ..subscriptions.resources import ensure_subscriptions_started
await ensure_subscriptions_started()
status = subscription_manager.get_subscription_status()
connection_issues: list[dict[str, Any]] = []
diagnostic_info: dict[str, Any] = {
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"environment": {
"auto_start_enabled": subscription_manager.auto_start_enabled,
"max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
"api_url_configured": bool(UNRAID_API_URL),
},
"subscriptions": status,
"summary": {
"total_configured": len(subscription_manager.subscription_configs),
"active_count": len(subscription_manager.active_subscriptions),
"with_data": len(subscription_manager.resource_data),
"in_error_state": 0,
"connection_issues": connection_issues,
},
}
for sub_name, sub_status in status.items():
runtime = sub_status.get("runtime", {})
conn_state = runtime.get("connection_state", "unknown")
if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
diagnostic_info["summary"]["in_error_state"] += 1
if runtime.get("last_error"):
connection_issues.append({
"subscription": sub_name,
"state": conn_state,
"error": runtime["last_error"],
})
return diagnostic_info
except ImportError:
return {
"error": "Subscription modules not available",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
}
except Exception as e:
raise ToolError(f"Failed to generate diagnostics: {str(e)}") from e