mirror of
https://github.com/jmagar/unraid-mcp.git
synced 2026-03-02 00:04:45 -08:00
Critical bug fixes from PR review agents: - client.py: eager asyncio.Lock init, Final[frozenset] for _SENSITIVE_KEYS, explicit 429 ToolError after retries exhausted, removed lazy _get_client_lock() and _RateLimiter._get_lock() patterns - exceptions.py: use builtin TimeoutError (UP041), explicit handler before broad except so asyncio timeouts get descriptive messages - docker.py: add update_all to DESTRUCTIVE_ACTIONS (was missing), remove dead _MUTATION_ACTIONS constant - manager.py: _cap_log_content returns new dict (immutable), lock write to resource_data, clean dead task from active_subscriptions after loop exits - diagnostics.py: fix inaccurate comment about semicolon injection guard - health.py: narrow except ValueError in _safe_display_url, fix TODO comment New test coverage (98 tests added, 529 → 598 passing): - test_subscription_validation.py: 27 tests for _validate_subscription_query (security-critical allow-list, forbidden keyword guards, word-boundary test) - test_subscription_manager.py: 12 tests for _cap_log_content (immutability, truncation, nesting, passthrough) - test_client.py: +57 tests — _RateLimiter (token math, refill, sleep-on-empty), _QueryCache (TTL, invalidation, is_cacheable), 429 retry loop (1/2/3 failures) - test_health.py: +10 tests for _safe_display_url (credential strip, port, path/query removal, malformed IPv6 → <unparseable>) - test_notifications.py: +7 importance enum and field length validation tests - test_rclone.py: +7 _validate_config_data security guard tests - test_storage.py: +15 (tail_lines bounds, format_kb, safe_get) - test_docker.py: update_all now requires confirm=True + new guard test - test_destructive_guards.py: update audit to include update_all Co-authored-by: Claude <noreply@anthropic.com>
303 lines
10 KiB
Python
"""Health monitoring and diagnostics.
|
|
|
|
Provides the `unraid_health` tool with 3 actions for system health checks,
|
|
connection testing, and subscription diagnostics.
|
|
"""
|
|
|
|
import datetime
import time
from collections.abc import Callable
from typing import Any, Literal
from urllib.parse import urlparse

from fastmcp import FastMCP

from ..config.logging import logger
from ..config.settings import (
    UNRAID_API_URL,
    UNRAID_MCP_HOST,
    UNRAID_MCP_PORT,
    UNRAID_MCP_TRANSPORT,
    VERSION,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError, tool_error_handler
|
|
|
|
|
|
def _safe_display_url(url: str | None) -> str | None:
|
|
"""Return a redacted URL showing only scheme + host + port.
|
|
|
|
Strips path, query parameters, credentials, and fragments to avoid
|
|
leaking internal network topology or embedded secrets (CWE-200).
|
|
"""
|
|
if not url:
|
|
return None
|
|
try:
|
|
parsed = urlparse(url)
|
|
host = parsed.hostname or "unknown"
|
|
if parsed.port:
|
|
return f"{parsed.scheme}://{host}:{parsed.port}"
|
|
return f"{parsed.scheme}://{host}"
|
|
except ValueError:
|
|
# urlparse raises ValueError for invalid URLs (e.g. contains control chars)
|
|
return "<unparseable>"
|
|
|
|
|
|
# Runtime allow-list of valid action names; must stay in sync with the
# HEALTH_ACTIONS Literal below and the branches inside unraid_health().
ALL_ACTIONS = {"check", "test_connection", "diagnose"}

# Static Literal type for the tool's `action` parameter (schema/typing only;
# ALL_ACTIONS provides the matching runtime validation).
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]

# Severity ordering: only upgrade, never downgrade
_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}
|
|
|
|
|
|
def _server_info() -> dict[str, Any]:
    """Build the server metadata block embedded in every health response."""
    return dict(
        name="Unraid MCP Server",
        version=VERSION,
        transport=UNRAID_MCP_TRANSPORT,
        host=UNRAID_MCP_HOST,
        port=UNRAID_MCP_PORT,
    )
|
|
|
|
|
|
def register_health_tool(mcp: FastMCP) -> None:
    """Register the unraid_health tool with the FastMCP instance."""

    @mcp.tool()
    async def unraid_health(
        action: HEALTH_ACTIONS,
    ) -> dict[str, Any]:
        """Monitor Unraid MCP server and system health.

        Actions:
            check - Comprehensive health check (API latency, array, notifications, Docker)
            test_connection - Quick connectivity test (just checks { online })
            diagnose - Subscription system diagnostics

        Returns:
            An action-specific dict (connection status, full health report,
            or subscription diagnostics).

        Raises:
            ToolError: If the action is not in ALL_ACTIONS, or via
                tool_error_handler when the underlying request fails.
        """
        # Defense-in-depth: the Literal type constrains well-behaved clients,
        # but the value arrives over the wire, so validate at runtime too.
        if action not in ALL_ACTIONS:
            raise ToolError(f"Invalid action '{action}'. Must be one of: {sorted(ALL_ACTIONS)}")

        # tool_error_handler converts unexpected exceptions into tool errors
        # tagged with the tool name and action.
        with tool_error_handler("health", action, logger):
            logger.info(f"Executing unraid_health action={action}")

            if action == "test_connection":
                # Minimal round-trip: a single `{ online }` query, timed
                # client-side so latency includes transport overhead.
                start = time.time()
                data = await make_graphql_request("query { online }")
                latency = round((time.time() - start) * 1000, 2)
                return {
                    "status": "connected",
                    "online": data.get("online"),
                    "latency_ms": latency,
                }

            if action == "check":
                return await _comprehensive_check()

            if action == "diagnose":
                return await _diagnose_subscriptions()

            # Unreachable as long as ALL_ACTIONS and the branches above
            # stay in sync; kept as a loud failure if they ever diverge.
            raise ToolError(f"Unhandled action '{action}' — this is a bug")

    logger.info("Health tool registered successfully")
|
|
|
|
|
|
def _apply_system_section(
    data: dict[str, Any],
    health_info: dict[str, Any],
    escalate: Callable[[str], None],
    issues: list[str],
) -> None:
    """Record Unraid system info in health_info, or degrade health if missing."""
    info = data.get("info", {})
    if info:
        health_info["unraid_system"] = {
            "status": "connected",
            "url": _safe_display_url(UNRAID_API_URL),
            "machine_id": info.get("machineId"),
            "version": info.get("versions", {}).get("unraid"),
            "uptime": info.get("os", {}).get("uptime"),
        }
    else:
        escalate("degraded")
        issues.append("Unable to retrieve system info")


def _apply_array_section(
    data: dict[str, Any],
    health_info: dict[str, Any],
    escalate: Callable[[str], None],
    issues: list[str],
) -> None:
    """Record array state; warn when it is missing or in an unexpected state."""
    array_info = data.get("array", {})
    if not array_info:
        escalate("warning")
        issues.append("Unable to retrieve array status")
        return

    state = array_info.get("state", "unknown")
    health_info["array_status"] = {
        "state": state,
        "healthy": state in ("STARTED", "STOPPED"),
    }
    if state not in ("STARTED", "STOPPED"):
        escalate("warning")
        issues.append(f"Array in unexpected state: {state}")


def _apply_notifications_section(
    data: dict[str, Any],
    health_info: dict[str, Any],
    escalate: Callable[[str], None],
    issues: list[str],
) -> None:
    """Record unread-notification counts; warn when unread alerts exist.

    Silently skips the section when the overview is absent (matching the
    original behavior: missing notifications data is not treated as an issue).
    """
    notifications = data.get("notifications", {})
    if not (notifications and notifications.get("overview")):
        return

    unread = notifications["overview"].get("unread", {})
    alerts = unread.get("alert", 0)
    health_info["notifications"] = {
        "unread_total": unread.get("total", 0),
        "unread_alerts": alerts,
        "unread_warnings": unread.get("warning", 0),
    }
    if alerts > 0:
        escalate("warning")
        issues.append(f"{alerts} unread alert(s)")


def _apply_docker_section(
    data: dict[str, Any],
    health_info: dict[str, Any],
) -> None:
    """Record Docker container counts when container data is present."""
    docker = data.get("docker", {})
    if not (docker and docker.get("containers")):
        return

    containers = docker["containers"]
    health_info["docker_services"] = {
        "total": len(containers),
        "running": len([c for c in containers if c.get("state") == "running"]),
        "stopped": len([c for c in containers if c.get("state") == "exited"]),
    }


def _apply_latency_assessment(
    api_latency: float,
    escalate: Callable[[str], None],
    issues: list[str],
) -> None:
    """Escalate health when API latency crosses warning/degraded thresholds."""
    if api_latency > 10000:
        escalate("degraded")
        issues.append(f"Very high API latency: {api_latency}ms")
    elif api_latency > 5000:
        escalate("warning")
        issues.append(f"High API latency: {api_latency}ms")


async def _comprehensive_check() -> dict[str, Any]:
    """Run comprehensive health check against the Unraid system.

    Issues a single combined GraphQL query, then delegates each subsystem
    (system info, array, notifications, Docker, latency) to a section
    helper that fills in `health_info` and escalates severity as needed.

    Returns:
        A dict that always contains `status`, `timestamp`, and `server`;
        on success also per-subsystem sections, optional `issues`, and a
        `performance` block; on failure an `error` message instead.
    """
    start_time = time.time()
    health_severity = 0  # Track as int to prevent downgrade
    issues: list[str] = []

    def _escalate(level: str) -> None:
        # Severity only ever moves upward; unknown levels map to "healthy" (0).
        nonlocal health_severity
        health_severity = max(health_severity, _SEVERITY.get(level, 0))

    try:
        query = """
        query ComprehensiveHealthCheck {
            info {
                machineId time
                versions { unraid }
                os { uptime }
            }
            array { state }
            notifications {
                overview { unread { alert warning total } }
            }
            docker {
                containers { id state status }
            }
        }
        """
        data = await make_graphql_request(query)
        api_latency = round((time.time() - start_time) * 1000, 2)

        health_info: dict[str, Any] = {
            "status": "healthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "api_latency_ms": api_latency,
            "server": _server_info(),
        }

        if not data:
            health_info["status"] = "unhealthy"
            health_info["issues"] = ["No response from Unraid API"]
            return health_info

        _apply_system_section(data, health_info, _escalate, issues)
        _apply_array_section(data, health_info, _escalate, issues)
        _apply_notifications_section(data, health_info, _escalate, issues)
        _apply_docker_section(data, health_info)
        _apply_latency_assessment(api_latency, _escalate, issues)

        # Resolve final status from the accumulated severity level.
        severity_to_status = {v: k for k, v in _SEVERITY.items()}
        health_info["status"] = severity_to_status.get(health_severity, "healthy")
        if issues:
            health_info["issues"] = issues
        health_info["performance"] = {
            "api_response_time_ms": api_latency,
            "check_duration_ms": round((time.time() - start_time) * 1000, 2),
        }

        return health_info

    except Exception as e:
        # Intentionally broad: health checks must always return a result,
        # even on unexpected failures, so callers never get an unhandled exception.
        logger.error(f"Health check failed: {e}")
        return {
            "status": "unhealthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "error": str(e),
            "server": _server_info(),
        }
|
|
|
|
|
|
def _analyze_subscription_status(
|
|
status: dict[str, Any],
|
|
) -> tuple[int, list[dict[str, Any]]]:
|
|
"""Analyze subscription status dict, returning error count and connection issues.
|
|
|
|
This is the canonical implementation of subscription status analysis.
|
|
TODO: subscriptions/diagnostics.py has a similar status-analysis pattern
|
|
in diagnose_subscriptions(). That module could import and call this helper
|
|
directly to avoid divergence. See Code-H05.
|
|
|
|
Args:
|
|
status: Dict of subscription name -> status info from get_subscription_status().
|
|
|
|
Returns:
|
|
Tuple of (error_count, connection_issues_list).
|
|
"""
|
|
error_count = 0
|
|
connection_issues: list[dict[str, Any]] = []
|
|
|
|
for sub_name, sub_status in status.items():
|
|
runtime = sub_status.get("runtime", {})
|
|
conn_state = runtime.get("connection_state", "unknown")
|
|
if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
|
|
error_count += 1
|
|
if runtime.get("last_error"):
|
|
connection_issues.append(
|
|
{
|
|
"subscription": sub_name,
|
|
"state": conn_state,
|
|
"error": runtime["last_error"],
|
|
}
|
|
)
|
|
|
|
return error_count, connection_issues
|
|
|
|
|
|
async def _diagnose_subscriptions() -> dict[str, Any]:
    """Run subscription-system diagnostics, importing the stack lazily.

    Returns a diagnostics report, or a minimal error dict when the
    subscription modules are not installed.
    """
    try:
        # Lazy imports keep the health tool usable even when the optional
        # subscription stack is absent (handled by the ImportError branch).
        from ..subscriptions.manager import subscription_manager
        from ..subscriptions.resources import ensure_subscriptions_started

        await ensure_subscriptions_started()

        status = await subscription_manager.get_subscription_status()
        error_count, connection_issues = _analyze_subscription_status(status)

        environment = {
            "auto_start_enabled": subscription_manager.auto_start_enabled,
            "max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
            "api_url_configured": bool(UNRAID_API_URL),
        }
        summary = {
            "total_configured": len(subscription_manager.subscription_configs),
            "active_count": len(subscription_manager.active_subscriptions),
            "with_data": len(subscription_manager.resource_data),
            "in_error_state": error_count,
            "connection_issues": connection_issues,
        }

        return {
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "environment": environment,
            "subscriptions": status,
            "summary": summary,
        }

    except ImportError:
        return {
            "error": "Subscription modules not available",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
        }
    except Exception as e:
        raise ToolError(f"Failed to generate diagnostics: {e!s}") from e
|