Files
unraid-mcp/unraid_mcp/tools/health.py
Jacob Magar f76e676fd4 test: close critical coverage gaps and harden PR review fixes
Critical bug fixes from PR review agents:
- client.py: eager asyncio.Lock init, Final[frozenset] for _SENSITIVE_KEYS,
  explicit 429 ToolError after retries exhausted, removed lazy _get_client_lock()
  and _RateLimiter._get_lock() patterns
- exceptions.py: use builtin TimeoutError (UP041), explicit handler before broad
  except so asyncio timeouts get descriptive messages
- docker.py: add update_all to DESTRUCTIVE_ACTIONS (was missing), remove dead
  _MUTATION_ACTIONS constant
- manager.py: _cap_log_content returns new dict (immutable), lock write to
  resource_data, clean dead task from active_subscriptions after loop exits
- diagnostics.py: fix inaccurate comment about semicolon injection guard
- health.py: narrow except ValueError in _safe_display_url, fix TODO comment

New test coverage (98 tests added, 529 → 598 passing):
- test_subscription_validation.py: 27 tests for _validate_subscription_query
  (security-critical allow-list, forbidden keyword guards, word-boundary test)
- test_subscription_manager.py: 12 tests for _cap_log_content
  (immutability, truncation, nesting, passthrough)
- test_client.py: +57 tests — _RateLimiter (token math, refill, sleep-on-empty),
  _QueryCache (TTL, invalidation, is_cacheable), 429 retry loop (1/2/3 failures)
- test_health.py: +10 tests for _safe_display_url (credential strip, port,
  path/query removal, malformed IPv6 → <unparseable>)
- test_notifications.py: +7 importance enum and field length validation tests
- test_rclone.py: +7 _validate_config_data security guard tests
- test_storage.py: +15 (tail_lines bounds, format_kb, safe_get)
- test_docker.py: update_all now requires confirm=True + new guard test
- test_destructive_guards.py: update audit to include update_all

Co-authored-by: Claude <noreply@anthropic.com>
2026-02-18 01:28:40 -05:00

303 lines
10 KiB
Python

"""Health monitoring and diagnostics.
Provides the `unraid_health` tool with 3 actions for system health checks,
connection testing, and subscription diagnostics.
"""
import datetime
import time
from typing import Any, Literal
from urllib.parse import urlparse
from fastmcp import FastMCP
from ..config.logging import logger
from ..config.settings import (
UNRAID_API_URL,
UNRAID_MCP_HOST,
UNRAID_MCP_PORT,
UNRAID_MCP_TRANSPORT,
VERSION,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError, tool_error_handler
def _safe_display_url(url: str | None) -> str | None:
"""Return a redacted URL showing only scheme + host + port.
Strips path, query parameters, credentials, and fragments to avoid
leaking internal network topology or embedded secrets (CWE-200).
"""
if not url:
return None
try:
parsed = urlparse(url)
host = parsed.hostname or "unknown"
if parsed.port:
return f"{parsed.scheme}://{host}:{parsed.port}"
return f"{parsed.scheme}://{host}"
except ValueError:
# urlparse raises ValueError for invalid URLs (e.g. contains control chars)
return "<unparseable>"
ALL_ACTIONS = {"check", "test_connection", "diagnose"}
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]
# Severity ordering: only upgrade, never downgrade
_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}
def _server_info() -> dict[str, Any]:
    """Build the server-identity block embedded in health responses.

    Returns:
        A new dict holding the fixed display name followed by the version,
        transport, host and port read from the imported settings constants.
    """
    from_settings = {
        "version": VERSION,
        "transport": UNRAID_MCP_TRANSPORT,
        "host": UNRAID_MCP_HOST,
        "port": UNRAID_MCP_PORT,
    }
    # The display name goes first so the key order matches what callers see today.
    return {"name": "Unraid MCP Server", **from_settings}
def register_health_tool(mcp: FastMCP) -> None:
    """Register the unraid_health tool with the FastMCP instance.

    Args:
        mcp: The FastMCP server on which the tool is registered via its
            ``tool()`` decorator.
    """

    @mcp.tool()
    async def unraid_health(
        action: HEALTH_ACTIONS,
    ) -> dict[str, Any]:
        """Monitor Unraid MCP server and system health.

        Actions:
          check - Comprehensive health check (API latency, array, notifications, Docker)
          test_connection - Quick connectivity test (just checks { online })
          diagnose - Subscription system diagnostics
        """
        # The Literal annotation is not enforced at runtime, so validate here.
        if action not in ALL_ACTIONS:
            raise ToolError(f"Invalid action '{action}'. Must be one of: {sorted(ALL_ACTIONS)}")

        with tool_error_handler("health", action, logger):
            logger.info(f"Executing unraid_health action={action}")

            if action == "test_connection":
                # perf_counter is monotonic: unlike time.time(), the measured
                # latency cannot be skewed (or go negative) when the wall
                # clock is adjusted while the request is in flight.
                start = time.perf_counter()
                data = await make_graphql_request("query { online }")
                latency = round((time.perf_counter() - start) * 1000, 2)
                return {
                    "status": "connected",
                    "online": data.get("online"),
                    "latency_ms": latency,
                }

            if action == "check":
                return await _comprehensive_check()

            if action == "diagnose":
                return await _diagnose_subscriptions()

            # Only reachable if ALL_ACTIONS gains an entry without a branch above.
            raise ToolError(f"Unhandled action '{action}' — this is a bug")

    logger.info("Health tool registered successfully")
async def _comprehensive_check() -> dict[str, Any]:
    """Run comprehensive health check against the Unraid system.

    Issues a single GraphQL query covering system info, array state, unread
    notifications and Docker containers, then folds the findings into one
    overall status. Severity only ever escalates (ranks come from
    ``_SEVERITY``), so a later, milder finding cannot hide an earlier, more
    serious one.

    A GraphQL response may carry an explicit ``null`` for a field, and
    ``dict.get(key, default)`` only applies its default when the key is
    absent. Nested lookups therefore use ``value or default`` so a ``null``
    section degrades gracefully instead of failing the whole check.

    Returns:
        A dict that always contains ``status``, ``timestamp`` and ``server``.
        On a completed check it also contains ``api_latency_ms`` and
        ``performance``, plus whichever of ``unraid_system``,
        ``array_status``, ``notifications``, ``docker_services`` and
        ``issues`` apply. If the request or the analysis raises, the
        exception is logged and reported through an ``error`` key with
        status ``"unhealthy"`` rather than propagated.
    """
    # perf_counter is monotonic, so elapsed-time figures cannot be skewed by
    # wall-clock adjustments made while the check is running.
    start_time = time.perf_counter()
    health_severity = 0  # Track as int to prevent downgrade
    issues: list[str] = []

    def _escalate(level: str) -> None:
        """Raise the overall severity to ``level`` if it is currently lower."""
        nonlocal health_severity
        health_severity = max(health_severity, _SEVERITY.get(level, 0))

    try:
        query = """
        query ComprehensiveHealthCheck {
          info {
            machineId time
            versions { unraid }
            os { uptime }
          }
          array { state }
          notifications {
            overview { unread { alert warning total } }
          }
          docker {
            containers { id state status }
          }
        }
        """
        data = await make_graphql_request(query)
        api_latency = round((time.perf_counter() - start_time) * 1000, 2)

        health_info: dict[str, Any] = {
            "status": "healthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "api_latency_ms": api_latency,
            "server": _server_info(),
        }

        if not data:
            health_info["status"] = "unhealthy"
            health_info["issues"] = ["No response from Unraid API"]
            return health_info

        # System info
        info = data.get("info") or {}
        if info:
            versions = info.get("versions") or {}
            os_info = info.get("os") or {}
            health_info["unraid_system"] = {
                "status": "connected",
                "url": _safe_display_url(UNRAID_API_URL),
                "machine_id": info.get("machineId"),
                "version": versions.get("unraid"),
                "uptime": os_info.get("uptime"),
            }
        else:
            _escalate("degraded")
            issues.append("Unable to retrieve system info")

        # Array
        array_info = data.get("array") or {}
        if array_info:
            state = array_info.get("state") or "unknown"
            array_ok = state in ("STARTED", "STOPPED")
            health_info["array_status"] = {
                "state": state,
                "healthy": array_ok,
            }
            if not array_ok:
                _escalate("warning")
                issues.append(f"Array in unexpected state: {state}")
        else:
            _escalate("warning")
            issues.append("Unable to retrieve array status")

        # Notifications
        notifications = data.get("notifications") or {}
        overview = notifications.get("overview") or {}
        if overview:
            unread = overview.get("unread") or {}
            # A null count is treated as zero so the comparison below
            # cannot fail on None.
            alerts = unread.get("alert") or 0
            health_info["notifications"] = {
                "unread_total": unread.get("total") or 0,
                "unread_alerts": alerts,
                "unread_warnings": unread.get("warning") or 0,
            }
            if alerts > 0:
                _escalate("warning")
                issues.append(f"{alerts} unread alert(s)")

        # Docker
        docker = data.get("docker") or {}
        containers = docker.get("containers") or []
        if containers:
            # Compare case-insensitively so enum-style values such as
            # "RUNNING" / "EXITED" are counted as well as lower-case ones.
            states = [str((c or {}).get("state") or "").lower() for c in containers]
            health_info["docker_services"] = {
                "total": len(containers),
                "running": states.count("running"),
                "stopped": states.count("exited"),
            }

        # Latency assessment
        if api_latency > 10000:
            _escalate("degraded")
            issues.append(f"Very high API latency: {api_latency}ms")
        elif api_latency > 5000:
            _escalate("warning")
            issues.append(f"High API latency: {api_latency}ms")

        # Resolve final status from severity level
        severity_to_status = {rank: name for name, rank in _SEVERITY.items()}
        health_info["status"] = severity_to_status.get(health_severity, "healthy")
        if issues:
            health_info["issues"] = issues

        health_info["performance"] = {
            "api_response_time_ms": api_latency,
            "check_duration_ms": round((time.perf_counter() - start_time) * 1000, 2),
        }
        return health_info

    except Exception as e:
        # Intentionally broad: health checks must always return a result,
        # even on unexpected failures, so callers never get an unhandled exception.
        logger.error(f"Health check failed: {e}")
        return {
            "status": "unhealthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "error": str(e),
            "server": _server_info(),
        }
def _analyze_subscription_status(
status: dict[str, Any],
) -> tuple[int, list[dict[str, Any]]]:
"""Analyze subscription status dict, returning error count and connection issues.
This is the canonical implementation of subscription status analysis.
TODO: subscriptions/diagnostics.py has a similar status-analysis pattern
in diagnose_subscriptions(). That module could import and call this helper
directly to avoid divergence. See Code-H05.
Args:
status: Dict of subscription name -> status info from get_subscription_status().
Returns:
Tuple of (error_count, connection_issues_list).
"""
error_count = 0
connection_issues: list[dict[str, Any]] = []
for sub_name, sub_status in status.items():
runtime = sub_status.get("runtime", {})
conn_state = runtime.get("connection_state", "unknown")
if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
error_count += 1
if runtime.get("last_error"):
connection_issues.append(
{
"subscription": sub_name,
"state": conn_state,
"error": runtime["last_error"],
}
)
return error_count, connection_issues
async def _diagnose_subscriptions() -> dict[str, Any]:
"""Import and run subscription diagnostics."""
try:
from ..subscriptions.manager import subscription_manager
from ..subscriptions.resources import ensure_subscriptions_started
await ensure_subscriptions_started()
status = await subscription_manager.get_subscription_status()
error_count, connection_issues = _analyze_subscription_status(status)
return {
"timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
"environment": {
"auto_start_enabled": subscription_manager.auto_start_enabled,
"max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
"api_url_configured": bool(UNRAID_API_URL),
},
"subscriptions": status,
"summary": {
"total_configured": len(subscription_manager.subscription_configs),
"active_count": len(subscription_manager.active_subscriptions),
"with_data": len(subscription_manager.resource_data),
"in_error_state": error_count,
"connection_issues": connection_issues,
},
}
except ImportError:
return {
"error": "Subscription modules not available",
"timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
}
except Exception as e:
raise ToolError(f"Failed to generate diagnostics: {e!s}") from e