mirror of
https://github.com/jmagar/unraid-mcp.git
synced 2026-03-01 16:04:24 -08:00
Addresses all critical, high, medium, and low issues from full codebase review. 494 tests pass, ruff clean, ty type-check clean. Security: - Add tool_error_handler context manager (exceptions.py) — standardised error handling, eliminates 11 bare except-reraise patterns - Remove unused exception subclasses (ConfigurationError, UnraidAPIError, SubscriptionError, ValidationError, IdempotentOperationError) - Harden GraphQL subscription query validator with allow-list and forbidden-keyword regex (diagnostics.py) - Add input validation for rclone create_remote config_data: injection, path-traversal, and key-count limits (rclone.py) - Validate notifications importance enum before GraphQL request (notifications.py) - Sanitise HTTP/network/JSON error messages — no raw exception strings leaked to clients (client.py) - Strip path/creds from displayed API URL via _safe_display_url (health.py) - Enable Ruff S (bandit) rule category in pyproject.toml - Harden container mutations to strict-only matching — no fuzzy/substring for destructive operations (docker.py) Performance: - Token-bucket rate limiter (90 tokens, 9 req/s) with 429 retry backoff (client.py) - Lazy asyncio.Lock init via _get_client_lock() — fixes event-loop module-load crash (client.py) - Double-checked locking in get_http_client() for fast-path (client.py) - Short hex container ID fast-path skips list fetch (docker.py) - Cap resource_data log content to 1 MB / 5,000 lines (manager.py) - Reset reconnect counter after 30 s stable connection (manager.py) - Move tail_lines validation to module level; enforce 10,000 line cap (storage.py, docker.py) - force_terminal=True removed from logging RichHandler (logging.py) Architecture: - Register diagnostic tools in server startup (server.py) - Move ALL_ACTIONS computation to module level in all tools - Consolidate format_kb / format_bytes into shared core/utils.py - Add _safe_get() helper in core/utils.py for nested dict traversal - Extract _analyze_subscription_status() 
from health.py diagnose handler - Validate required config at startup — fail fast with CRITICAL log (server.py) Code quality: - Remove ~90 lines of dead Rich formatting helpers from logging.py - Remove dead self.websocket attribute from SubscriptionManager - Remove dead setup_uvicorn_logging() wrapper - Move _VALID_IMPORTANCE to module level (N806 fix) - Add slots=True to all three dataclasses (SubscriptionData, SystemHealth, APIResponse) - Fix None rendering as literal "None" string in info.py summaries - Change fuzzy-match log messages from INFO to DEBUG (docker.py) - UTC-aware datetimes throughout (manager.py, diagnostics.py) Infrastructure: - Upgrade base image python:3.11-slim → python:3.12-slim (Dockerfile) - Add non-root appuser (UID/GID 1000) with HEALTHCHECK (Dockerfile) - Add read_only, cap_drop: ALL, tmpfs /tmp to docker-compose.yml - Single-source version via importlib.metadata (pyproject.toml → __init__.py) - Add open_timeout to all websockets.connect() calls Tests: - Update error message matchers to match sanitised messages (test_client.py) - Fix patch targets for UNRAID_API_URL → utils module (test_subscriptions.py) - Fix importance="info" → importance="normal" (test_notifications.py, http_layer) - Fix naive datetime fixtures → UTC-aware (test_subscriptions.py) Co-authored-by: Claude <claude@anthropic.com>
303 lines
10 KiB
Python
303 lines
10 KiB
Python
"""Health monitoring and diagnostics.
|
|
|
|
Provides the `unraid_health` tool with 3 actions for system health checks,
|
|
connection testing, and subscription diagnostics.
|
|
"""
|
|
|
|
import datetime
import time
from typing import Any, Literal
from urllib.parse import urlparse

from fastmcp import FastMCP

from ..config.logging import logger
from ..config.settings import (
    UNRAID_API_URL,
    UNRAID_MCP_HOST,
    UNRAID_MCP_PORT,
    UNRAID_MCP_TRANSPORT,
    VERSION,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError, tool_error_handler
|
|
|
|
|
|
def _safe_display_url(url: str | None) -> str | None:
|
|
"""Return a redacted URL showing only scheme + host + port.
|
|
|
|
Strips path, query parameters, credentials, and fragments to avoid
|
|
leaking internal network topology or embedded secrets (CWE-200).
|
|
"""
|
|
if not url:
|
|
return None
|
|
try:
|
|
parsed = urlparse(url)
|
|
host = parsed.hostname or "unknown"
|
|
if parsed.port:
|
|
return f"{parsed.scheme}://{host}:{parsed.port}"
|
|
return f"{parsed.scheme}://{host}"
|
|
except Exception:
|
|
# If parsing fails, show nothing rather than leaking the raw URL
|
|
return "<unparseable>"
|
|
|
|
|
|
# Complete set of actions accepted by the unraid_health tool; used as the
# runtime validation allow-list for the `action` argument.
ALL_ACTIONS = {"check", "test_connection", "diagnose"}

# Literal alias mirroring ALL_ACTIONS so schema generation and type
# checkers see the valid action strings.
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]

# Severity ordering: only upgrade, never downgrade
_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}
|
|
|
|
|
|
def _server_info() -> dict[str, Any]:
    """Build the server-identification block embedded in every health response."""
    return dict(
        name="Unraid MCP Server",
        version=VERSION,
        transport=UNRAID_MCP_TRANSPORT,
        host=UNRAID_MCP_HOST,
        port=UNRAID_MCP_PORT,
    )
|
|
|
|
|
|
def register_health_tool(mcp: FastMCP) -> None:
    """Register the unraid_health tool with the FastMCP instance.

    Args:
        mcp: FastMCP server instance the tool is attached to.
    """

    @mcp.tool()
    async def unraid_health(
        action: HEALTH_ACTIONS,
    ) -> dict[str, Any]:
        """Monitor Unraid MCP server and system health.

        Actions:
            check - Comprehensive health check (API latency, array, notifications, Docker)
            test_connection - Quick connectivity test (just checks { online })
            diagnose - Subscription system diagnostics
        """
        # Runtime guard: Literal typing is advisory only, so validate explicitly.
        if action not in ALL_ACTIONS:
            raise ToolError(f"Invalid action '{action}'. Must be one of: {sorted(ALL_ACTIONS)}")

        with tool_error_handler("health", action, logger):
            logger.info(f"Executing unraid_health action={action}")

            if action == "test_connection":
                # perf_counter() is monotonic; time.time() can jump under
                # NTP adjustments and skew (or negate) the reported latency.
                start = time.perf_counter()
                data = await make_graphql_request("query { online }")
                latency = round((time.perf_counter() - start) * 1000, 2)
                return {
                    "status": "connected",
                    "online": data.get("online"),
                    "latency_ms": latency,
                }

            if action == "check":
                return await _comprehensive_check()

            if action == "diagnose":
                return await _diagnose_subscriptions()

            # Unreachable if ALL_ACTIONS and the branches above stay in sync.
            raise ToolError(f"Unhandled action '{action}' — this is a bug")

    logger.info("Health tool registered successfully")
|
|
|
|
|
|
async def _comprehensive_check() -> dict[str, Any]:
    """Run comprehensive health check against the Unraid system.

    Fetches system info, array state, notification counts, and Docker
    containers in a single GraphQL round-trip, then derives an overall
    status (healthy/warning/degraded/unhealthy). Never raises: on any
    failure a dict with status "unhealthy" and the error text is returned.

    Returns:
        Health report dict; always contains "status", "timestamp", and
        "server" keys, plus per-subsystem sections when data is available.
    """
    # perf_counter() is monotonic; time.time() can jump under NTP
    # adjustments and produce negative or inflated latency figures.
    start_time = time.perf_counter()
    health_severity = 0  # Track as int to prevent downgrade
    issues: list[str] = []

    def _escalate(level: str) -> None:
        """Raise the overall severity to *level* if it is worse than current."""
        nonlocal health_severity
        health_severity = max(health_severity, _SEVERITY.get(level, 0))

    try:
        query = """
        query ComprehensiveHealthCheck {
            info {
                machineId time
                versions { unraid }
                os { uptime }
            }
            array { state }
            notifications {
                overview { unread { alert warning total } }
            }
            docker {
                containers { id state status }
            }
        }
        """
        data = await make_graphql_request(query)
        api_latency = round((time.perf_counter() - start_time) * 1000, 2)

        health_info: dict[str, Any] = {
            "status": "healthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "api_latency_ms": api_latency,
            "server": _server_info(),
        }

        if not data:
            health_info["status"] = "unhealthy"
            health_info["issues"] = ["No response from Unraid API"]
            return health_info

        # System info
        info = data.get("info", {})
        if info:
            health_info["unraid_system"] = {
                "status": "connected",
                "url": _safe_display_url(UNRAID_API_URL),
                "machine_id": info.get("machineId"),
                "version": info.get("versions", {}).get("unraid"),
                "uptime": info.get("os", {}).get("uptime"),
            }
        else:
            _escalate("degraded")
            issues.append("Unable to retrieve system info")

        # Array: any state other than cleanly STARTED/STOPPED is flagged.
        array_info = data.get("array", {})
        if array_info:
            state = array_info.get("state", "unknown")
            health_info["array_status"] = {
                "state": state,
                "healthy": state in ("STARTED", "STOPPED"),
            }
            if state not in ("STARTED", "STOPPED"):
                _escalate("warning")
                issues.append(f"Array in unexpected state: {state}")
        else:
            _escalate("warning")
            issues.append("Unable to retrieve array status")

        # Notifications: unread alerts escalate overall status to warning.
        notifications = data.get("notifications", {})
        if notifications and notifications.get("overview"):
            unread = notifications["overview"].get("unread", {})
            alerts = unread.get("alert", 0)
            health_info["notifications"] = {
                "unread_total": unread.get("total", 0),
                "unread_alerts": alerts,
                "unread_warnings": unread.get("warning", 0),
            }
            if alerts > 0:
                _escalate("warning")
                issues.append(f"{alerts} unread alert(s)")

        # Docker container counts (informational; no severity impact).
        docker = data.get("docker", {})
        if docker and docker.get("containers"):
            containers = docker["containers"]
            health_info["docker_services"] = {
                "total": len(containers),
                "running": len([c for c in containers if c.get("state") == "running"]),
                "stopped": len([c for c in containers if c.get("state") == "exited"]),
            }

        # Latency assessment
        if api_latency > 10000:
            _escalate("degraded")
            issues.append(f"Very high API latency: {api_latency}ms")
        elif api_latency > 5000:
            _escalate("warning")
            issues.append(f"High API latency: {api_latency}ms")

        # Resolve final status from severity level
        severity_to_status = {v: k for k, v in _SEVERITY.items()}
        health_info["status"] = severity_to_status.get(health_severity, "healthy")
        if issues:
            health_info["issues"] = issues
        health_info["performance"] = {
            "api_response_time_ms": api_latency,
            "check_duration_ms": round((time.perf_counter() - start_time) * 1000, 2),
        }

        return health_info

    except Exception as e:
        # Intentionally broad: health checks must always return a result,
        # even on unexpected failures, so callers never get an unhandled exception.
        logger.error(f"Health check failed: {e}")
        return {
            "status": "unhealthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "error": str(e),
            "server": _server_info(),
        }
|
|
|
|
|
|
def _analyze_subscription_status(
|
|
status: dict[str, Any],
|
|
) -> tuple[int, list[dict[str, Any]]]:
|
|
"""Analyze subscription status dict, returning error count and connection issues.
|
|
|
|
This is the canonical implementation of subscription status analysis.
|
|
TODO: subscriptions/diagnostics.py (lines 168-182) duplicates this logic.
|
|
That module should be refactored to call this helper once file ownership
|
|
allows cross-agent edits. See Code-H05.
|
|
|
|
Args:
|
|
status: Dict of subscription name -> status info from get_subscription_status().
|
|
|
|
Returns:
|
|
Tuple of (error_count, connection_issues_list).
|
|
"""
|
|
error_count = 0
|
|
connection_issues: list[dict[str, Any]] = []
|
|
|
|
for sub_name, sub_status in status.items():
|
|
runtime = sub_status.get("runtime", {})
|
|
conn_state = runtime.get("connection_state", "unknown")
|
|
if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
|
|
error_count += 1
|
|
if runtime.get("last_error"):
|
|
connection_issues.append(
|
|
{
|
|
"subscription": sub_name,
|
|
"state": conn_state,
|
|
"error": runtime["last_error"],
|
|
}
|
|
)
|
|
|
|
return error_count, connection_issues
|
|
|
|
|
|
async def _diagnose_subscriptions() -> dict[str, Any]:
    """Import and run subscription diagnostics."""
    try:
        # Imported lazily so the health tool still works when the
        # subscription modules are unavailable.
        from ..subscriptions.manager import subscription_manager
        from ..subscriptions.resources import ensure_subscriptions_started

        await ensure_subscriptions_started()

        status = await subscription_manager.get_subscription_status()
        error_count, connection_issues = _analyze_subscription_status(status)

        environment = {
            "auto_start_enabled": subscription_manager.auto_start_enabled,
            "max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
            "api_url_configured": bool(UNRAID_API_URL),
        }
        summary = {
            "total_configured": len(subscription_manager.subscription_configs),
            "active_count": len(subscription_manager.active_subscriptions),
            "with_data": len(subscription_manager.resource_data),
            "in_error_state": error_count,
            "connection_issues": connection_issues,
        }

        return {
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "environment": environment,
            "subscriptions": status,
            "summary": summary,
        }

    except ImportError:
        return {
            "error": "Subscription modules not available",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
        }
    except Exception as e:
        raise ToolError(f"Failed to generate diagnostics: {e!s}") from e
|