mirror of
https://github.com/jmagar/unraid-mcp.git
synced 2026-03-02 00:04:45 -08:00
Addresses issues found by 4 parallel review agents (code-reviewer,
silent-failure-hunter, type-design-analyzer, pr-test-analyzer).
Source fixes:
- core/utils.py: add public safe_display_url() (moved from tools/health.py)
- core/client.py: rename _redact_sensitive → redact_sensitive (public API)
- core/types.py: add SubscriptionData.__post_init__ for tz-aware datetime
enforcement; remove 6 unused type aliases (SystemHealth, APIResponse, etc.)
- subscriptions/manager.py: add exc_info=True to both except-Exception blocks;
add except ValueError break-on-config-error before retry loop; import
redact_sensitive by new public name
- subscriptions/resources.py: re-raise in autostart_subscriptions() so
ensure_subscriptions_started() doesn't permanently set _subscriptions_started
- subscriptions/diagnostics.py: except ToolError: raise before broad except;
use safe_display_url() instead of raw URL slice
- tools/health.py: move _safe_display_url to core/utils; add exc_info=True;
raise ToolError (not return dict) on ImportError
- tools/info.py: use get_args(INFO_ACTIONS) instead of INFO_ACTIONS.__args__
- tools/{array,docker,keys,notifications,rclone,storage,virtualization}.py:
add Literal-vs-ALL_ACTIONS sync check at import time
Test fixes:
- test_health.py: import safe_display_url from core.utils; update
test_diagnose_import_error_internal to expect ToolError (not error dict)
- test_storage.py: add 3 safe_get tests for zero/False/empty-string values
- test_subscription_manager.py: add TestCapLogContentSingleMassiveLine (2 tests)
- test_client.py: rename _redact_sensitive → redact_sensitive; add tests for
new sensitive keys and is_cacheable explicit-keyword form
289 lines
10 KiB
Python
289 lines
10 KiB
Python
"""Health monitoring and diagnostics.
|
|
|
|
Provides the `unraid_health` tool with 3 actions for system health checks,
|
|
connection testing, and subscription diagnostics.
|
|
"""
|
|
|
|
import datetime
|
|
import time
|
|
from typing import Any, Literal, get_args
|
|
|
|
from fastmcp import FastMCP
|
|
|
|
from ..config.logging import logger
|
|
from ..config.settings import (
|
|
UNRAID_API_URL,
|
|
UNRAID_MCP_HOST,
|
|
UNRAID_MCP_PORT,
|
|
UNRAID_MCP_TRANSPORT,
|
|
VERSION,
|
|
)
|
|
from ..core.client import make_graphql_request
|
|
from ..core.exceptions import ToolError, tool_error_handler
|
|
from ..core.utils import safe_display_url
|
|
|
|
|
|
# Literal form of the supported actions; used to annotate the tool's
# ``action`` parameter.
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]

# Runtime set of the same action names; used for validation inside the tool.
ALL_ACTIONS = {"check", "test_connection", "diagnose"}

# Import-time guard: the Literal and the runtime set must name exactly the
# same actions, otherwise the module refuses to load.
if ALL_ACTIONS.symmetric_difference(get_args(HEALTH_ACTIONS)):
    _missing = ALL_ACTIONS.difference(get_args(HEALTH_ACTIONS))
    _extra = set(get_args(HEALTH_ACTIONS)).difference(ALL_ACTIONS)
    raise RuntimeError(
        "HEALTH_ACTIONS and ALL_ACTIONS are out of sync. "
        f"Missing in HEALTH_ACTIONS: {_missing}; extra in HEALTH_ACTIONS: {_extra}"
    )

# Severity ordering: only upgrade, never downgrade
_SEVERITY = dict(healthy=0, warning=1, degraded=2, unhealthy=3)
|
|
|
|
|
|
def _server_info() -> dict[str, Any]:
    """Build the server-identity block embedded in health responses.

    Returns:
        A dict with the fixed server name under ``name`` plus the
        ``VERSION``, ``UNRAID_MCP_TRANSPORT``, ``UNRAID_MCP_HOST`` and
        ``UNRAID_MCP_PORT`` settings under ``version``, ``transport``,
        ``host`` and ``port`` respectively.
    """
    return dict(
        name="Unraid MCP Server",
        version=VERSION,
        transport=UNRAID_MCP_TRANSPORT,
        host=UNRAID_MCP_HOST,
        port=UNRAID_MCP_PORT,
    )
|
|
|
|
|
|
def register_health_tool(mcp: FastMCP) -> None:
    """Register the unraid_health tool with the FastMCP instance.

    Args:
        mcp: FastMCP server instance; the tool is attached through its
            ``tool()`` decorator.
    """

    @mcp.tool()
    async def unraid_health(
        action: HEALTH_ACTIONS,
    ) -> dict[str, Any]:
        """Monitor Unraid MCP server and system health.

        Actions:
            check - Comprehensive health check (API latency, array, notifications, Docker)
            test_connection - Quick connectivity test (just checks { online })
            diagnose - Subscription system diagnostics
        """
        # Runtime validation in addition to the Literal annotation on ``action``.
        if action not in ALL_ACTIONS:
            raise ToolError(f"Invalid action '{action}'. Must be one of: {sorted(ALL_ACTIONS)}")

        with tool_error_handler("health", action, logger):
            logger.info(f"Executing unraid_health action={action}")

            if action == "test_connection":
                # perf_counter() is monotonic, so the measured latency cannot
                # go negative or jump when the system clock is adjusted.
                start = time.perf_counter()
                data = await make_graphql_request("query { online }")
                latency = round((time.perf_counter() - start) * 1000, 2)
                return {
                    "status": "connected",
                    "online": data.get("online"),
                    "latency_ms": latency,
                }

            if action == "check":
                return await _comprehensive_check()

            if action == "diagnose":
                return await _diagnose_subscriptions()

            # Only reachable if ALL_ACTIONS gains an entry with no branch above.
            raise ToolError(f"Unhandled action '{action}' — this is a bug")

    logger.info("Health tool registered successfully")
|
|
|
|
|
|
async def _comprehensive_check() -> dict[str, Any]:
    """Run comprehensive health check against the Unraid system.

    Issues one GraphQL query covering system info, array state, unread
    notifications and Docker containers, then folds the findings into a
    single status. Severity only ever escalates (ordering from ``_SEVERITY``),
    so a later, milder finding cannot mask an earlier, worse one.

    Returns:
        A dict that always contains ``status``, ``timestamp`` and ``server``.
        When the query returns data it also carries ``api_latency_ms``,
        ``performance``, a detail section for each part of the response that
        was present, and ``issues`` when any were recorded. An empty response
        yields status ``"unhealthy"`` with a single issue. If anything raises,
        the dict has status ``"unhealthy"`` and an ``error`` message instead
        of propagating the exception.
    """
    # perf_counter() is monotonic; time.time() can jump when the wall clock is
    # adjusted and would then report bogus (even negative) latencies.
    start_time = time.perf_counter()
    health_severity = 0  # Track as int to prevent downgrade
    issues: list[str] = []

    def _escalate(level: str) -> None:
        """Raise the tracked severity to ``level`` if that is worse."""
        nonlocal health_severity
        health_severity = max(health_severity, _SEVERITY.get(level, 0))

    try:
        query = """
        query ComprehensiveHealthCheck {
            info {
                machineId time
                versions { unraid }
                os { uptime }
            }
            array { state }
            notifications {
                overview { unread { alert warning total } }
            }
            docker {
                containers { id state status }
            }
        }
        """
        data = await make_graphql_request(query)
        api_latency = round((time.perf_counter() - start_time) * 1000, 2)

        health_info: dict[str, Any] = {
            "status": "healthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "api_latency_ms": api_latency,
            "server": _server_info(),
        }

        if not data:
            health_info["status"] = "unhealthy"
            health_info["issues"] = ["No response from Unraid API"]
            return health_info

        # NOTE: nested lookups use ``or {}`` / ``or 0`` instead of a ``.get``
        # default because a key that is present with an explicit null value
        # bypasses the default and would otherwise raise on the next access.

        # System info
        info = data.get("info") or {}
        if info:
            health_info["unraid_system"] = {
                "status": "connected",
                "url": safe_display_url(UNRAID_API_URL),
                "machine_id": info.get("machineId"),
                "version": (info.get("versions") or {}).get("unraid"),
                "uptime": (info.get("os") or {}).get("uptime"),
            }
        else:
            _escalate("degraded")
            issues.append("Unable to retrieve system info")

        # Array
        array_info = data.get("array") or {}
        if array_info:
            state = array_info.get("state", "unknown")
            state_ok = state in ("STARTED", "STOPPED")
            health_info["array_status"] = {
                "state": state,
                "healthy": state_ok,
            }
            if not state_ok:
                _escalate("warning")
                issues.append(f"Array in unexpected state: {state}")
        else:
            _escalate("warning")
            issues.append("Unable to retrieve array status")

        # Notifications
        overview = (data.get("notifications") or {}).get("overview")
        if overview:
            unread = overview.get("unread") or {}
            alerts = unread.get("alert") or 0
            health_info["notifications"] = {
                "unread_total": unread.get("total", 0),
                "unread_alerts": alerts,
                "unread_warnings": unread.get("warning", 0),
            }
            if alerts > 0:
                _escalate("warning")
                issues.append(f"{alerts} unread alert(s)")

        # Docker
        containers = (data.get("docker") or {}).get("containers")
        if containers:
            # Compare case-insensitively so that both "running" and an
            # enum-style "RUNNING" are counted.
            states = [str(c.get("state") or "").lower() for c in containers]
            health_info["docker_services"] = {
                "total": len(containers),
                "running": states.count("running"),
                "stopped": states.count("exited"),
            }

        # Latency assessment
        if api_latency > 10000:
            _escalate("degraded")
            issues.append(f"Very high API latency: {api_latency}ms")
        elif api_latency > 5000:
            _escalate("warning")
            issues.append(f"High API latency: {api_latency}ms")

        # Resolve final status from severity level
        severity_to_status = {v: k for k, v in _SEVERITY.items()}
        health_info["status"] = severity_to_status.get(health_severity, "healthy")
        if issues:
            health_info["issues"] = issues
        health_info["performance"] = {
            "api_response_time_ms": api_latency,
            "check_duration_ms": round((time.perf_counter() - start_time) * 1000, 2),
        }

        return health_info

    except Exception as e:
        # Intentionally broad: health checks must always return a result,
        # even on unexpected failures, so callers never get an unhandled exception.
        logger.error(f"Health check failed: {e}", exc_info=True)
        return {
            "status": "unhealthy",
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "error": str(e),
            "server": _server_info(),
        }
|
|
|
|
|
|
def _analyze_subscription_status(
    status: dict[str, Any],
) -> tuple[int, list[dict[str, Any]]]:
    """Analyze subscription status dict, returning error count and connection issues.

    This is the canonical implementation of subscription status analysis.
    TODO: subscriptions/diagnostics.py has a similar status-analysis pattern
    in diagnose_subscriptions(). That module could import and call this helper
    directly to avoid divergence. See Code-H05.

    Args:
        status: Dict of subscription name -> status info from get_subscription_status().
            Each value is read through its ``runtime`` sub-dict
            (``connection_state`` and ``last_error`` keys). A missing or
            ``None`` entry or ``runtime`` is treated as "no runtime info".

    Returns:
        Tuple of (error_count, connection_issues_list). ``error_count`` is the
        number of subscriptions whose connection state is one of ``error``,
        ``auth_failed``, ``timeout`` or ``max_retries_exceeded``. Each
        connection issue is a dict with ``subscription``, ``state`` and
        ``error`` keys, emitted for subscriptions reporting a ``last_error``.
    """
    error_count = 0
    connection_issues: list[dict[str, Any]] = []

    for sub_name, sub_status in status.items():
        # ``or {}`` rather than a ``.get`` default: a key that is present but
        # None would otherwise make the following ``.get`` calls raise.
        runtime = (sub_status or {}).get("runtime") or {}
        conn_state = runtime.get("connection_state", "unknown")
        if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
            error_count += 1
        last_error = runtime.get("last_error")
        if last_error:
            connection_issues.append(
                {
                    "subscription": sub_name,
                    "state": conn_state,
                    "error": last_error,
                }
            )

    return error_count, connection_issues
|
|
|
|
|
|
async def _diagnose_subscriptions() -> dict[str, Any]:
    """Import and run subscription diagnostics.

    The subscription modules are imported inside the function so that an
    import failure is reported as a ToolError instead of propagating as an
    ImportError.

    Returns:
        A dict with ``timestamp``, an ``environment`` section (manager
        settings and whether an API URL is configured), the raw per-subscription
        ``subscriptions`` status, and a ``summary`` section with counts and the
        connection issues found by ``_analyze_subscription_status``.

    Raises:
        ToolError: If the subscription modules cannot be imported, or if
            gathering the diagnostics fails. A ToolError raised by the
            underlying calls propagates unchanged.
    """
    try:
        from ..subscriptions.manager import subscription_manager
        from ..subscriptions.resources import ensure_subscriptions_started

        await ensure_subscriptions_started()

        status = await subscription_manager.get_subscription_status()
        error_count, connection_issues = _analyze_subscription_status(status)

        return {
            "timestamp": datetime.datetime.now(datetime.UTC).isoformat(),
            "environment": {
                "auto_start_enabled": subscription_manager.auto_start_enabled,
                "max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
                "api_url_configured": bool(UNRAID_API_URL),
            },
            "subscriptions": status,
            "summary": {
                "total_configured": len(subscription_manager.subscription_configs),
                "active_count": len(subscription_manager.active_subscriptions),
                "with_data": len(subscription_manager.resource_data),
                "in_error_state": error_count,
                "connection_issues": connection_issues,
            },
        }

    except ToolError:
        # Already a tool-level error: let it through as-is instead of
        # re-wrapping it in a generic "Failed to generate diagnostics" message.
        raise
    except ImportError as e:
        raise ToolError("Subscription modules not available") from e
    except Exception as e:
        raise ToolError(f"Failed to generate diagnostics: {e!s}") from e
|