Files
unraid-mcp/unraid_mcp/tools/health.py
Jacob Magar 2697c269a3 chore: enhance project metadata, tooling, and documentation
**Project Configuration:**
- Enhance pyproject.toml with comprehensive metadata, keywords, and classifiers
- Add LICENSE file (MIT) for proper open-source distribution
- Add PUBLISHING.md with comprehensive publishing guidelines
- Update .gitignore to exclude tool artifacts (.cache, .pytest_cache, .ruff_cache, .ty_cache)
- Ignore documentation working directories (.docs, .full-review, docs/plans, docs/sessions)

**Documentation:**
- Add extensive Unraid API research documentation
  - API source code analysis and resolver mapping
  - Competitive analysis and feature gap assessment
  - Release notes analysis (7.0.0, 7.1.0, 7.2.0)
  - Connect platform overview and remote access documentation
- Document known API patterns, limitations, and edge cases

**Testing & Code Quality:**
- Expand test coverage across all tool modules
- Add destructive action confirmation tests
- Improve test assertions and error case validation
- Refine type annotations for better static analysis

**Tool Improvements:**
- Enhance error handling consistency across all tools
- Improve type safety with explicit type annotations
- Refine GraphQL query construction patterns
- Better handling of optional parameters and edge cases

This commit prepares the project for v0.2.0 release with improved
metadata, comprehensive documentation, and enhanced code quality.

Co-authored-by: Claude <noreply@anthropic.com>
2026-02-15 15:32:09 -05:00

265 lines
9.3 KiB
Python

"""Health monitoring and diagnostics.
Provides the `unraid_health` tool with 3 actions for system health checks,
connection testing, and subscription diagnostics.
"""
import datetime
import time
from typing import Any, Literal
from fastmcp import FastMCP
from ..config.logging import logger
from ..config.settings import (
UNRAID_API_URL,
UNRAID_MCP_HOST,
UNRAID_MCP_PORT,
UNRAID_MCP_TRANSPORT,
VERSION,
)
from ..core.client import make_graphql_request
from ..core.exceptions import ToolError
HEALTH_ACTIONS = Literal["check", "test_connection", "diagnose"]
# Severity ordering: only upgrade, never downgrade
_SEVERITY = {"healthy": 0, "warning": 1, "degraded": 2, "unhealthy": 3}
def register_health_tool(mcp: FastMCP) -> None:
"""Register the unraid_health tool with the FastMCP instance."""
@mcp.tool()
async def unraid_health(
action: HEALTH_ACTIONS,
) -> dict[str, Any]:
"""Monitor Unraid MCP server and system health.
Actions:
check - Comprehensive health check (API latency, array, notifications, Docker)
test_connection - Quick connectivity test (just checks { online })
diagnose - Subscription system diagnostics
"""
if action not in ("check", "test_connection", "diagnose"):
raise ToolError(
f"Invalid action '{action}'. Must be one of: check, test_connection, diagnose"
)
try:
logger.info(f"Executing unraid_health action={action}")
if action == "test_connection":
start = time.time()
data = await make_graphql_request("query { online }")
latency = round((time.time() - start) * 1000, 2)
return {
"status": "connected",
"online": data.get("online"),
"latency_ms": latency,
}
if action == "check":
return await _comprehensive_check()
if action == "diagnose":
return await _diagnose_subscriptions()
raise ToolError(f"Unhandled action '{action}' — this is a bug")
except ToolError:
raise
except Exception as e:
logger.error(f"Error in unraid_health action={action}: {e}", exc_info=True)
raise ToolError(f"Failed to execute health/{action}: {str(e)}") from e
logger.info("Health tool registered successfully")
async def _comprehensive_check() -> dict[str, Any]:
"""Run comprehensive health check against the Unraid system."""
start_time = time.time()
health_severity = 0 # Track as int to prevent downgrade
issues: list[str] = []
def _escalate(level: str) -> None:
nonlocal health_severity
health_severity = max(health_severity, _SEVERITY.get(level, 0))
try:
query = """
query ComprehensiveHealthCheck {
info {
machineId time
versions { unraid }
os { uptime }
}
array { state }
notifications {
overview { unread { alert warning total } }
}
docker {
containers(skipCache: true) { id state status }
}
}
"""
data = await make_graphql_request(query)
api_latency = round((time.time() - start_time) * 1000, 2)
health_info: dict[str, Any] = {
"status": "healthy",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"api_latency_ms": api_latency,
"server": {
"name": "Unraid MCP Server",
"version": VERSION,
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT,
},
}
if not data:
health_info["status"] = "unhealthy"
health_info["issues"] = ["No response from Unraid API"]
return health_info
# System info
info = data.get("info", {})
if info:
health_info["unraid_system"] = {
"status": "connected",
"url": UNRAID_API_URL,
"machine_id": info.get("machineId"),
"version": info.get("versions", {}).get("unraid"),
"uptime": info.get("os", {}).get("uptime"),
}
else:
_escalate("degraded")
issues.append("Unable to retrieve system info")
# Array
array_info = data.get("array", {})
if array_info:
state = array_info.get("state", "unknown")
health_info["array_status"] = {
"state": state,
"healthy": state in ("STARTED", "STOPPED"),
}
if state not in ("STARTED", "STOPPED"):
_escalate("warning")
issues.append(f"Array in unexpected state: {state}")
else:
_escalate("warning")
issues.append("Unable to retrieve array status")
# Notifications
notifications = data.get("notifications", {})
if notifications and notifications.get("overview"):
unread = notifications["overview"].get("unread", {})
alerts = unread.get("alert", 0)
health_info["notifications"] = {
"unread_total": unread.get("total", 0),
"unread_alerts": alerts,
"unread_warnings": unread.get("warning", 0),
}
if alerts > 0:
_escalate("warning")
issues.append(f"{alerts} unread alert(s)")
# Docker
docker = data.get("docker", {})
if docker and docker.get("containers"):
containers = docker["containers"]
health_info["docker_services"] = {
"total": len(containers),
"running": len([c for c in containers if c.get("state") == "running"]),
"stopped": len([c for c in containers if c.get("state") == "exited"]),
}
# Latency assessment
if api_latency > 10000:
_escalate("degraded")
issues.append(f"Very high API latency: {api_latency}ms")
elif api_latency > 5000:
_escalate("warning")
issues.append(f"High API latency: {api_latency}ms")
# Resolve final status from severity level
severity_to_status = {v: k for k, v in _SEVERITY.items()}
health_info["status"] = severity_to_status.get(health_severity, "healthy")
if issues:
health_info["issues"] = issues
health_info["performance"] = {
"api_response_time_ms": api_latency,
"check_duration_ms": round((time.time() - start_time) * 1000, 2),
}
return health_info
except Exception as e:
logger.error(f"Health check failed: {e}")
return {
"status": "unhealthy",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"error": str(e),
"server": {
"name": "Unraid MCP Server",
"version": VERSION,
"transport": UNRAID_MCP_TRANSPORT,
"host": UNRAID_MCP_HOST,
"port": UNRAID_MCP_PORT,
},
}
async def _diagnose_subscriptions() -> dict[str, Any]:
"""Import and run subscription diagnostics."""
try:
from ..subscriptions.manager import subscription_manager
from ..subscriptions.resources import ensure_subscriptions_started
await ensure_subscriptions_started()
status = subscription_manager.get_subscription_status()
connection_issues: list[dict[str, Any]] = []
diagnostic_info: dict[str, Any] = {
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
"environment": {
"auto_start_enabled": subscription_manager.auto_start_enabled,
"max_reconnect_attempts": subscription_manager.max_reconnect_attempts,
"api_url_configured": bool(UNRAID_API_URL),
},
"subscriptions": status,
"summary": {
"total_configured": len(subscription_manager.subscription_configs),
"active_count": len(subscription_manager.active_subscriptions),
"with_data": len(subscription_manager.resource_data),
"in_error_state": 0,
"connection_issues": connection_issues,
},
}
for sub_name, sub_status in status.items():
runtime = sub_status.get("runtime", {})
conn_state = runtime.get("connection_state", "unknown")
if conn_state in ("error", "auth_failed", "timeout", "max_retries_exceeded"):
diagnostic_info["summary"]["in_error_state"] += 1
if runtime.get("last_error"):
connection_issues.append({
"subscription": sub_name,
"state": conn_state,
"error": runtime["last_error"],
})
return diagnostic_info
except ImportError:
return {
"error": "Subscription modules not available",
"timestamp": datetime.datetime.now(datetime.timezone.utc).isoformat(),
}
except Exception as e:
raise ToolError(f"Failed to generate diagnostics: {str(e)}") from e