#!/bin/bash
#
# homelab-sentinel-health-quick.sh
# Quick terminal pulse check for Docker services in Nathan's homelab
# Validates uptime stability, resource pressure, and network exposure
#
# Requires: bash 4+ (associative arrays), docker, jq

# Color codes for output.
# The values keep a literal backslash, so the escape is only interpreted when
# the variable is printed with `echo -e` (or printf's %b).
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # No Color

# Resource thresholds (from generic_host_conversational.yml)
MAX_SAFE_RAM_MB=16384 # 16GB, compared against container memory in MiB
MAX_SAFE_CPU_CORES=8  # multiplied by 10 to get the CPU % warning threshold

# VLAN definitions from ansible/group_vars/all.yml
# Maps a VLAN key to its IPv4 CIDR.
declare -A VLAN_CIDRS=(
  ["main"]="10.0.0.0/24"
  ["infra"]="10.0.10.0/24"
  ["iot"]="10.0.50.0/24"
  ["guest"]="10.0.30.0/24"
  ["compute"]="10.0.200.0/24"
)

# Zone definitions
# Maps a zone name to the VLAN_CIDRS key of the VLAN that hosts it.
declare -A ZONES=(
  ["core"]="main"
  ["infrastructure"]="infra"
  ["iot"]="iot"
  ["guest"]="guest"
  ["compute"]="compute"
)

#######################################
# Help function: print the command-line synopsis and terminate the script.
# Arguments: none ($0 is used as the program name)
# Outputs:   usage text on stdout
# Returns:   never returns; exits with status 1
#######################################
usage() {
  printf '%s\n' \
    "Usage: $0 <service_name>" \
    " Performs a quick health check on a Docker service" \
    "" \
    "Arguments:" \
    " service_name Name of the Docker service to check" \
    "" \
    "Examples:" \
    " $0 watchtower" \
    " $0 prometheus"
  exit 1
}

#######################################
# Validate that a service name follows the <owner>-<role>-<node> convention.
# The name must consist of at least three hyphen-separated parts; owner and
# role must be non-empty and contain no hyphen, node must start with a
# non-hyphen character (it may contain further hyphens).
# Globals:   YELLOW, NC (read)
# Arguments: $1 - service name to validate
# Outputs:   a warning line on stdout when the name does not conform
# Returns:   0 when the name conforms, 1 otherwise
#######################################
validate_service_naming() {
  local service_name=$1
  # Anchored so that empty segments ("--", "a--b", "a-b-") are rejected.
  local pattern='^[^-]+-[^-]+-[^-].*$'

  if [[ ! "$service_name" =~ $pattern ]]; then
    # %b expands the color escapes; %s prints the name verbatim (no backslash
    # interpretation of user input).
    printf "%b⚠️ Naming Convention Warning:%b Service '%s' does not follow the <owner>-<role>-<node> naming convention\n" \
      "$YELLOW" "$NC" "$service_name"
    return 1
  fi

  return 0
}

#######################################
# Get zone assignment for a service based on naming convention.
# The role is the second hyphen-separated field of the name; a name without
# any hyphen is treated as the role itself.
# Arguments: $1 - service name (<owner>-<role>-<node>)
# Outputs:   zone name on stdout: infrastructure, compute or iot
# Returns:   0
#######################################
get_service_zone() {
  local service_name=$1
  local role=$service_name

  # Extract role from service name (middle part) without spawning cut.
  if [[ "$service_name" == *-* ]]; then
    role=${service_name#*-} # drop "<owner>-"
    role=${role%%-*}        # drop "-<node>" and anything after it
  fi

  # Map common roles to zones
  case "$role" in
    pve | proxmox | nas | heimdall | watchtower)
      echo "infrastructure"
      ;;
    swarm | ai | compute)
      echo "compute"
      ;;
    controller | omada)
      echo "iot"
      ;;
    *)
      # Default to infrastructure for unknown roles
      echo "infrastructure"
      ;;
  esac
}

#######################################
# Convert a dotted-quad IPv4 address to its 32-bit integer value.
# Arguments: $1 - IPv4 address
# Outputs:   the integer on stdout
# Returns:   0 on success, 1 when $1 is not a valid IPv4 address
#######################################
_ipv4_to_int() {
  local addr=$1
  local o1 o2 o3 o4

  [[ "$addr" =~ ^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$ ]] || return 1
  o1=${BASH_REMATCH[1]}
  o2=${BASH_REMATCH[2]}
  o3=${BASH_REMATCH[3]}
  o4=${BASH_REMATCH[4]}

  # 10# forces base 10 so octets with leading zeros are not parsed as octal.
  if ((10#$o1 > 255 || 10#$o2 > 255 || 10#$o3 > 255 || 10#$o4 > 255)); then
    return 1
  fi

  echo $(((10#$o1 << 24) | (10#$o2 << 16) | (10#$o3 << 8) | 10#$o4))
}

#######################################
# Check if IP is in correct VLAN range.
# Globals:   ZONES (read), VLAN_CIDRS (read)
# Arguments: $1 - IPv4 address to test
#            $2 - zone name (a ZONES key such as "infrastructure") or a VLAN
#                 key (a VLAN_CIDRS key such as "infra")
# Outputs:   "Unknown zone: <zone>" on stdout when $2 cannot be resolved
# Returns:   0 when the address lies inside the zone's CIDR, 1 otherwise
#            (including an unknown zone or a malformed address/CIDR)
#######################################
is_ip_in_correct_vlan() {
  local ip=$1
  local zone=$2
  local vlan=$zone
  local expected_cidr=""

  # Zone names are translated to their VLAN key first; VLAN keys pass through.
  # The -n guards avoid "bad array subscript" on an empty key.
  if [[ -n "$zone" && -n "${ZONES[$zone]:-}" ]]; then
    vlan=${ZONES[$zone]}
  fi

  # Get expected CIDR for zone
  if [[ -n "$vlan" ]]; then
    expected_cidr=${VLAN_CIDRS[$vlan]:-}
  fi

  if [[ -z "$expected_cidr" ]]; then
    echo "Unknown zone: $zone"
    return 1
  fi

  local network=${expected_cidr%/*}
  local prefix=${expected_cidr#*/}
  local ip_int net_int mask

  if [[ ! "$prefix" =~ ^[0-9]+$ ]] || ((10#$prefix > 32)); then
    return 1
  fi
  ip_int=$(_ipv4_to_int "$ip") || return 1
  net_int=$(_ipv4_to_int "$network") || return 1

  # Netmask with the top <prefix> bits set, truncated to 32 bits.
  mask=$(((0xFFFFFFFF << (32 - 10#$prefix)) & 0xFFFFFFFF))

  # Arithmetic command: status 0 when both addresses share the network part.
  (((ip_int & mask) == (net_int & mask)))
}

#######################################
# Fetch a one-shot `docker stats` sample for a container as JSON.
# Arguments: $1 - container name
# Outputs:   the JSON emitted by docker on stdout, or "{}" when docker
#            produced no output (its stderr is discarded)
# Returns:   0
#######################################
parse_docker_stats() {
  local service_name=$1
  local stats_json

  # '{{json .}}' is the Go-template form of JSON output; unlike the
  # `--format json` shorthand it does not depend on a recent Docker CLI.
  stats_json=$(docker stats --no-stream --format '{{json .}}' "$service_name" 2>/dev/null)

  if [[ -z "$stats_json" ]]; then
    echo "{}"
    return
  fi

  printf '%s\n' "$stats_json"
}

#######################################
# Fetch `docker ps -a` records for containers matched by a name filter.
# Arguments: $1 - value for docker's name filter
# Outputs:   the JSON emitted by docker on stdout (one object per matched
#            container), or "{}" when docker produced no output (its stderr
#            is discarded)
# Returns:   0
#######################################
parse_docker_ps() {
  local service_name=$1
  local ps_info

  # '{{json .}}' is the Go-template form of JSON output; unlike the
  # `--format json` shorthand it does not depend on a recent Docker CLI.
  ps_info=$(docker ps -a --filter "name=$service_name" --format '{{json .}}' 2>/dev/null)

  if [[ -z "$ps_info" ]]; then
    echo "{}"
    return
  fi

  printf '%s\n' "$ps_info"
}

# Main health check function
|
|
check_service_health() {
|
|
local service_name=$1
|
|
|
|
echo -e "${BLUE}🔍 Homelab Sentinel Quick Health Check${NC}"
|
|
echo -e "${BLUE}=====================================${NC}"
|
|
echo "Service: $service_name"
|
|
echo ""
|
|
|
|
# Validate service naming
|
|
validate_service_naming "$service_name"
|
|
|
|
# Determine expected zone
|
|
local expected_zone=$(get_service_zone "$service_name")
|
|
local expected_vlan=${ZONES[$expected_zone]}
|
|
echo -e "${BLUE}📍 Expected Zone:${NC} $expected_zone (${VLAN_CIDRS[$expected_vlan]})"
|
|
echo ""
|
|
|
|
# Get docker ps info
|
|
echo -e "${BLUE}📊 Container Status:${NC}"
|
|
docker ps -a --filter "name=$service_name" --format "table {{.Names}}\t{{.Status}}\t{{.RunningFor}}\t{{.Ports}}"
|
|
echo ""
|
|
|
|
# Check uptime stability
|
|
echo -e "${BLUE}⏱️ Uptime Stability:${NC}"
|
|
local ps_output=$(docker ps -a --filter "name=$service_name" --format "{{.Status}}\t{{.RunningFor}}" 2>/dev/null)
|
|
if [ -n "$ps_output" ]; then
|
|
local status=$(echo "$ps_output" | cut -f1)
|
|
local running_for=$(echo "$ps_output" | cut -f2)
|
|
|
|
if [[ "$status" == *"Restarting"* ]]; then
|
|
echo -e "${RED}❌ Unstable:${NC} Container is restarting ($status)"
|
|
elif [[ "$status" == *"Exited"* ]]; then
|
|
echo -e "${YELLOW}⚠️ Stopped:${NC} Container is not running ($status)"
|
|
else
|
|
echo -e "${GREEN}✅ Stable:${NC} Container has been running for $running_for"
|
|
fi
|
|
else
|
|
echo -e "${RED}❌ Not Found:${NC} No container found with name '$service_name'"
|
|
return 1
|
|
fi
|
|
echo ""
|
|
|
|
# Check resource pressure
|
|
echo -e "${BLUE}⚡ Resource Pressure:${NC}"
|
|
local stats_json=$(parse_docker_stats "$service_name")
|
|
if [ -n "$stats_json" ] && [ "$stats_json" != "{}" ]; then
|
|
# Extract CPU and memory usage; docker may return units like B/KiB/MiB/GiB.
|
|
local cpu_percent=$(echo "$stats_json" | jq -r '.CPUPerc' 2>/dev/null | sed 's/%//' | cut -d'.' -f1)
|
|
local mem_usage_raw=$(echo "$stats_json" | jq -r '.MemUsage' 2>/dev/null | cut -d'/' -f1 | xargs)
|
|
local mem_mb=""
|
|
|
|
# Parse values like 0B, 512KiB, 85.3MiB, 1.2GiB into MiB.
|
|
if [[ "$mem_usage_raw" =~ ^([0-9]+([.][0-9]+)?)([A-Za-z]+)$ ]]; then
|
|
local mem_val="${BASH_REMATCH[1]}"
|
|
local mem_unit="${BASH_REMATCH[3]}"
|
|
mem_mb=$(awk -v v="$mem_val" -v u="$mem_unit" 'BEGIN {
|
|
if (u == "B") printf "%.0f", v / 1048576;
|
|
else if (u == "KiB" || u == "KB" || u == "kB") printf "%.0f", v / 1024;
|
|
else if (u == "MiB" || u == "MB") printf "%.0f", v;
|
|
else if (u == "GiB" || u == "GB") printf "%.0f", v * 1024;
|
|
else if (u == "TiB" || u == "TB") printf "%.0f", v * 1048576;
|
|
}')
|
|
fi
|
|
|
|
if [ -n "$cpu_percent" ] && [ "$cpu_percent" != "null" ]; then
|
|
if [ "$cpu_percent" -gt $((MAX_SAFE_CPU_CORES * 10)) ]; then # 10% per core threshold
|
|
echo -e "${YELLOW}⚠️ High CPU:${NC} ${cpu_percent}% (threshold: $((MAX_SAFE_CPU_CORES * 10))%)"
|
|
else
|
|
echo -e "${GREEN}✅ CPU OK:${NC} ${cpu_percent}%"
|
|
fi
|
|
fi
|
|
|
|
if [ -n "$mem_mb" ] && [ "$mem_mb" != "null" ]; then
|
|
if [ "$mem_mb" -gt "$MAX_SAFE_RAM_MB" ]; then
|
|
echo -e "${YELLOW}⚠️ High Memory:${NC} ${mem_usage_raw} (threshold: ${MAX_SAFE_RAM_MB}MiB)"
|
|
else
|
|
echo -e "${GREEN}✅ Memory OK:${NC} ${mem_usage_raw}"
|
|
fi
|
|
fi
|
|
else
|
|
echo -e "${YELLOW}⚠️ No Stats:${NC} Unable to retrieve resource statistics"
|
|
fi
|
|
echo ""
|
|
|
|
# Check network exposure
|
|
echo -e "${BLUE}🌐 Network Exposure:${NC}"
|
|
local port_info=$(docker ps --filter "name=$service_name" --format "{{.Ports}}" 2>/dev/null)
|
|
if [ -n "$port_info" ] && [ "$port_info" != "<no value>" ]; then
|
|
echo "Published Ports: $port_info"
|
|
|
|
# Check if ports are exposed on unexpected interfaces
|
|
if [[ "$port_info" == *":"* ]]; then
|
|
echo -e "${YELLOW}⚠️ Port Exposure:${NC} Container exposes ports on host interfaces"
|
|
echo " Review port mappings to ensure they comply with network zoning"
|
|
else
|
|
echo -e "${GREEN}✅ Port Exposure:${NC} No host ports exposed"
|
|
fi
|
|
else
|
|
echo -e "${GREEN}✅ Port Exposure:${NC} No ports exposed"
|
|
fi
|
|
echo ""
|
|
|
|
# Summary
|
|
echo -e "${BLUE}📋 Summary:${NC}"
|
|
echo "Expected Zone: $expected_zone"
|
|
echo "Naming Convention: $(validate_service_naming "$service_name" && echo -e "${GREEN}✅ Valid${NC}" || echo -e "${YELLOW}⚠️ Invalid${NC}")"
|
|
echo ""
|
|
echo -e "${BLUE}💡 Next Actions:${NC}"
|
|
echo "1. If uptime is unstable, check logs with: docker logs $service_name"
|
|
echo "2. If resources are high, consider optimization or scaling"
|
|
echo "3. If network exposure seems wrong, review docker-compose configuration"
|
|
echo "4. For detailed analysis, run Deep mode health check"
|
|
}
|
|
|
|
#######################################
# Main script execution: validate the command line and prerequisites, then
# run the health check.
# Globals:   RED, YELLOW, NC (read)
# Arguments: $1 - Docker service/container name (-h/--help prints usage)
# Outputs:   the health report on stdout; a jq warning on stderr
# Returns:   the status of check_service_health; exits 1 via usage or when
#            docker is unavailable
#######################################
main() {
  # A missing or empty name is a usage error: an empty name would make the
  # docker name filter match every container.
  case "${1:-}" in
    "" | -h | --help)
      usage
      ;;
  esac

  local service_name=$1

  # Check if docker is installed and accessible
  if ! command -v docker &>/dev/null; then
    echo -e "${RED}❌ Docker Error:${NC} Docker is not installed or not accessible"
    exit 1
  fi

  # jq is only needed to parse resource statistics, so its absence is a
  # warning rather than a fatal error.
  if ! command -v jq &>/dev/null; then
    printf '%b⚠️ jq not found:%b resource statistics will be unavailable\n' \
      "$YELLOW" "$NC" >&2
  fi

  # Perform health check
  check_service_health "$service_name"
}

# Run main function with all arguments; the script exits with main's status.
main "$@"