#!/bin/bash
# homelab-sentinel-health-quick.sh
# Quick terminal pulse check for Docker services in Nathan's homelab.
# Validates uptime stability, resource pressure, and network exposure.
#
# Usage:    homelab-sentinel-health-quick.sh <service_name>
# Requires: bash 4+ (associative arrays) and docker; jq is needed for the
#           resource-pressure section.

# Color codes for output
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Resource thresholds (from generic_host_conversational.yml)
readonly MAX_SAFE_RAM_MB=16384 # 16GB
readonly MAX_SAFE_CPU_CORES=8

# VLAN definitions from ansible/group_vars/all.yml (VLAN name -> IPv4 CIDR)
declare -A VLAN_CIDRS=(
  ["main"]="10.0.0.0/24"
  ["infra"]="10.0.10.0/24"
  ["iot"]="10.0.50.0/24"
  ["guest"]="10.0.30.0/24"
  ["compute"]="10.0.200.0/24"
)

# Zone definitions (zone name -> VLAN name, i.e. a key of VLAN_CIDRS)
declare -A ZONES=(
  ["core"]="main"
  ["infrastructure"]="infra"
  ["iot"]="iot"
  ["guest"]="guest"
  ["compute"]="compute"
)

#######################################
# Print usage information and exit.
# Arguments: $1 - exit status (optional, defaults to 1)
# Outputs:   usage text on stdout
#######################################
usage() {
  local exit_code=${1:-1}
  echo "Usage: $0 <service_name>"
  echo "  Performs a quick health check on a Docker service"
  echo ""
  echo "Arguments:"
  echo "  service_name    Name of the Docker service to check"
  echo ""
  echo "Examples:"
  echo "  $0 watchtower"
  echo "  $0 prometheus"
  exit "$exit_code"
}

#######################################
# Validate that a service name consists of at least three non-empty,
# hyphen-separated parts.
# NOTE(review): the placeholder names of the convention were lost from the
# original text; only the middle "role" field is confirmed (get_service_zone
# reads it). "prefix"/"suffix" are stand-ins - confirm the real field names.
# Arguments: $1 - service name
# Outputs:   a warning on stdout when the name does not match
# Returns:   0 if the name matches, 1 otherwise
#######################################
validate_service_naming() {
  local service_name=$1
  # Two hyphens minimum; the first two parts must be non-empty and hyphen-free,
  # the remainder may itself contain hyphens.
  local naming_regex='^[^-]+-[^-]+-.+$'

  if [[ ! "$service_name" =~ $naming_regex ]]; then
    echo -e "${YELLOW}⚠️  Naming Convention Warning:${NC} Service '$service_name' does not follow the <prefix>-<role>-<suffix> naming convention"
    return 1
  fi
  return 0
}

#######################################
# Derive the zone of a service from the role part of its name.
# The role is the second hyphen-separated field; a name without any hyphen is
# used as the role in its entirety (so "watchtower" maps via its own name).
# Arguments: $1 - service name
# Outputs:   zone name (a key of ZONES) on stdout
#######################################
get_service_zone() {
  local service_name=$1
  local role

  if [[ "$service_name" == *-* ]]; then
    role=${service_name#*-} # drop the first field
    role=${role%%-*}        # keep everything up to the next hyphen
  else
    role=$service_name
  fi

  # Map common roles to zones
  case "$role" in
    pve | proxmox | nas | heimdall | watchtower)
      echo "infrastructure"
      ;;
    swarm | ai | compute)
      echo "compute"
      ;;
    controller | omada)
      echo "iot"
      ;;
    *)
      # Default to infrastructure for unknown roles
      echo "infrastructure"
      ;;
  esac
}

#######################################
# Convert a dotted-quad IPv4 address to its 32-bit integer value.
# Arguments: $1 - IPv4 address
# Outputs:   the integer on stdout
# Returns:   0 on success, 1 if the argument is not a valid IPv4 address
#######################################
_ipv4_to_int() {
  local ip=$1
  local ipv4_regex='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  local o1 o2 o3 o4

  [[ "$ip" =~ $ipv4_regex ]] || return 1
  # 10# forces base 10 so an octet such as "010" is not read as octal.
  o1=$((10#${BASH_REMATCH[1]}))
  o2=$((10#${BASH_REMATCH[2]}))
  o3=$((10#${BASH_REMATCH[3]}))
  o4=$((10#${BASH_REMATCH[4]}))
  if ((o1 > 255 || o2 > 255 || o3 > 255 || o4 > 255)); then
    return 1
  fi
  echo $(((o1 << 24) | (o2 << 16) | (o3 << 8) | o4))
}

#######################################
# Check whether an IPv4 address lies inside the CIDR configured for a zone.
# Globals:   ZONES, VLAN_CIDRS (read)
# Arguments: $1 - IPv4 address
#            $2 - zone name (key of ZONES) or VLAN name (key of VLAN_CIDRS)
# Outputs:   "Unknown zone: ..." on stdout when the zone cannot be resolved
# Returns:   0 if the address is inside the zone's CIDR, 1 otherwise
#######################################
is_ip_in_correct_vlan() {
  local ip=$1
  local zone=$2
  local vlan expected_cidr network prefix ip_int net_int mask

  # An empty subscript is an error for associative arrays, so guard first.
  if [[ -z "$zone" ]]; then
    echo "Unknown zone: $zone"
    return 1
  fi

  # Zone names are translated to their VLAN; VLAN names pass through as-is.
  vlan=${ZONES[$zone]:-$zone}
  expected_cidr=${VLAN_CIDRS[$vlan]:-}
  if [[ -z "$expected_cidr" ]]; then
    echo "Unknown zone: $zone"
    return 1
  fi

  network=${expected_cidr%/*}
  prefix=${expected_cidr#*/}
  [[ "$prefix" =~ ^[0-9]+$ ]] || return 1
  prefix=$((10#$prefix))
  ((prefix <= 32)) || return 1

  ip_int=$(_ipv4_to_int "$ip") || return 1
  net_int=$(_ipv4_to_int "$network") || return 1

  # A /0 prefix has an all-zero mask; any other prefix keeps the top bits.
  if ((prefix == 0)); then
    mask=0
  else
    mask=$(((0xFFFFFFFF << (32 - prefix)) & 0xFFFFFFFF))
  fi

  if (((ip_int & mask) == (net_int & mask))); then
    return 0
  fi
  return 1
}

#######################################
# Fetch a one-shot `docker stats` sample for a container as JSON.
# Arguments: $1 - container name
# Outputs:   the JSON emitted by docker, or "{}" when docker returns nothing
#######################################
parse_docker_stats() {
  local service_name=$1
  local stats_json

  # '{{json .}}' yields the same JSON as '--format json' and is also accepted
  # by Docker CLI versions that predate the 'json' shorthand.
  # stderr is discarded on purpose: a missing container is reported as "{}".
  stats_json=$(docker stats "$service_name" --no-stream --format '{{json .}}' 2>/dev/null)
  if [[ -z "$stats_json" ]]; then
    echo "{}"
    return 0
  fi
  printf '%s\n' "$stats_json"
}

#######################################
# Fetch `docker ps -a` information for containers matching a name as JSON.
# Arguments: $1 - container name filter
# Outputs:   the JSON emitted by docker, or "{}" when docker returns nothing
#######################################
parse_docker_ps() {
  local service_name=$1
  local ps_info

  # stderr is discarded on purpose: failures are reported as "{}".
  ps_info=$(docker ps -a --filter "name=$service_name" --format '{{json .}}' 2>/dev/null)
  if [[ -z "$ps_info" ]]; then
    echo "{}"
    return 0
  fi
  printf '%s\n' "$ps_info"
}

#######################################
# Convert a docker memory figure (0B, 512KiB, 85.3MiB, 1.2GiB, ...) to whole
# MiB. Decimal unit spellings (KB/MB/GB/TB) are treated like the binary ones.
# Arguments: $1 - memory figure without surrounding whitespace
# Outputs:   rounded MiB value on stdout
# Returns:   0 on success, 1 if the figure or its unit is not recognised
#######################################
_mem_to_mib() {
  local raw=$1
  local mem_regex='^([0-9]+([.][0-9]+)?)([A-Za-z]+)$'

  [[ "$raw" =~ $mem_regex ]] || return 1
  awk -v v="${BASH_REMATCH[1]}" -v u="${BASH_REMATCH[3]}" 'BEGIN {
    if (u == "B") printf "%.0f", v / 1048576
    else if (u == "KiB" || u == "KB" || u == "kB") printf "%.0f", v / 1024
    else if (u == "MiB" || u == "MB") printf "%.0f", v
    else if (u == "GiB" || u == "GB") printf "%.0f", v * 1024
    else if (u == "TiB" || u == "TB") printf "%.0f", v * 1048576
    else exit 1
  }'
}

#######################################
# Run the quick health check for one service and print a report.
# Globals:   color codes, MAX_SAFE_RAM_MB, MAX_SAFE_CPU_CORES, ZONES,
#            VLAN_CIDRS (all read)
# Arguments: $1 - service / container name
# Outputs:   the report on stdout
# Returns:   1 if no container matches the name, 0 otherwise
#######################################
check_service_health() {
  local service_name=$1

  echo -e "${BLUE}🔍 Homelab Sentinel Quick Health Check${NC}"
  echo -e "${BLUE}=====================================${NC}"
  echo "Service: $service_name"
  echo ""

  # Validate the name once and remember the verdict for the summary, so the
  # warning text is printed a single time and never leaks into the summary.
  local naming_ok=1
  validate_service_naming "$service_name" || naming_ok=0

  # Determine expected zone
  local expected_zone expected_vlan
  expected_zone=$(get_service_zone "$service_name")
  expected_vlan=${ZONES[$expected_zone]}
  echo -e "${BLUE}📍 Expected Zone:${NC} $expected_zone (${VLAN_CIDRS[$expected_vlan]})"
  echo ""

  # Get docker ps info
  echo -e "${BLUE}📊 Container Status:${NC}"
  docker ps -a --filter "name=$service_name" \
    --format "table {{.Names}}\t{{.Status}}\t{{.RunningFor}}\t{{.Ports}}"
  echo ""

  # Check uptime stability
  echo -e "${BLUE}⏱️  Uptime Stability:${NC}"
  local ps_output
  ps_output=$(docker ps -a --filter "name=$service_name" \
    --format "{{.Status}}\t{{.RunningFor}}" 2>/dev/null)
  if [[ -z "$ps_output" ]]; then
    echo -e "${RED}❌ Not Found:${NC} No container found with name '$service_name'"
    return 1
  fi

  # The name filter can match several containers; judge the first one and say
  # so, instead of mixing several status lines into one verdict.
  local -a ps_lines
  mapfile -t ps_lines <<<"$ps_output"
  if ((${#ps_lines[@]} > 1)); then
    echo -e "${YELLOW}⚠️  Multiple Matches:${NC} ${#ps_lines[@]} containers match '$service_name'; evaluating the first one listed above"
  fi

  local status running_for
  IFS=$'\t' read -r status running_for <<<"${ps_lines[0]}"

  # Only an "Up" status without a pause/health problem counts as stable;
  # Created, Exited, Dead, etc. all fall through to "not running".
  case "$status" in
    *Restarting*)
      echo -e "${RED}❌ Unstable:${NC} Container is restarting ($status)"
      ;;
    *"(Paused)"*)
      echo -e "${YELLOW}⚠️  Paused:${NC} Container is paused ($status)"
      ;;
    *"(unhealthy)"*)
      echo -e "${YELLOW}⚠️  Unhealthy:${NC} Container health check is failing ($status)"
      ;;
    Up*)
      echo -e "${GREEN}✅ Stable:${NC} Container has been running for $running_for"
      ;;
    *)
      echo -e "${YELLOW}⚠️  Stopped:${NC} Container is not running ($status)"
      ;;
  esac
  echo ""

  # Check resource pressure
  echo -e "${BLUE}⚡ Resource Pressure:${NC}"
  local stats_json
  stats_json=$(parse_docker_stats "$service_name")
  if ! command -v jq &>/dev/null; then
    echo -e "${YELLOW}⚠️  No Stats:${NC} jq is required to parse resource statistics"
  elif [[ -z "$stats_json" || "$stats_json" == "{}" ]]; then
    echo -e "${YELLOW}⚠️  No Stats:${NC} Unable to retrieve resource statistics"
  else
    local cpu_raw cpu_percent mem_usage mem_usage_raw mem_mb
    local cpu_threshold=$((MAX_SAFE_CPU_CORES * 10)) # 10% per core threshold
    local reported=0

    # CPUPerc looks like "12.34%"; keep only the integer part.
    cpu_raw=$(jq -r '.CPUPerc // empty' <<<"$stats_json" 2>/dev/null)
    cpu_percent=${cpu_raw%\%}
    cpu_percent=${cpu_percent%%.*}

    # MemUsage looks like "85.3MiB / 1.944GiB"; keep the part before the slash.
    mem_usage=$(jq -r '.MemUsage // empty' <<<"$stats_json" 2>/dev/null)
    mem_usage_raw=${mem_usage%%/*}
    mem_usage_raw=${mem_usage_raw//[[:space:]]/}
    mem_mb=$(_mem_to_mib "$mem_usage_raw") || mem_mb=""

    # docker prints "--" for containers that are not running; compare only
    # values that really are integers.
    if [[ "$cpu_percent" =~ ^[0-9]+$ ]]; then
      reported=1
      if ((10#$cpu_percent > cpu_threshold)); then
        echo -e "${YELLOW}⚠️  High CPU:${NC} ${cpu_percent}% (threshold: ${cpu_threshold}%)"
      else
        echo -e "${GREEN}✅ CPU OK:${NC} ${cpu_percent}%"
      fi
    fi

    if [[ "$mem_mb" =~ ^[0-9]+$ ]]; then
      reported=1
      if ((10#$mem_mb > MAX_SAFE_RAM_MB)); then
        echo -e "${YELLOW}⚠️  High Memory:${NC} ${mem_usage_raw} (threshold: ${MAX_SAFE_RAM_MB}MiB)"
      else
        echo -e "${GREEN}✅ Memory OK:${NC} ${mem_usage_raw}"
      fi
    fi

    if ((reported == 0)); then
      echo -e "${YELLOW}⚠️  No Stats:${NC} Unable to retrieve resource statistics"
    fi
  fi
  echo ""

  # Check network exposure
  echo -e "${BLUE}🌐 Network Exposure:${NC}"
  local port_info
  port_info=$(docker ps --filter "name=$service_name" --format "{{.Ports}}" 2>/dev/null)
  if [[ -n "$port_info" ]]; then
    echo "Published Ports: $port_info"

    # A host binding is rendered as "<host-ip>:<port>->..."; a port that is
    # only exposed inside the container network has no colon.
    if [[ "$port_info" == *":"* ]]; then
      echo -e "${YELLOW}⚠️  Port Exposure:${NC} Container exposes ports on host interfaces"
      echo "   Review port mappings to ensure they comply with network zoning"
    else
      echo -e "${GREEN}✅ Port Exposure:${NC} No host ports exposed"
    fi
  else
    echo -e "${GREEN}✅ Port Exposure:${NC} No ports exposed"
  fi
  echo ""

  # Summary
  local naming_summary
  if ((naming_ok)); then
    naming_summary="${GREEN}✅ Valid${NC}"
  else
    naming_summary="${YELLOW}⚠️  Invalid${NC}"
  fi
  echo -e "${BLUE}📋 Summary:${NC}"
  echo "Expected Zone: $expected_zone"
  echo -e "Naming Convention: ${naming_summary}"
  echo ""
  echo -e "${BLUE}💡 Next Actions:${NC}"
  echo "1. If uptime is unstable, check logs with: docker logs $service_name"
  echo "2. If resources are high, consider optimization or scaling"
  echo "3. If network exposure seems wrong, review docker-compose configuration"
  echo "4. For detailed analysis, run Deep mode health check"
}

#######################################
# Script entry point: validate arguments and prerequisites, then run the
# health check.
# Arguments: $1 - service name, or -h/--help
# Returns:   exits 1 on missing argument or missing docker; otherwise the
#            status of check_service_health
#######################################
main() {
  # Check if service name is provided
  if [[ $# -eq 0 ]]; then
    usage
  fi

  case "$1" in
    -h | --help)
      usage 0
      ;;
  esac

  local service_name=$1

  # Check if docker is installed and accessible
  if ! command -v docker &>/dev/null; then
    echo -e "${RED}❌ Docker Error:${NC} Docker is not installed or not accessible" >&2
    exit 1
  fi

  # Perform health check
  check_service_health "$service_name"
}

# Run main with all arguments, but only when executed directly, so the
# functions above can be sourced without side effects.
if [[ "${BASH_SOURCE[0]}" == "$0" ]]; then
  main "$@"
fi