homelab/ansible/ansible-old/scripts/health-check-quick.sh

286 lines
9.2 KiB
Bash

#!/bin/bash
# homelab-sentinel-health-quick.sh
# Quick terminal pulse check for Docker services in Nathan's homelab
# Validates uptime stability, resource pressure, and network exposure
# ANSI color escapes for report output; stored as literal backslash sequences
# and rendered by `echo -e` at print time
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
NC='\033[0m' # reset (No Color)

# Resource thresholds (from generic_host_conversational.yml)
MAX_SAFE_RAM_MB=16384 # 16GB
MAX_SAFE_CPU_CORES=8

# VLAN name -> CIDR, as defined in ansible/group_vars/all.yml
declare -A VLAN_CIDRS=(
  ["main"]="10.0.0.0/24"
  ["infra"]="10.0.10.0/24"
  ["iot"]="10.0.50.0/24"
  ["guest"]="10.0.30.0/24"
  ["compute"]="10.0.200.0/24"
)

# Zone name -> VLAN name (a key of VLAN_CIDRS)
declare -A ZONES=(
  ["core"]="main"
  ["infrastructure"]="infra"
  ["iot"]="iot"
  ["guest"]="guest"
  ["compute"]="compute"
)
# Print the command-line help text on stdout, then terminate the script.
# Arguments: none
# Returns:   never returns; exits with status 1
usage() {
  # One printf emits every help line; each argument becomes its own line.
  printf '%s\n' \
    "Usage: $0 <service_name>" \
    " Performs a quick health check on a Docker service" \
    "" \
    "Arguments:" \
    " service_name Name of the Docker service to check" \
    "" \
    "Examples:" \
    " $0 watchtower" \
    " $0 prometheus"
  exit 1
}
# Validate that a service name follows the <owner>-<role>-<node> convention.
# Arguments: $1 - service name to check
# Outputs:   a warning line on stdout when the name does not conform
# Returns:   0 when the name conforms, 1 otherwise
validate_service_naming() {
  local service_name=$1
  # Three hyphen-separated parts, none of them empty: owner and role contain
  # no hyphen, the node part is whatever follows the second hyphen (and may
  # itself contain hyphens). Anchoring the pattern rejects names such as
  # "--" or "a--c" that merely contain two hyphens.
  local naming_re='^[^-]+-[^-]+-.+$'
  if [[ ! "$service_name" =~ $naming_re ]]; then
    echo -e "${YELLOW}⚠️ Naming Convention Warning:${NC} Service '$service_name' does not follow the <owner>-<role>-<node> naming convention"
    return 1
  fi
  return 0
}
# Derive the expected zone for a service from the role part of its name.
# Arguments: $1 - service name (<owner>-<role>-<node>)
# Outputs:   the zone name on stdout: "infrastructure", "compute" or "iot"
get_service_zone() {
  local service_name=$1

  # The role is the second hyphen-delimited field. When the name contains no
  # hyphen at all, the whole name is treated as the role.
  local after_owner=${service_name#*-}
  local role=${after_owner%%-*}

  local zone
  case "$role" in
    swarm | ai | compute)
      zone="compute"
      ;;
    controller | omada)
      zone="iot"
      ;;
    pve | proxmox | nas | heimdall | watchtower | *)
      # Known infrastructure roles; unknown roles also land here by default
      zone="infrastructure"
      ;;
  esac
  echo "$zone"
}
# Convert a dotted-quad IPv4 address to its 32-bit integer value.
# Arguments: $1 - IPv4 address
# Outputs:   the integer on stdout
# Returns:   1 when $1 is not a valid dotted quad (each octet 0-255)
_ipv4_to_int() {
  local ipv4_re='^([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})\.([0-9]{1,3})$'
  [[ "$1" =~ $ipv4_re ]] || return 1
  local -a octets=("${BASH_REMATCH[@]:1:4}")
  local octet value=0
  for octet in "${octets[@]}"; do
    # 10# forces base 10 so an octet like "010" is not read as octal
    ((10#$octet <= 255)) || return 1
    value=$(((value << 8) | 10#$octet))
  done
  echo "$value"
}

# Check whether an IP address lies inside the VLAN assigned to a zone.
# Globals:   ZONES (read), VLAN_CIDRS (read)
# Arguments: $1 - IPv4 address (a trailing "/<suffix>" is ignored)
#            $2 - zone name (key of ZONES) or VLAN name (key of VLAN_CIDRS)
# Outputs:   "Unknown zone: <zone>" on stdout when $2 resolves to no CIDR
# Returns:   0 when the address is inside the CIDR, 1 otherwise
is_ip_in_correct_vlan() {
  local ip=$1
  local zone=$2

  # An empty subscript is invalid for an associative array, so reject it up
  # front with the same message as any other unknown zone.
  if [[ -z "$zone" ]]; then
    echo "Unknown zone: $zone"
    return 1
  fi

  # VLAN_CIDRS is keyed by VLAN name while callers may hold a zone name, so
  # translate through ZONES first and fall back to treating $2 as a VLAN name.
  local vlan=${ZONES[$zone]:-$zone}
  local expected_cidr=${VLAN_CIDRS[$vlan]:-}
  if [[ -z "$expected_cidr" ]]; then
    echo "Unknown zone: $zone"
    return 1
  fi

  local network=${expected_cidr%/*}
  local prefix_len=${expected_cidr#*/}
  if [[ ! "$prefix_len" =~ ^[0-9]+$ ]] || ((10#$prefix_len > 32)); then
    return 1
  fi
  prefix_len=$((10#$prefix_len))

  local ip_int net_int
  ip_int=$(_ipv4_to_int "${ip%%/*}") || return 1
  net_int=$(_ipv4_to_int "$network") || return 1

  # Keep the top prefix_len bits; a /0 prefix yields a mask of 0.
  local mask=$(((0xFFFFFFFF << (32 - prefix_len)) & 0xFFFFFFFF))

  # The status of this comparison is the function's return value.
  (((ip_int & mask) == (net_int & mask)))
}
# Take a one-shot `docker stats` snapshot of a container in JSON format.
# Arguments: $1 - container name
# Outputs:   whatever docker printed, or "{}" when it printed nothing
#            (docker's stderr is discarded)
parse_docker_stats() {
  local service_name=$1
  local snapshot
  snapshot=$(docker stats "$service_name" --no-stream --format json 2>/dev/null)
  if [[ -n "$snapshot" ]]; then
    echo "$snapshot"
  else
    # Placeholder so callers always receive a non-empty value
    echo "{}"
  fi
}
# List containers (running or not) matching a name filter, in JSON format.
# Arguments: $1 - value for docker's "name=" filter
# Outputs:   whatever docker printed, or "{}" when it printed nothing
#            (docker's stderr is discarded)
parse_docker_ps() {
  local service_name=$1
  local listing
  listing=$(docker ps -a --filter "name=$service_name" --format json 2>/dev/null)
  if [[ -n "$listing" ]]; then
    echo "$listing"
  else
    # Placeholder so callers always receive a non-empty value
    echo "{}"
  fi
}
# Main health check: print a quick report for one Docker service.
#
# Sections, in order: naming check and expected zone, container status table,
# uptime stability, resource pressure (CPU / memory against the MAX_SAFE_*
# thresholds), network exposure, summary and suggested next actions.
#
# Globals:   RED GREEN YELLOW BLUE NC, ZONES, VLAN_CIDRS,
#            MAX_SAFE_RAM_MB, MAX_SAFE_CPU_CORES (all read)
# Arguments: $1 - service (container) name
# Outputs:   the report on stdout
# Returns:   1 when no container matches the name, 0 otherwise
check_service_health() {
  local service_name=$1

  echo -e "${BLUE}🔍 Homelab Sentinel Quick Health Check${NC}"
  echo -e "${BLUE}=====================================${NC}"
  echo "Service: $service_name"
  echo ""

  # Validate the name once and remember the verdict. The validator prints its
  # own warning, so calling it again inside the summary's command substitution
  # would splice that warning into the middle of the summary line.
  local naming_valid=1
  validate_service_naming "$service_name" || naming_valid=0

  # Determine expected zone and the VLAN behind it
  local expected_zone expected_vlan
  expected_zone=$(get_service_zone "$service_name")
  expected_vlan=${ZONES[$expected_zone]}
  echo -e "${BLUE}📍 Expected Zone:${NC} $expected_zone (${VLAN_CIDRS[$expected_vlan]})"
  echo ""

  # Container status table, straight from docker
  echo -e "${BLUE}📊 Container Status:${NC}"
  docker ps -a --filter "name=$service_name" --format "table {{.Names}}\t{{.Status}}\t{{.RunningFor}}\t{{.Ports}}"
  echo ""

  # Uptime stability
  echo -e "${BLUE}⏱️ Uptime Stability:${NC}"
  local ps_output
  ps_output=$(docker ps -a --filter "name=$service_name" --format "{{.Status}}\t{{.RunningFor}}" 2>/dev/null)
  if [[ -z "$ps_output" ]]; then
    echo -e "${RED}❌ Not Found:${NC} No container found with name '$service_name'"
    return 1
  fi
  local status running_for
  status=$(echo "$ps_output" | cut -f1)
  running_for=$(echo "$ps_output" | cut -f2)
  if [[ "$status" == *"Restarting"* ]]; then
    echo -e "${RED}❌ Unstable:${NC} Container is restarting ($status)"
  elif [[ "$status" == *"Exited"* || "$status" != *"Up"* ]]; then
    # Anything that is not "Up ..." (Exited, Created, Dead, ...) is not
    # running and must not be reported as stable.
    echo -e "${YELLOW}⚠️ Stopped:${NC} Container is not running ($status)"
  else
    echo -e "${GREEN}✅ Stable:${NC} Container has been running for $running_for"
  fi
  echo ""

  # Resource pressure
  echo -e "${BLUE}⚡ Resource Pressure:${NC}"
  local stats_json
  stats_json=$(parse_docker_stats "$service_name")
  local stats_reported=0
  if [[ -n "$stats_json" && "$stats_json" != "{}" ]]; then
    # Extract CPU and memory usage; docker may return units like B/KiB/MiB/GiB.
    local cpu_percent mem_usage_raw mem_mb=""
    # Integer part of the CPU percentage, e.g. "12.34%" -> "12"
    cpu_percent=$(echo "$stats_json" | jq -r '.CPUPerc' 2>/dev/null | sed 's/%//' | cut -d'.' -f1)
    # Left side of "used / limit", with surrounding blanks trimmed by xargs
    mem_usage_raw=$(echo "$stats_json" | jq -r '.MemUsage' 2>/dev/null | cut -d'/' -f1 | xargs)

    # Parse values like 0B, 512KiB, 85.3MiB, 1.2GiB into MiB.
    if [[ "$mem_usage_raw" =~ ^([0-9]+([.][0-9]+)?)([A-Za-z]+)$ ]]; then
      local mem_val="${BASH_REMATCH[1]}"
      local mem_unit="${BASH_REMATCH[3]}"
      mem_mb=$(awk -v v="$mem_val" -v u="$mem_unit" 'BEGIN {
        if (u == "B") printf "%.0f", v / 1048576;
        else if (u == "KiB" || u == "KB" || u == "kB") printf "%.0f", v / 1024;
        else if (u == "MiB" || u == "MB") printf "%.0f", v;
        else if (u == "GiB" || u == "GB") printf "%.0f", v * 1024;
        else if (u == "TiB" || u == "TB") printf "%.0f", v * 1048576;
      }')
    fi

    # Only compare values that are plain integers; anything else ("null",
    # "--", empty because jq is missing) would make the numeric test error
    # out and be misreported as OK.
    local cpu_threshold=$((MAX_SAFE_CPU_CORES * 10)) # 10% per core threshold
    if [[ "$cpu_percent" =~ ^[0-9]+$ ]]; then
      stats_reported=1
      if ((10#$cpu_percent > cpu_threshold)); then
        echo -e "${YELLOW}⚠️ High CPU:${NC} ${cpu_percent}% (threshold: ${cpu_threshold}%)"
      else
        echo -e "${GREEN}✅ CPU OK:${NC} ${cpu_percent}%"
      fi
    fi
    if [[ "$mem_mb" =~ ^[0-9]+$ ]]; then
      stats_reported=1
      if ((10#$mem_mb > MAX_SAFE_RAM_MB)); then
        echo -e "${YELLOW}⚠️ High Memory:${NC} ${mem_usage_raw} (threshold: ${MAX_SAFE_RAM_MB}MiB)"
      else
        echo -e "${GREEN}✅ Memory OK:${NC} ${mem_usage_raw}"
      fi
    fi
  fi
  # Say so explicitly when there was nothing usable to report, instead of
  # leaving the section empty.
  if ((stats_reported == 0)); then
    echo -e "${YELLOW}⚠️ No Stats:${NC} Unable to retrieve resource statistics"
  fi
  echo ""

  # Network exposure (running containers only: no -a here)
  echo -e "${BLUE}🌐 Network Exposure:${NC}"
  local port_info
  port_info=$(docker ps --filter "name=$service_name" --format "{{.Ports}}" 2>/dev/null)
  if [[ -n "$port_info" && "$port_info" != "<no value>" ]]; then
    echo "Published Ports: $port_info"
    # A colon means a host address/port is part of the mapping
    if [[ "$port_info" == *":"* ]]; then
      echo -e "${YELLOW}⚠️ Port Exposure:${NC} Container exposes ports on host interfaces"
      echo " Review port mappings to ensure they comply with network zoning"
    else
      echo -e "${GREEN}✅ Port Exposure:${NC} No host ports exposed"
    fi
  else
    echo -e "${GREEN}✅ Port Exposure:${NC} No ports exposed"
  fi
  echo ""

  # Summary
  echo -e "${BLUE}📋 Summary:${NC}"
  echo "Expected Zone: $expected_zone"
  if ((naming_valid)); then
    echo -e "Naming Convention: ${GREEN}✅ Valid${NC}"
  else
    echo -e "Naming Convention: ${YELLOW}⚠️ Invalid${NC}"
  fi
  echo ""
  echo -e "${BLUE}💡 Next Actions:${NC}"
  echo "1. If uptime is unstable, check logs with: docker logs $service_name"
  echo "2. If resources are high, consider optimization or scaling"
  echo "3. If network exposure seems wrong, review docker-compose configuration"
  echo "4. For detailed analysis, run Deep mode health check"
}
# Script driver: check the arguments and the docker CLI, then run the report.
# Arguments: $1 - service name (required; further arguments are ignored)
# Returns:   the status of check_service_health; exits 1 via usage when no
#            argument is given, or directly when docker cannot be found
main() {
  # Nothing to check without a service name; usage terminates the script.
  (($# > 0)) || usage

  local service_name=$1

  # Stop early when the docker command cannot be resolved.
  command -v docker >/dev/null 2>&1 || {
    echo -e "${RED}❌ Docker Error:${NC} Docker is not installed or not accessible"
    exit 1
  }

  check_service_health "$service_name"
}
# Script entry point: pass every command-line argument to main unchanged
# ("$@" keeps each argument as its own word)
main "$@"