#!/bin/bash
# ==============================================================================
# VALIDATION LIBRARY: Comprehensive System Health Checks
# ==============================================================================
# Part of unified bootstrap system for homelab infrastructure
# Provides comprehensive pre-flight and post-bootstrap validation with
# severity levels (critical, warning, info) for operational readiness.
#
# Usage: source this file, then call run_validation_suite or any individual
# check_* function. All human-readable output is written to stderr; results
# are accumulated in the VALIDATION_PASSED / VALIDATION_WARNINGS /
# VALIDATION_ERRORS counters.
#
# This file is a library: it deliberately does not change shell options
# (no `set -e` etc.) so it cannot alter the behaviour of the sourcing script.
# ==============================================================================

# Source detection library if not already loaded
if ! type -t detect_os_family &>/dev/null; then
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  if [[ -r "${SCRIPT_DIR}/detection.sh" ]]; then
    # shellcheck source=./detection.sh
    source "${SCRIPT_DIR}/detection.sh"
  else
    echo "[✗] ERROR: detection library not found: ${SCRIPT_DIR}/detection.sh" >&2
  fi
fi

# --- VALIDATION TRACKING ---

declare -g VALIDATION_ERRORS=0
declare -g VALIDATION_WARNINGS=0
declare -g VALIDATION_PASSED=0

#######################################
# Reset all three result counters to zero.
# Globals: VALIDATION_ERRORS, VALIDATION_WARNINGS, VALIDATION_PASSED (written)
#######################################
reset_validation_counters() {
  VALIDATION_ERRORS=0
  VALIDATION_WARNINGS=0
  VALIDATION_PASSED=0
}

#######################################
# Report a passed check and count it.
# Globals:   VALIDATION_PASSED (written)
# Arguments: $1 - message
# Outputs:   message on stderr
# Returns:   0 always
#######################################
log_pass() {
  local message="$1"
  echo " [✓] $message" >&2
  # Plain assignment instead of ((VAR++)): the latter returns status 1 when
  # the previous value was 0, which made the first call "fail".
  VALIDATION_PASSED=$((VALIDATION_PASSED + 1))
}

#######################################
# Report a warning and count it.
# Globals:   VALIDATION_WARNINGS (written)
# Arguments: $1 - message
# Outputs:   message on stderr
# Returns:   0 always
#######################################
log_warning() {
  local message="$1"
  echo " [!] WARNING: $message" >&2
  VALIDATION_WARNINGS=$((VALIDATION_WARNINGS + 1))
}

#######################################
# Report an error and count it.
# Globals:   VALIDATION_ERRORS (written)
# Arguments: $1 - message
# Outputs:   message on stderr
# Returns:   0 always
#######################################
log_error() {
  local message="$1"
  echo " [✗] ERROR: $message" >&2
  VALIDATION_ERRORS=$((VALIDATION_ERRORS + 1))
}

# --- INTERNAL HELPERS ---

#######################################
# Convert a size in kB to whole GB (GiB), rounded to the nearest integer.
# Rounding (not truncating) matters because the kernel reports slightly less
# than the nominal RAM size, e.g. ~7.6 GiB on an "8GB" machine.
# Arguments: $1 - size in kB (non-negative integer)
# Outputs:   rounded GB value on stdout
# Returns:   0 on success, 1 if $1 is empty or not a non-negative integer
#######################################
_validation_kb_to_gb() {
  local kb="$1"
  [[ "$kb" =~ ^[0-9]+$ ]] || return 1
  # 10# forces base 10 so a leading zero is never read as octal.
  printf '%s\n' "$(((10#$kb + 524288) / 1048576))"
}

#######################################
# Derive the whole-disk name from a partition device name or path.
# Devices whose disk name itself ends in a digit (nvme0n1, mmcblk0, loop0,
# md0, nbd0) separate the partition number with "p"; for everything else the
# trailing partition digits are stripped (sda1 -> sda).
# Arguments: $1 - device name or path (e.g. /dev/nvme0n1p2, sda1)
# Outputs:   disk name without directory on stdout
#######################################
_validation_parent_disk() {
  local name="${1##*/}"
  local numbered_disk_re='^(nvme[0-9]+n[0-9]+|mmcblk[0-9]+|loop[0-9]+|md[0-9]+|nbd[0-9]+)(p[0-9]+)?$'
  local trailing_digits_re='^(.*[^0-9])[0-9]+$'

  if [[ "$name" =~ $numbered_disk_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  elif [[ "$name" =~ $trailing_digits_re ]]; then
    printf '%s\n' "${BASH_REMATCH[1]}"
  else
    printf '%s\n' "$name"
  fi
}

# --- DISK VALIDATION ---

#######################################
# Validate sufficient disk space for Docker and system operations.
# Critical: root partition must have at least 10GB free.
# Warning:  root partition should have at least 20GB free; a separately
#           mounted /var should have at least 20GB free.
# Returns:  0 if root has >= 10GB free, 1 otherwise or if undeterminable
#######################################
check_disk_space() {
  echo "[⚙] Checking disk space..." >&2

  local root_avail
  # -P keeps each filesystem on a single output line even when the device
  # name is long, so the NR==2 row always carries the numbers.
  root_avail=$(df -P -BG / 2>/dev/null | awk 'NR==2 {print $4}' | sed 's/G//')

  if ! [[ "$root_avail" =~ ^[0-9]+$ ]]; then
    log_error "Could not determine disk space"
    return 1
  fi

  if [ "$root_avail" -lt 10 ]; then
    log_error "Insufficient disk space on /: ${root_avail}GB (minimum 10GB required)"
    return 1
  elif [ "$root_avail" -lt 20 ]; then
    log_warning "Low disk space on /: ${root_avail}GB (recommend 20GB+)"
  else
    log_pass "Disk space on /: ${root_avail}GB available"
  fi

  # Check for separate /var partition (common on servers)
  if mountpoint -q /var 2>/dev/null; then
    local var_avail
    var_avail=$(df -P -BG /var 2>/dev/null | awk 'NR==2 {print $4}' | sed 's/G//')
    if ! [[ "$var_avail" =~ ^[0-9]+$ ]]; then
      log_warning "Could not determine disk space on /var"
    elif [ "$var_avail" -lt 20 ]; then
      log_warning "Low disk space on /var: ${var_avail}GB (Docker images will go here)"
    else
      log_pass "Disk space on /var: ${var_avail}GB available"
    fi
  fi

  return 0
}

#######################################
# Check if root is on SSD vs HDD (performance indicator), using the
# /sys/block/<disk>/queue/rotational flag of the disk backing "/".
# Returns: 0 always (informational check)
#######################################
check_disk_performance() {
  echo "[⚙] Checking disk performance characteristics..." >&2

  local root_device device_name
  root_device=$(df -P / 2>/dev/null | awk 'NR==2 {print $1}')
  device_name=$(_validation_parent_disk "$root_device")

  if [ -f "/sys/block/$device_name/queue/rotational" ]; then
    local rotational
    rotational=$(cat "/sys/block/$device_name/queue/rotational")
    if [ "$rotational" = "0" ]; then
      log_pass "Root filesystem on SSD ($device_name)"
    else
      log_warning "Root filesystem on HDD ($device_name) - SSD recommended for Docker"
    fi
  else
    log_warning "Could not determine disk type for $device_name"
  fi

  return 0
}

# --- MEMORY VALIDATION ---

#######################################
# Validate sufficient RAM for the detected deployment type.
# Minimums: proxmox 8GB, docker-vm/physical-docker 4GB,
#           ai-workstation 16GB, pi 2GB, anything else 2GB.
# MemTotal is rounded to the nearest GB before comparison.
# Returns: 0 if the minimum is met, 1 otherwise or if undeterminable
#######################################
check_memory() {
  echo "[⚙] Checking memory..." >&2

  local mem_total_kb mem_total_gb
  mem_total_kb=$(awk '/^MemTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null)

  if ! mem_total_gb=$(_validation_kb_to_gb "$mem_total_kb") ||
    [ "$mem_total_gb" -eq 0 ]; then
    log_error "Could not determine system memory"
    return 1
  fi

  local hardware_type
  hardware_type=$(detect_hardware_type)

  local min_required=2
  case "$hardware_type" in
    proxmox)
      min_required=8
      ;;
    docker-vm | physical-docker)
      min_required=4
      ;;
    ai-workstation)
      min_required=16
      ;;
    pi)
      min_required=2
      ;;
  esac

  if [ "$mem_total_gb" -lt "$min_required" ]; then
    log_error "Insufficient RAM: ${mem_total_gb}GB (minimum ${min_required}GB for $hardware_type)"
    return 1
  else
    log_pass "Memory: ${mem_total_gb}GB (meets ${min_required}GB minimum for $hardware_type)"
  fi

  return 0
}

#######################################
# Check swap configuration (important for memory-constrained systems).
# Proxmox hosts are expected to run without swap; all other hardware types
# get a warning when no swap is configured.
# Returns: 0 always (informational check)
#######################################
check_swap() {
  echo "[⚙] Checking swap configuration..." >&2

  local swap_total_kb
  swap_total_kb=$(awk '/^SwapTotal:/ {print $2; exit}' /proc/meminfo 2>/dev/null)
  if ! [[ "$swap_total_kb" =~ ^[0-9]+$ ]]; then
    log_warning "Could not determine swap configuration"
    return 0
  fi
  swap_total_kb=$((10#$swap_total_kb))

  # Decide on the kB value: dividing down to GB first would report any swap
  # smaller than 1GB as "no swap". Sub-GB sizes are displayed in MB.
  local swap_display
  if ((swap_total_kb >= 1048576)); then
    swap_display="$((swap_total_kb / 1048576))GB"
  else
    swap_display="$((swap_total_kb / 1024))MB"
  fi

  local hardware_type
  hardware_type=$(detect_hardware_type)

  if [ "$hardware_type" == "proxmox" ]; then
    # Proxmox hosts should NOT have swap enabled (best practice)
    if ((swap_total_kb > 0)); then
      log_warning "Swap enabled on Proxmox host (${swap_display}) - consider disabling"
    else
      log_pass "Swap disabled (correct for Proxmox)"
    fi
  else
    # Other systems benefit from swap
    if ((swap_total_kb == 0)); then
      log_warning "No swap configured - may cause OOM issues under load"
    else
      log_pass "Swap: ${swap_display} configured"
    fi
  fi

  return 0
}

# --- NETWORK VALIDATION ---

#######################################
# Validate network routing: a default route must exist and
# /etc/resolv.conf must be present (nameserver entries are recommended).
# Returns: 0 on success, 1 if there is no default route or no resolv.conf
#######################################
check_network_routes() {
  echo "[⚙] Checking network routing..." >&2

  # `ip route show default` exits 0 even when it prints nothing, so the
  # presence of a default route is decided by its output, not its status.
  local default_route
  default_route=$(ip route show default 2>/dev/null | awk '/^default/ {print; exit}')

  if [ -n "$default_route" ]; then
    local gateway
    gateway=$(awk '{print $3}' <<<"$default_route")
    log_pass "Default gateway configured: $gateway"
  else
    log_error "No default gateway configured"
    return 1
  fi

  # Check for DNS servers
  if [ -f /etc/resolv.conf ]; then
    local dns_count
    dns_count=$(grep -c "^nameserver" /etc/resolv.conf)
    if [ "$dns_count" -gt 0 ]; then
      log_pass "DNS servers configured ($dns_count entries)"
    else
      log_warning "No DNS servers in /etc/resolv.conf"
    fi
  else
    log_error "/etc/resolv.conf missing"
    return 1
  fi

  return 0
}

#######################################
# Validate hostname resolution (important for cluster operations):
# the hostname must be set, should resolve via `host` or `getent hosts`,
# and should have a 127.0.1.1 entry in /etc/hosts.
# Returns: 0 if a hostname is set, 1 otherwise
#######################################
check_hostname_resolution() {
  echo "[⚙] Checking hostname resolution..." >&2

  local hostname
  hostname=$(hostname)

  if [ -n "$hostname" ]; then
    log_pass "Hostname: $hostname"
  else
    log_error "Hostname not set"
    return 1
  fi

  # Check if hostname resolves
  if host "$hostname" &>/dev/null || getent hosts "$hostname" &>/dev/null; then
    log_pass "Hostname resolves"
  else
    log_warning "Hostname '$hostname' does not resolve - may cause cluster issues"
  fi

  # Check /etc/hosts
  if grep -q "127.0.1.1.*$hostname" /etc/hosts 2>/dev/null; then
    log_pass "Hostname in /etc/hosts"
  else
    log_warning "Hostname not in /etc/hosts - adding is recommended"
  fi

  return 0
}

# --- NFS CLIENT VALIDATION ---

#######################################
# Validate NFS client packages and kernel modules. If the nfs module is not
# loaded, an attempt is made to load it with `sudo modprobe nfs`.
# Returns: 0 if nfs-common is installed, 1 otherwise
#######################################
check_nfs_client() {
  echo "[⚙] Checking NFS client..." >&2

  # Check for NFS common package
  if dpkg -l 2>/dev/null | grep -q "^ii.*nfs-common"; then
    log_pass "nfs-common package installed"
  else
    log_warning "nfs-common not installed - required for NFS mounts"
    return 1
  fi

  # Check for NFS kernel modules
  if lsmod | grep -q "^nfs "; then
    log_pass "NFS kernel module loaded"
  else
    # Try to load it
    if sudo modprobe nfs 2>/dev/null; then
      log_pass "NFS kernel module loaded successfully"
    else
      log_warning "Could not load NFS kernel module"
    fi
  fi

  return 0
}

# --- DOCKER VALIDATION ---

#######################################
# Validate Docker installation and daemon health. A missing Docker binary
# or a stopped daemon is only a warning and ends the check early.
# Returns: 0 always
#######################################
check_docker_daemon() {
  echo "[⚙] Checking Docker installation..." >&2

  # Check if Docker is installed
  if ! command -v docker &>/dev/null; then
    log_warning "Docker not installed (will be installed during bootstrap)"
    return 0
  fi

  log_pass "Docker binary found: $(docker --version 2>/dev/null | head -n1)"

  # Check if daemon is running
  if systemctl is-active docker &>/dev/null; then
    log_pass "Docker daemon running"
  else
    log_warning "Docker daemon not running"
    return 0
  fi

  # Check Docker socket permissions
  if [ -S /var/run/docker.sock ]; then
    if sudo docker ps &>/dev/null; then
      log_pass "Docker socket accessible"
    else
      log_warning "Docker socket exists but not accessible"
    fi
  else
    log_warning "Docker socket not found"
  fi

  # Check storage driver
  local storage_driver
  storage_driver=$(docker info 2>/dev/null | awk '/Storage Driver/ {print $3}')
  if [ -n "$storage_driver" ]; then
    log_pass "Storage driver: $storage_driver"

    # Warn about devicemapper (deprecated)
    if [ "$storage_driver" == "devicemapper" ]; then
      log_warning "devicemapper storage driver is deprecated - consider overlay2"
    fi
  fi

  return 0
}

# --- PROXMOX VALIDATION ---

#######################################
# Validate Proxmox VE installation and API accessibility.
# Skipped silently (returns 0) when the detected hardware type is not
# "proxmox".
# Returns: 0 on success or skip, 1 if pveversion is missing
#######################################
check_proxmox_api() {
  local hardware_type
  hardware_type=$(detect_hardware_type)
  if [ "$hardware_type" != "proxmox" ]; then
    return 0 # Skip if not Proxmox
  fi

  echo "[⚙] Checking Proxmox VE..." >&2

  # Check for pveversion
  if command -v pveversion &>/dev/null; then
    local pve_version
    pve_version=$(pveversion | head -n1)
    log_pass "Proxmox installed: $pve_version"
  else
    log_error "Proxmox tools not found (pveversion missing)"
    return 1
  fi

  # Check cluster status
  if command -v pvesh &>/dev/null; then
    if sudo pvesh get /cluster/status &>/dev/null; then
      log_pass "Proxmox API accessible"
    else
      log_warning "Proxmox API not responding"
    fi
  fi

  # Check for required repositories
  if [ -f /etc/apt/sources.list.d/pve-no-subscription.list ]; then
    log_pass "No-subscription repository configured"
  else
    log_warning "No-subscription repository not configured"
  fi

  return 0
}

# --- SECURITY VALIDATION ---

#######################################
# Basic SSH security validation: the ssh/sshd service must be active;
# explicit "PasswordAuthentication no" and "PermitRootLogin no" lines in
# /etc/ssh/sshd_config are recommended.
# Returns: 0 if the SSH service is running, 1 otherwise
#######################################
check_ssh_security() {
  echo "[⚙] Checking SSH security..." >&2

  # Check if SSH is running
  if systemctl is-active ssh &>/dev/null || systemctl is-active sshd &>/dev/null; then
    log_pass "SSH service running"
  else
    log_error "SSH service not running"
    return 1
  fi

  if [ -f /etc/ssh/sshd_config ]; then
    # Check for password authentication (should be disabled in production)
    if grep -q "^PasswordAuthentication no" /etc/ssh/sshd_config; then
      log_pass "SSH password authentication disabled (secure)"
    else
      log_warning "SSH password authentication may be enabled - key-only is recommended"
    fi

    # Check for root login
    if grep -q "^PermitRootLogin no" /etc/ssh/sshd_config; then
      log_pass "SSH root login disabled (secure)"
    else
      log_warning "SSH root login may be enabled - consider disabling"
    fi
  fi

  return 0
}

#######################################
# Check firewall status (informational). Looks for an active ufw or
# firewalld service first, then falls back to counting iptables output.
# Returns: 0 always
#######################################
check_firewall() {
  echo "[⚙] Checking firewall..." >&2

  if systemctl is-active ufw &>/dev/null; then
    local ufw_status
    ufw_status=$(sudo ufw status 2>/dev/null | head -n1)
    log_pass "UFW active: $ufw_status"
  elif systemctl is-active firewalld &>/dev/null; then
    log_pass "firewalld active"
  elif command -v iptables &>/dev/null; then
    local iptables_rules
    iptables_rules=$(sudo iptables -L -n | wc -l)
    # NOTE(review): 8 is presumably the line count of an empty ruleset
    # (chain headers only) - confirm against the target distributions.
    if [ "$iptables_rules" -gt 8 ]; then
      log_pass "iptables rules configured ($iptables_rules lines)"
    else
      log_warning "No firewall detected - consider enabling UFW"
    fi
  else
    log_warning "No firewall detected"
  fi

  return 0
}

# --- TIME SYNCHRONIZATION ---

#######################################
# Validate NTP/timesyncd for cluster time synchronization.
# Returns: 0 always (informational check)
#######################################
check_time_sync() {
  echo "[⚙] Checking time synchronization..." >&2

  if systemctl is-active systemd-timesyncd &>/dev/null; then
    local ntp_status
    # Take the last field: the label is "NTP synchronized:" on older systemd
    # and "System clock synchronized:" on newer releases, so a fixed column
    # index only works for one of the two formats.
    ntp_status=$(timedatectl status 2>/dev/null | awk '/synchronized/ {print $NF; exit}')
    if [ "$ntp_status" == "yes" ]; then
      log_pass "Time synchronized via systemd-timesyncd"
    else
      log_warning "Time sync not confirmed"
    fi
  elif systemctl is-active ntp &>/dev/null || systemctl is-active ntpd &>/dev/null; then
    log_pass "NTP service running"
  else
    log_warning "No time synchronization service detected - critical for clusters"
  fi

  return 0
}

# --- COMPREHENSIVE VALIDATION SUITE ---

#######################################
# Run all validation checks and print a summary to stderr.
# Counters are reset first, so the summary reflects this run only.
# Returns: 0 if no critical errors, 1 if critical errors found
#######################################
run_validation_suite() {
  reset_validation_counters

  echo "======================================" >&2
  echo "SYSTEM VALIDATION SUITE" >&2
  echo "======================================" >&2

  # Run all checks (continue even if some fail); failures are tracked through
  # the counters, so individual return codes are intentionally ignored here.
  check_disk_space || true
  check_disk_performance || true
  check_memory || true
  check_swap || true
  check_network_routes || true
  check_hostname_resolution || true
  check_nfs_client || true
  check_docker_daemon || true
  check_proxmox_api || true
  check_ssh_security || true
  check_firewall || true
  check_time_sync || true

  # Summary
  echo "======================================" >&2
  echo "VALIDATION SUMMARY" >&2
  echo " Passed: $VALIDATION_PASSED" >&2
  echo " Warnings: $VALIDATION_WARNINGS" >&2
  echo " Errors: $VALIDATION_ERRORS" >&2
  echo "======================================" >&2

  if ((VALIDATION_ERRORS > 0)); then
    echo "[✗] CRITICAL: $VALIDATION_ERRORS validation errors - manual intervention required" >&2
    return 1
  elif ((VALIDATION_WARNINGS > 0)); then
    echo "[!] WARNINGS: $VALIDATION_WARNINGS issues detected - review recommended" >&2
    return 0
  else
    echo "[✓] ALL CHECKS PASSED" >&2
    return 0
  fi
}

# Export functions
export -f reset_validation_counters
export -f log_pass
export -f log_warning
export -f log_error
export -f _validation_kb_to_gb
export -f _validation_parent_disk
export -f check_disk_space
export -f check_disk_performance
export -f check_memory
export -f check_swap
export -f check_network_routes
export -f check_hostname_resolution
export -f check_nfs_client
export -f check_docker_daemon
export -f check_proxmox_api
export -f check_ssh_security
export -f check_firewall
export -f check_time_sync
export -f run_validation_suite