homelab/scripts/lib/validation.sh
nathan e16f98a183 feat(bootstrap)!: introduce unified bootstrap system with modular libraries
BREAKING CHANGE: day0bootstrap.sh deprecated in favor of bootstrap.sh

- Add scripts/bootstrap.sh (488 lines): Unified entrypoint supporting multiple hardware types (Proxmox/Docker VMs/Pi)
- Create scripts/lib/ modular library system:
  - detection.sh: OS/hardware/container detection (362 lines)
  - fingerprint.sh: System fingerprinting and inventory (494 lines)
  - network.sh: IP configuration and VLAN placement (356 lines)
  - proxmox.sh: PVE post-install automation (453 lines)
  - validation.sh: Comprehensive pre-flight checks (510 lines)
- Add validation tools: validate-node.sh, onboarding.sh, pi_init.sh
- Deprecate scripts/day0bootstrap.sh with graceful redirect wrapper
- Document architecture in scripts/README.md (495 lines) and PROXMOX-COMPARISON.md
- Update SOP-002 with new bootstrap workflow
- Add nodes/watchtower/compose.yaml (Raspberry Pi 5 stack)

Migration: Existing day0bootstrap.sh users automatically redirected to new system after 5-second warning. No manual intervention required.

Ref: Infrastructure automation modernization per active-tasks.md
2026-04-12 22:48:19 -04:00

511 lines
15 KiB
Bash

#!/bin/bash
# ==============================================================================
# VALIDATION LIBRARY: Comprehensive System Health Checks
# ==============================================================================
# Part of unified bootstrap system for homelab infrastructure
# Provides comprehensive pre-flight and post-bootstrap validation with
# severity levels (critical, warning, info) for operational readiness.
# ==============================================================================
# Source detection library if not already loaded
# Pull in the sibling detection.sh unless detect_os_family is already known
# to the shell (i.e. something else has loaded the detection helpers).
# NOTE(review): this assigns the global SCRIPT_DIR, replacing any value the
# sourcing script keeps under the same name - confirm callers do not rely on it.
type -t detect_os_family &>/dev/null || {
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # shellcheck source=./detection.sh
  source "${SCRIPT_DIR}/detection.sh"
}
# --- VALIDATION TRACKING ---
# Tallies updated by log_error / log_warning / log_pass. `-g` keeps them
# global even when this file is sourced from inside a function.
declare -g VALIDATION_ERRORS=0 VALIDATION_WARNINGS=0 VALIDATION_PASSED=0
reset_validation_counters() {
  # Zero all three validation tallies so a new run starts from a clean slate.
  local counter
  for counter in VALIDATION_ERRORS VALIDATION_WARNINGS VALIDATION_PASSED; do
    printf -v "$counter" '%s' 0
  done
}
log_pass() {
  # Report a passed check on stderr and bump VALIDATION_PASSED.
  # Arguments: $1 - human-readable description of what passed
  # Returns:   0 always
  local message="${1-}"
  printf ' [✓] %s\n' "$message" >&2
  # Plain assignment instead of ((VAR++)): the post-increment form evaluates
  # to 0 on the first call, which makes the function return 1 and kills any
  # caller running under `set -e`.
  VALIDATION_PASSED=$((VALIDATION_PASSED + 1))
}
log_warning() {
  # Report a non-fatal finding on stderr and bump VALIDATION_WARNINGS.
  # Arguments: $1 - human-readable description of the concern
  # Returns:   0 always
  local message="${1-}"
  printf ' [!] WARNING: %s\n' "$message" >&2
  # Plain assignment instead of ((VAR++)): the post-increment form evaluates
  # to 0 on the first call, which makes the function return 1 and kills any
  # caller running under `set -e`.
  VALIDATION_WARNINGS=$((VALIDATION_WARNINGS + 1))
}
log_error() {
  # Report a critical finding on stderr and bump VALIDATION_ERRORS.
  # Arguments: $1 - human-readable description of the failure
  # Returns:   0 always (the failure is recorded in the counter, not the status)
  local message="${1-}"
  printf ' [✗] ERROR: %s\n' "$message" >&2
  # Plain assignment instead of ((VAR++)): the post-increment form evaluates
  # to 0 on the first call, which makes the function return 1 and kills any
  # caller running under `set -e`.
  VALIDATION_ERRORS=$((VALIDATION_ERRORS + 1))
}
# --- DISK VALIDATION ---
check_disk_space() {
  # Validate free disk space for Docker and system operations.
  #   Critical: / must have at least 10GB free (logs an error, returns 1)
  #   Warning:  / should have at least 20GB free
  #   If /var is its own mount point it is checked against 20GB (warning only).
  # Returns: 0 unless / is below the critical minimum or cannot be measured.
  echo "[⚙] Checking disk space..." >&2

  # -P forces one line per filesystem, so the NR==2 / $4 addressing stays
  # valid for any device name. Declaration and assignment are split so the
  # substitution's status is not masked by `local`.
  local root_avail
  root_avail=$(df -P -BG / | awk 'NR==2 {sub(/G$/, "", $4); print $4}')

  # Reject anything that is not a plain integer. The previous `[ -lt ]` test
  # errored out on such values and fell through to the "pass" branch.
  if [[ ! "$root_avail" =~ ^[0-9]+$ ]]; then
    log_error "Could not determine disk space"
    return 1
  fi

  if (( 10#$root_avail < 10 )); then
    log_error "Insufficient disk space on /: ${root_avail}GB (minimum 10GB required)"
    return 1
  elif (( 10#$root_avail < 20 )); then
    log_warning "Low disk space on /: ${root_avail}GB (recommend 20GB+)"
  else
    log_pass "Disk space on /: ${root_avail}GB available"
  fi

  # Check for separate /var partition (common on servers)
  if mountpoint -q /var 2>/dev/null; then
    local var_avail
    var_avail=$(df -P -BG /var | awk 'NR==2 {sub(/G$/, "", $4); print $4}')
    if [[ ! "$var_avail" =~ ^[0-9]+$ ]]; then
      log_warning "Could not determine disk space on /var"
    elif (( 10#$var_avail < 20 )); then
      log_warning "Low disk space on /var: ${var_avail}GB (Docker images will go here)"
    else
      log_pass "Disk space on /var: ${var_avail}GB available"
    fi
  fi

  return 0
}
check_disk_performance() {
  # Report whether the root filesystem sits on rotational (HDD) or
  # non-rotational (SSD) storage, using /sys/block/<disk>/queue/rotational.
  # Informational only.
  # Returns: 0 always
  echo "[⚙] Checking disk performance characteristics..." >&2

  local root_source device_name sysfs_node parent_name
  # Partition suffix is "p<N>" on disks whose own name ends in a digit.
  local p_suffix_re='^(nvme[0-9]+n[0-9]+|mmcblk[0-9]+|loop[0-9]+|md[0-9]+|nbd[0-9]+)p[0-9]+$'
  # Classic letter-named disks carry the partition number directly.
  local plain_suffix_re='^((sd|hd|vd|xvd)[a-z]+)[0-9]+$'

  root_source=$(df -P / | awk 'NR==2 {print $1}')

  # Resolve /dev symlinks (e.g. /dev/mapper/<vg>-<lv>) to the kernel device
  # node so its name can be looked up in sysfs.
  if [[ "$root_source" == /dev/* ]]; then
    device_name=$(readlink -f -- "$root_source" 2>/dev/null) || device_name=""
    [[ -n "$device_name" ]] || device_name="$root_source"
  else
    device_name="$root_source"
  fi
  device_name="${device_name##*/}"

  # A partition has no queue/ directory of its own; map it to its parent
  # disk. Stripping trailing digits alone turns nvme0n1p2 into "nvme0n1p" and
  # mmcblk0p2 into "mmcblk0p", neither of which exists under /sys/block.
  if [[ -n "$device_name" && ! -f "/sys/block/${device_name}/queue/rotational" ]]; then
    if [[ -e "/sys/class/block/${device_name}" ]]; then
      # A partition's sysfs node lives inside its parent disk's directory.
      sysfs_node=$(readlink -f -- "/sys/class/block/${device_name}" 2>/dev/null)
      parent_name="${sysfs_node%/*}"
      parent_name="${parent_name##*/}"
      if [[ -n "$parent_name" && -d "/sys/block/${parent_name}" ]]; then
        device_name="$parent_name"
      fi
    elif [[ "$device_name" =~ $p_suffix_re ]]; then
      device_name="${BASH_REMATCH[1]}"
    elif [[ "$device_name" =~ $plain_suffix_re ]]; then
      device_name="${BASH_REMATCH[1]}"
    fi
  fi

  local rotational_file="/sys/block/${device_name}/queue/rotational"
  if [[ -f "$rotational_file" ]]; then
    local rotational
    rotational=$(<"$rotational_file")
    if [[ "$rotational" == "0" ]]; then
      log_pass "Root filesystem on SSD ($device_name)"
    else
      log_warning "Root filesystem on HDD ($device_name) - SSD recommended for Docker"
    fi
  else
    log_warning "Could not determine disk type for $device_name"
  fi

  return 0
}
# --- MEMORY VALIDATION ---
check_memory() {
  # Validate that the machine has enough RAM for its deployment type.
  #   proxmox: 8GB, docker-vm / physical-docker: 4GB, ai-workstation: 16GB,
  #   pi and anything unrecognised: 2GB.
  # Arguments: $1 - meminfo file to read (optional, default /proc/meminfo)
  # Returns:   0 when the minimum is met, 1 when it is not or the total
  #            cannot be read.
  local meminfo_file="${1:-/proc/meminfo}"
  echo "[⚙] Checking memory..." >&2

  local mem_total_kb
  mem_total_kb=$(awk '$1 == "MemTotal:" {print $2; exit}' "$meminfo_file" 2>/dev/null)
  # Validate before doing arithmetic: an empty value would be a syntax error
  # inside $(( )), and the result of $(( )) itself can never be empty.
  if [[ ! "$mem_total_kb" =~ ^[0-9]+$ ]] || (( 10#$mem_total_kb == 0 )); then
    log_error "Could not determine system memory"
    return 1
  fi

  # Round to the nearest GiB instead of truncating. MemTotal is the usable
  # RAM, which is a little below the installed size, so truncation reports a
  # nominal 8GB host as 7GB and fails it against its own minimum.
  local mem_total_gb=$(( (10#$mem_total_kb + 524288) / 1048576 ))

  local hardware_type
  hardware_type=$(detect_hardware_type)

  local min_required
  case "$hardware_type" in
    proxmox)
      min_required=8
      ;;
    docker-vm | physical-docker)
      min_required=4
      ;;
    ai-workstation)
      min_required=16
      ;;
    *)
      # pi and any unknown hardware type
      min_required=2
      ;;
  esac

  if (( mem_total_gb < min_required )); then
    log_error "Insufficient RAM: ${mem_total_gb}GB (minimum ${min_required}GB for $hardware_type)"
    return 1
  fi

  log_pass "Memory: ${mem_total_gb}GB (meets ${min_required}GB minimum for $hardware_type)"
  return 0
}
check_swap() {
  # Check swap configuration against the deployment type: a Proxmox host is
  # expected to have none, every other type is expected to have some.
  # Findings are warnings only.
  # Arguments: $1 - meminfo file to read (optional, default /proc/meminfo)
  # Returns:   0 always
  local meminfo_file="${1:-/proc/meminfo}"
  echo "[⚙] Checking swap configuration..." >&2

  local swap_total_kb
  swap_total_kb=$(awk '$1 == "SwapTotal:" {print $2; exit}' "$meminfo_file" 2>/dev/null)
  if [[ ! "$swap_total_kb" =~ ^[0-9]+$ ]]; then
    log_warning "Could not determine swap configuration"
    return 0
  fi
  swap_total_kb=$((10#$swap_total_kb))

  # Decide on the raw kB figure. Truncating to whole GiB first made any swap
  # area smaller than 1GiB look like "no swap at all".
  local swap_display
  if (( swap_total_kb >= 1048576 )); then
    swap_display="$((swap_total_kb / 1048576))GB"
  else
    swap_display="$((swap_total_kb / 1024))MB"
  fi

  local hardware_type
  hardware_type=$(detect_hardware_type)

  if [[ "$hardware_type" == "proxmox" ]]; then
    # Proxmox hosts should NOT have swap enabled (best practice)
    if (( swap_total_kb > 0 )); then
      log_warning "Swap enabled on Proxmox host (${swap_display}) - consider disabling"
    else
      log_pass "Swap disabled (correct for Proxmox)"
    fi
  else
    # Other systems benefit from swap
    if (( swap_total_kb == 0 )); then
      log_warning "No swap configured - may cause OOM issues under load"
    else
      log_pass "Swap: ${swap_display} configured"
    fi
  fi

  return 0
}
# --- NETWORK VALIDATION ---
check_network_routes() {
  # Validate basic network configuration: a default route must exist and the
  # resolver configuration should list at least one nameserver.
  # Arguments: $1 - resolver file to inspect (optional, default /etc/resolv.conf)
  # Returns:   0 on success, 1 when there is no default route or the resolver
  #            file is missing.
  local resolv_conf="${1:-/etc/resolv.conf}"
  echo "[⚙] Checking network routing..." >&2

  # `ip route show default` exits 0 even when it prints nothing, so the exit
  # status says nothing about whether a default route exists - look at the
  # output instead.
  local default_route gateway
  default_route=$(ip route show default 2>/dev/null | awk '$1 == "default" {print; exit}')
  if [[ -z "$default_route" ]]; then
    log_error "No default gateway configured"
    return 1
  fi

  # Take the address after "via"; a fixed field number would pick up the
  # interface name for on-link routes such as "default dev tun0".
  gateway=$(awk '{for (i = 2; i < NF; i++) if ($i == "via") {print $(i + 1); exit}}' <<<"$default_route")
  if [[ -n "$gateway" ]]; then
    log_pass "Default gateway configured: $gateway"
  else
    log_pass "Default route configured without a gateway hop: $default_route"
  fi

  # Check for DNS servers
  if [[ ! -f "$resolv_conf" ]]; then
    log_error "/etc/resolv.conf missing"
    return 1
  fi

  local dns_count
  # grep -c prints 0 (and exits 1) when nothing matches; that is not an error here.
  dns_count=$(grep -c '^nameserver[[:space:]]' "$resolv_conf" 2>/dev/null)
  if (( ${dns_count:-0} > 0 )); then
    log_pass "DNS servers configured ($dns_count entries)"
  else
    log_warning "No DNS servers in /etc/resolv.conf"
  fi

  return 0
}
check_hostname_resolution() {
  # Validate that the machine has a hostname, that the name resolves, and
  # that it is listed in the hosts file (important for cluster operations).
  # Arguments: $1 - hosts file to inspect (optional, default /etc/hosts)
  # Returns:   0 on success (missing resolution is only a warning),
  #            1 when no hostname is set.
  local hosts_file="${1:-/etc/hosts}"
  echo "[⚙] Checking hostname resolution..." >&2

  local host_name
  host_name=$(hostname)

  if [[ -z "$host_name" ]]; then
    log_error "Hostname not set"
    return 1
  fi
  log_pass "Hostname: $host_name"

  # Check if hostname resolves (getent consults the system resolver
  # configuration; `host` is tried as a second opinion when it is installed).
  if getent hosts "$host_name" &>/dev/null || host "$host_name" &>/dev/null; then
    log_pass "Hostname resolves"
  else
    log_warning "Hostname '$host_name' does not resolve - may cause cluster issues"
  fi

  # Check the hosts file: the hostname must appear as a whole name field on a
  # non-comment line, whatever address it is mapped to. Matching it as an
  # unanchored regex let "pi" match "pihole" and only accepted 127.0.1.1.
  if awk -v wanted="${host_name,,}" '
      { sub(/#.*/, "") }
      { for (i = 2; i <= NF; i++) if (tolower($i) == wanted) { found = 1; exit } }
      END { exit !found }
    ' "$hosts_file" 2>/dev/null; then
    log_pass "Hostname in /etc/hosts"
  else
    log_warning "Hostname not in /etc/hosts - adding is recommended"
  fi

  return 0
}
# --- NFS CLIENT VALIDATION ---
check_nfs_client() {
  # Validate the NFS client: the nfs-common package must be installed and
  # the nfs kernel module should be loaded (loading it is attempted if not).
  # Returns: 0 when the package is installed, 1 when it is not.
  echo "[⚙] Checking NFS client..." >&2

  # Ask dpkg about this exact package. Grepping the full `dpkg -l` listing
  # for "nfs-common" also matched any other installed package whose line
  # merely contained that text.
  local pkg_status
  pkg_status=$(dpkg-query -W -f='${Status}' nfs-common 2>/dev/null)
  if [[ "$pkg_status" == "install ok installed" ]]; then
    log_pass "nfs-common package installed"
  else
    log_warning "nfs-common not installed - required for NFS mounts"
    return 1
  fi

  # Check for NFS kernel modules
  if lsmod 2>/dev/null | awk '$1 == "nfs" { found = 1 } END { exit !found }'; then
    log_pass "NFS kernel module loaded"
  else
    # Only go through sudo when it is needed and present: when already root
    # on a host without the sudo binary, `sudo modprobe` fails for the wrong
    # reason.
    local -a as_root=()
    if (( EUID != 0 )) && command -v sudo &>/dev/null; then
      as_root=(sudo)
    fi
    # Try to load it
    if "${as_root[@]}" modprobe nfs 2>/dev/null; then
      log_pass "NFS kernel module loaded successfully"
    else
      log_warning "Could not load NFS kernel module"
    fi
  fi

  return 0
}
# --- DOCKER VALIDATION ---
check_docker_daemon() {
  # Validate the Docker installation: binary present, daemon active, socket
  # usable, and storage driver in use. Every finding is a pass or a warning;
  # a missing Docker is expected before bootstrap.
  # Returns: 0 always
  echo "[⚙] Checking Docker installation..." >&2

  # Check if Docker is installed
  if ! command -v docker &>/dev/null; then
    log_warning "Docker not installed (will be installed during bootstrap)"
    return 0
  fi
  log_pass "Docker binary found: $(docker --version 2>/dev/null | head -n1)"

  # Check if daemon is running
  if ! systemctl is-active docker &>/dev/null; then
    log_warning "Docker daemon not running"
    return 0
  fi
  log_pass "Docker daemon running"

  # Only go through sudo when it is needed and present, so the checks also
  # work when already root on a host that has no sudo binary.
  local -a as_root=()
  if (( EUID != 0 )) && command -v sudo &>/dev/null; then
    as_root=(sudo)
  fi

  # Check Docker socket permissions
  if [[ -S /var/run/docker.sock ]]; then
    if "${as_root[@]}" docker ps &>/dev/null; then
      log_pass "Docker socket accessible"
    else
      log_warning "Docker socket exists but not accessible"
    fi
  else
    log_warning "Docker socket not found"
  fi

  # Check storage driver. Query the field directly rather than scraping the
  # human-readable report, and use the same privileges as the socket check
  # so the two results cannot disagree.
  local storage_driver
  storage_driver=$("${as_root[@]}" docker info --format '{{.Driver}}' 2>/dev/null)
  if [[ -n "$storage_driver" ]]; then
    log_pass "Storage driver: $storage_driver"
    # Warn about devicemapper (deprecated)
    if [[ "$storage_driver" == "devicemapper" ]]; then
      log_warning "devicemapper storage driver is deprecated - consider overlay2"
    fi
  fi

  return 0
}
# --- PROXMOX VALIDATION ---
check_proxmox_api() {
  # Validate the Proxmox VE installation: tooling present, API answering,
  # and the no-subscription package repository configured.
  # Does nothing on hardware that detect_hardware_type does not report as
  # "proxmox".
  # Returns: 0 when skipped or when the tooling is present, 1 when
  #          pveversion is missing on a Proxmox host.
  local hardware_type
  hardware_type=$(detect_hardware_type)
  if [[ "$hardware_type" != "proxmox" ]]; then
    return 0 # Skip if not Proxmox
  fi

  echo "[⚙] Checking Proxmox VE..." >&2

  # Check for pveversion
  if ! command -v pveversion &>/dev/null; then
    log_error "Proxmox tools not found (pveversion missing)"
    return 1
  fi
  local pve_version
  pve_version=$(pveversion | head -n1)
  log_pass "Proxmox installed: $pve_version"

  # Check cluster status
  if command -v pvesh &>/dev/null; then
    # Only go through sudo when it is needed and present. When already root
    # on a host without the sudo binary, an unconditional `sudo pvesh` fails
    # and is misreported as "API not responding".
    local -a as_root=()
    if (( EUID != 0 )) && command -v sudo &>/dev/null; then
      as_root=(sudo)
    fi
    if "${as_root[@]}" pvesh get /cluster/status &>/dev/null; then
      log_pass "Proxmox API accessible"
    else
      log_warning "Proxmox API not responding"
    fi
  fi

  # Check for required repositories. Look for an uncommented
  # pve-no-subscription entry in any APT source file instead of requiring
  # one specific file name; -s silences globs that match nothing.
  if grep -Eqs '^[^#]*pve-no-subscription' \
    /etc/apt/sources.list \
    /etc/apt/sources.list.d/*.list \
    /etc/apt/sources.list.d/*.sources; then
    log_pass "No-subscription repository configured"
  else
    log_warning "No-subscription repository not configured"
  fi

  return 0
}
# --- SECURITY VALIDATION ---
check_ssh_security() {
  # Basic SSH hardening validation: the service must be running, and
  # password authentication and root login should both be set to "no".
  # Arguments: $1 - sshd configuration file (optional, default
  #                 /etc/ssh/sshd_config); drop-ins are read from "<file>.d/"
  # Returns:   0 on success (weak settings are only warnings),
  #            1 when the SSH service is not running.
  local sshd_config="${1:-/etc/ssh/sshd_config}"
  echo "[⚙] Checking SSH security..." >&2

  # Check if SSH is running
  if systemctl is-active ssh &>/dev/null || systemctl is-active sshd &>/dev/null; then
    log_pass "SSH service running"
  else
    log_error "SSH service not running"
    return 1
  fi

  # Without a configuration file there is nothing further to inspect.
  [[ -f "$sshd_config" ]] || return 0

  # Prefer the configuration as sshd itself resolves it (`sshd -T`). That
  # usually needs root, so fall back to reading the files when it yields
  # nothing.
  local settings=""
  if command -v sshd &>/dev/null; then
    settings=$(sshd -T -f "$sshd_config" 2>/dev/null)
  fi
  if [[ -z "$settings" ]]; then
    local config_file
    # Drop-ins are read before the main file because sshd keeps the first
    # value it sees for a keyword.
    # NOTE(review): assumes the Include line sits at the top of the main
    # file, as in the stock Debian/Ubuntu layout - confirm for other setups.
    for config_file in "${sshd_config}.d"/*.conf "$sshd_config"; do
      [[ -r "$config_file" ]] || continue
      # Stop at the first Match block: what follows is conditional.
      settings+=$(awk 'tolower($1) == "match" { exit } { print }' "$config_file")$'\n'
    done
  fi

  # Keywords are compared case-insensitively with any amount of whitespace
  # before the value; an exact-case, single-space match missed valid settings.
  local password_auth root_login
  password_auth=$(awk 'tolower($1) == "passwordauthentication" { print tolower($2); exit }' <<<"$settings")
  root_login=$(awk 'tolower($1) == "permitrootlogin" { print tolower($2); exit }' <<<"$settings")

  # Check for password authentication (should be disabled in production)
  if [[ "$password_auth" == "no" ]]; then
    log_pass "SSH password authentication disabled (secure)"
  else
    log_warning "SSH password authentication may be enabled - key-only is recommended"
  fi

  # Check for root login
  if [[ "$root_login" == "no" ]]; then
    log_pass "SSH root login disabled (secure)"
  else
    log_warning "SSH root login may be enabled - consider disabling"
  fi

  return 0
}
check_firewall() {
  # Check firewall status (informational): UFW, then firewalld, then a
  # line-count heuristic on the iptables listing.
  # Returns: 0 always
  echo "[⚙] Checking firewall..." >&2

  # `iptables -L -n` prints 8 lines for an empty ruleset (three built-in
  # chains: two header lines each plus two blank separators), so anything
  # longer means at least one rule or extra chain exists.
  local -r empty_listing_lines=8

  # Only go through sudo when it is needed and present. When already root on
  # a host without the sudo binary, an unconditional `sudo iptables` fails
  # and is misreported as "no firewall".
  local -a as_root=()
  if (( EUID != 0 )) && command -v sudo &>/dev/null; then
    as_root=(sudo)
  fi

  if systemctl is-active ufw &>/dev/null; then
    local ufw_status
    ufw_status=$("${as_root[@]}" ufw status 2>/dev/null | head -n1)
    log_pass "UFW active: $ufw_status"
  elif systemctl is-active firewalld &>/dev/null; then
    log_pass "firewalld active"
  elif command -v iptables &>/dev/null; then
    local iptables_rules
    iptables_rules=$("${as_root[@]}" iptables -L -n | wc -l)
    # Normalise through arithmetic: some wc implementations pad the count.
    iptables_rules=$((iptables_rules))
    if (( iptables_rules > empty_listing_lines )); then
      log_pass "iptables rules configured ($iptables_rules lines)"
    else
      log_warning "No firewall detected - consider enabling UFW"
    fi
  else
    log_warning "No firewall detected"
  fi

  return 0
}
# --- TIME SYNCHRONIZATION ---
check_time_sync() {
  # Validate that a time synchronization service is running (clusters need
  # consistent clocks). With systemd-timesyncd the actual sync state is
  # checked too; for ntp/ntpd/chrony only the service state is checked.
  # Returns: 0 always (findings are warnings)
  echo "[⚙] Checking time synchronization..." >&2

  if systemctl is-active systemd-timesyncd &>/dev/null; then
    local sync_state
    # Machine-readable property first; `timedatectl show` does not exist on
    # older systemd, in which case this yields nothing.
    sync_state=$(timedatectl show --property=NTPSynchronized --value 2>/dev/null)
    if [[ -z "$sync_state" ]]; then
      # Fall back to the human-readable report. The label is either
      # "NTP synchronized:" or "System clock synchronized:", so take what
      # follows the colon rather than a fixed word position - with the longer
      # label the third word is "synchronized:", never "yes".
      sync_state=$(timedatectl status 2>/dev/null |
        awk -F': *' '/synchronized:/ {print $2; exit}')
    fi
    if [[ "$sync_state" == "yes" ]]; then
      log_pass "Time synchronized via systemd-timesyncd"
    else
      log_warning "Time sync not confirmed"
    fi
    return 0
  fi

  # Other NTP implementations, under the unit names used by common distros.
  local service
  for service in ntp ntpd chrony chronyd; do
    if systemctl is-active "$service" &>/dev/null; then
      log_pass "NTP service running"
      return 0
    fi
  done

  log_warning "No time synchronization service detected - critical for clusters"
  return 0
}
# --- COMPREHENSIVE VALIDATION SUITE ---
run_validation_suite() {
  # Entry point: reset the counters, execute every check in a fixed order,
  # print a summary to stderr.
  # Returns: 1 when at least one error was counted, 0 otherwise (warnings
  #          alone do not fail the suite).
  local rule="======================================"
  local check_fn

  reset_validation_counters

  {
    echo "$rule"
    echo "SYSTEM VALIDATION SUITE"
    echo "$rule"
  } >&2

  # A failing check must not stop the run; its result is already recorded in
  # the counters, so the status is deliberately discarded.
  for check_fn in \
    check_disk_space \
    check_disk_performance \
    check_memory \
    check_swap \
    check_network_routes \
    check_hostname_resolution \
    check_nfs_client \
    check_docker_daemon \
    check_proxmox_api \
    check_ssh_security \
    check_firewall \
    check_time_sync; do
    "$check_fn" || true
  done

  {
    echo "$rule"
    echo "VALIDATION SUMMARY"
    echo " Passed: $VALIDATION_PASSED"
    echo " Warnings: $VALIDATION_WARNINGS"
    echo " Errors: $VALIDATION_ERRORS"
    echo "$rule"
  } >&2

  if (( VALIDATION_ERRORS > 0 )); then
    echo "[✗] CRITICAL: $VALIDATION_ERRORS validation errors - manual intervention required" >&2
    return 1
  fi

  if (( VALIDATION_WARNINGS > 0 )); then
    echo "[!] WARNINGS: $VALIDATION_WARNINGS issues detected - review recommended" >&2
  else
    echo "[✓] ALL CHECKS PASSED" >&2
  fi
  return 0
}
# Export functions
# Mark every function of this library for export so that child bash
# processes inherit them.
export -f \
  reset_validation_counters \
  log_pass \
  log_warning \
  log_error \
  check_disk_space \
  check_disk_performance \
  check_memory \
  check_swap \
  check_network_routes \
  check_hostname_resolution \
  check_nfs_client \
  check_docker_daemon \
  check_proxmox_api \
  check_ssh_security \
  check_firewall \
  check_time_sync \
  run_validation_suite