BREAKING CHANGE: day0bootstrap.sh deprecated in favor of bootstrap.sh - Add scripts/bootstrap.sh (488 lines): Unified entrypoint supporting multiple hardware types (Proxmox/Docker VMs/Pi) - Create scripts/lib/ modular library system: - detection.sh: OS/hardware/container detection (362 lines) - fingerprint.sh: System fingerprinting and inventory (494 lines) - network.sh: IP configuration and VLAN placement (356 lines) - proxmox.sh: PVE post-install automation (453 lines) - validation.sh: Comprehensive pre-flight checks (510 lines) - Add validation tools: validate-node.sh, onboarding.sh, pi_init.sh - Deprecate scripts/day0bootstrap.sh with graceful redirect wrapper - Document architecture in scripts/README.md (495 lines) and PROXMOX-COMPARISON.md - Update SOP-002 with new bootstrap workflow - Add nodes/watchtower/compose.yaml (Raspberry Pi 5 stack) Migration: Existing day0bootstrap.sh users automatically redirected to new system after 5-second warning. No manual intervention required. Ref: Infrastructure automation modernization per active-tasks.md
511 lines
15 KiB
Bash
511 lines
15 KiB
Bash
#!/bin/bash
|
|
|
|
# ==============================================================================
|
|
# VALIDATION LIBRARY: Comprehensive System Health Checks
|
|
# ==============================================================================
|
|
# Part of unified bootstrap system for homelab infrastructure
|
|
# Provides comprehensive pre-flight and post-bootstrap validation with
|
|
# severity levels (critical, warning, info) for operational readiness.
|
|
# ==============================================================================
|
|
|
|
# Load the sibling detection.sh unless its functions are already defined
# (detect_os_family is used as the "already loaded" marker).
# NOTE(review): this assigns the global SCRIPT_DIR and may overwrite a
# SCRIPT_DIR set by the script that sources this file - confirm callers.
type -t detect_os_family &>/dev/null || {
  SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
  # shellcheck source=./detection.sh
  source "${SCRIPT_DIR}/detection.sh"
}
|
|
|
|
# --- VALIDATION TRACKING ---
|
|
|
|
# Tallies for the current validation run, bumped by log_error, log_warning
# and log_pass respectively.
declare -g VALIDATION_ERRORS=0 VALIDATION_WARNINGS=0 VALIDATION_PASSED=0
|
|
|
|
# Zero the error/warning/pass tallies so a fresh validation run starts clean.
# Globals: VALIDATION_ERRORS, VALIDATION_WARNINGS, VALIDATION_PASSED (written)
reset_validation_counters() {
  local tally
  for tally in VALIDATION_ERRORS VALIDATION_WARNINGS VALIDATION_PASSED; do
    printf -v "$tally" '%d' 0
  done
}
|
|
|
|
# Report a passed check on stderr and count it.
# Arguments: $1 - description of what passed
# Globals:   VALIDATION_PASSED (incremented)
# Returns:   0
log_pass() {
  local message="${1:-}"
  echo " [✓] $message" >&2
  # Not ((VALIDATION_PASSED++)): post-increment evaluates to the old value, so
  # the first call (counter 0) would exit 1 and abort callers under `set -e`.
  VALIDATION_PASSED=$(( ${VALIDATION_PASSED:-0} + 1 ))
}
|
|
|
|
# Report a non-fatal finding on stderr and count it.
# Arguments: $1 - description of the concern
# Globals:   VALIDATION_WARNINGS (incremented)
# Returns:   0
log_warning() {
  local message="${1:-}"
  echo " [!] WARNING: $message" >&2
  # Not ((VALIDATION_WARNINGS++)): post-increment evaluates to the old value,
  # so the first call (counter 0) would exit 1 and trip `set -e` in callers.
  VALIDATION_WARNINGS=$(( ${VALIDATION_WARNINGS:-0} + 1 ))
}
|
|
|
|
# Report a critical finding on stderr and count it.
# Arguments: $1 - description of the failure
# Globals:   VALIDATION_ERRORS (incremented)
# Returns:   0
log_error() {
  local message="${1:-}"
  echo " [✗] ERROR: $message" >&2
  # Not ((VALIDATION_ERRORS++)): post-increment evaluates to the old value, so
  # the first call (counter 0) would exit 1 and trip `set -e` in callers.
  VALIDATION_ERRORS=$(( ${VALIDATION_ERRORS:-0} + 1 ))
}
|
|
|
|
# --- DISK VALIDATION ---
|
|
|
|
# Print the free space of the filesystem holding $1 in whole GiB, rounded
# down. Returns 1 (printing nothing) when df output cannot be parsed.
_validation_free_gb() {
  local avail_kb
  # -P keeps each filesystem on one line; -k reports 1K blocks so the floor is
  # computed here (df -BG rounds up, letting 9.1G satisfy a 10G minimum).
  avail_kb=$(df -Pk -- "$1" 2>/dev/null | awk 'NR==2 {print $4}') || return 1
  [[ "$avail_kb" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$(( avail_kb / 1048576 ))"
}

# Validate free disk space for Docker and system operations.
#   Critical: / has less than 10GB free (logs an error).
#   Warning:  / has less than 20GB free.
#   When /var is its own mount point it is checked against 20GB (warning only).
# Returns: 0 if / meets the minimum, 1 if it does not or cannot be measured.
check_disk_space() {
  echo "[⚙] Checking disk space..." >&2

  local root_avail
  if ! root_avail=$(_validation_free_gb /); then
    log_error "Could not determine disk space"
    return 1
  fi

  if (( root_avail < 10 )); then
    log_error "Insufficient disk space on /: ${root_avail}GB (minimum 10GB required)"
    return 1
  elif (( root_avail < 20 )); then
    log_warning "Low disk space on /: ${root_avail}GB (recommend 20GB+)"
  else
    log_pass "Disk space on /: ${root_avail}GB available"
  fi

  # Check for separate /var partition (common on servers)
  if mountpoint -q /var 2>/dev/null; then
    local var_avail
    if ! var_avail=$(_validation_free_gb /var); then
      # Previously an unparseable value fell through to a false "pass".
      log_warning "Could not determine disk space on /var"
    elif (( var_avail < 20 )); then
      log_warning "Low disk space on /var: ${var_avail}GB (Docker images will go here)"
    else
      log_pass "Disk space on /var: ${var_avail}GB available"
    fi
  fi

  return 0
}
|
|
|
|
# Map a block-device name (sda1, nvme0n1p2, mmcblk0p2, dm-0, ...) to the name
# of the whole disk that owns it and print it on stdout.
_validation_whole_disk() {
  local name="${1##*/}" sys_path
  # Preferred: sysfs marks partitions with a "partition" attribute and nests
  # their resolved directory under the parent disk's directory.
  if [[ -e "/sys/class/block/${name}/partition" ]] \
    && sys_path=$(readlink -f "/sys/class/block/${name}" 2>/dev/null) \
    && [[ -n "$sys_path" ]]; then
    sys_path="${sys_path%/*}"
    printf '%s\n' "${sys_path##*/}"
    return 0
  fi

  # Fallback on naming conventions. Disks whose own name ends in a digit use a
  # "p<N>" partition suffix; sd/hd/vd/xvd disks append the number directly.
  local p_suffix='^(nvme[0-9]+n[0-9]+|mmcblk[0-9]+|loop[0-9]+|md[0-9]+|nbd[0-9]+)p[0-9]+$'
  local digit_suffix='^((s|h|v|xv)d[a-z]+)[0-9]+$'
  if [[ "$name" =~ $p_suffix ]]; then
    name="${BASH_REMATCH[1]}"
  elif [[ "$name" =~ $digit_suffix ]]; then
    name="${BASH_REMATCH[1]}"
  fi
  printf '%s\n' "$name"
}

# Report whether the root filesystem sits on rotational (HDD) or
# non-rotational (SSD) storage, based on /sys/block/<disk>/queue/rotational.
# Informational only: logs a pass or a warning.
# Returns: 0 always.
check_disk_performance() {
  echo "[⚙] Checking disk performance characteristics..." >&2

  local root_device resolved device_name rotational
  root_device=$(df -P / 2>/dev/null | awk 'NR==2 {print $1}')

  # Follow /dev/mapper/* and /dev/disk/by-* style symlinks to the kernel name.
  if [[ "$root_device" == /* ]] \
    && resolved=$(readlink -f -- "$root_device" 2>/dev/null) \
    && [[ -n "$resolved" ]]; then
    root_device="$resolved"
  fi

  # Stripping trailing digits alone turned nvme0n1p2 into "nvme0n1p" and
  # mmcblk0p2 into "mmcblk0p", so those disks were never found in sysfs.
  device_name=$(_validation_whole_disk "$root_device")

  if [[ -f "/sys/block/${device_name}/queue/rotational" ]]; then
    rotational=$(<"/sys/block/${device_name}/queue/rotational")
    if [[ "$rotational" == "0" ]]; then
      log_pass "Root filesystem on SSD ($device_name)"
    else
      log_warning "Root filesystem on HDD ($device_name) - SSD recommended for Docker"
    fi
  else
    log_warning "Could not determine disk type for $device_name"
  fi

  return 0
}
|
|
|
|
# --- MEMORY VALIDATION ---
|
|
|
|
# Validate that installed RAM meets the minimum for the detected hardware type.
#   proxmox: 8GB, docker-vm/physical-docker: 4GB, ai-workstation: 16GB,
#   pi and anything else: 2GB.
# Arguments: $1 - (optional) meminfo-format file to read; default /proc/meminfo
# Returns:   0 if the minimum is met, 1 if not or if memory cannot be read.
check_memory() {
  local meminfo="${1:-/proc/meminfo}"

  echo "[⚙] Checking memory..." >&2

  local mem_total_kb
  mem_total_kb=$(awk '$1 == "MemTotal:" {print $2; exit}' "$meminfo" 2>/dev/null) || true

  # Validate before doing arithmetic: an empty value used to raise an
  # arithmetic syntax error instead of reaching the error message.
  if [[ ! "$mem_total_kb" =~ ^[0-9]+$ ]] || (( mem_total_kb == 0 )); then
    log_error "Could not determine system memory"
    return 1
  fi

  # Round to the nearest GiB. MemTotal is below the nominal module size, so
  # truncating made an "8GB" host read as 7GB and fail its own minimum.
  local mem_total_gb=$(( (mem_total_kb + 524288) / 1048576 ))

  local hardware_type
  hardware_type=$(detect_hardware_type) || true

  local min_required
  case "$hardware_type" in
    proxmox)                   min_required=8 ;;
    docker-vm|physical-docker) min_required=4 ;;
    ai-workstation)            min_required=16 ;;
    *)                         min_required=2 ;;  # pi and unrecognised types
  esac

  if (( mem_total_gb < min_required )); then
    log_error "Insufficient RAM: ${mem_total_gb}GB (minimum ${min_required}GB for $hardware_type)"
    return 1
  fi

  log_pass "Memory: ${mem_total_gb}GB (meets ${min_required}GB minimum for $hardware_type)"
  return 0
}
|
|
|
|
# Check swap configuration (informational: only passes and warnings).
#   proxmox hosts:  any swap is a warning, no swap is a pass.
#   everything else: no swap is a warning, any swap is a pass.
# Arguments: $1 - (optional) meminfo-format file to read; default /proc/meminfo
# Returns:   0 always.
check_swap() {
  local meminfo="${1:-/proc/meminfo}"

  echo "[⚙] Checking swap configuration..." >&2

  local swap_total_kb
  swap_total_kb=$(awk '$1 == "SwapTotal:" {print $2; exit}' "$meminfo" 2>/dev/null) || true
  if [[ ! "$swap_total_kb" =~ ^[0-9]+$ ]]; then
    log_warning "Could not determine swap configuration"
    return 0
  fi

  # Decide on the raw kB value: truncating to whole GB reported any swap area
  # under 1GB as "no swap". Sizes under 1GB are shown in MB for the same reason.
  local swap_size
  if (( swap_total_kb >= 1048576 )); then
    swap_size="$(( swap_total_kb / 1048576 ))GB"
  else
    swap_size="$(( swap_total_kb / 1024 ))MB"
  fi

  local hardware_type
  hardware_type=$(detect_hardware_type) || true

  if [[ "$hardware_type" == "proxmox" ]]; then
    # Proxmox hosts should NOT have swap enabled (best practice)
    if (( swap_total_kb > 0 )); then
      log_warning "Swap enabled on Proxmox host (${swap_size}) - consider disabling"
    else
      log_pass "Swap disabled (correct for Proxmox)"
    fi
  else
    # Other systems benefit from swap
    if (( swap_total_kb == 0 )); then
      log_warning "No swap configured - may cause OOM issues under load"
    else
      log_pass "Swap: ${swap_size} configured"
    fi
  fi

  return 0
}
|
|
|
|
# --- NETWORK VALIDATION ---
|
|
|
|
# Validate basic routing and resolver configuration.
#   Error:   no default route, or the resolver file is missing.
#   Warning: resolver file has no nameserver entries.
# Arguments: $1 - (optional) resolver config to inspect; default /etc/resolv.conf
# Returns:   0 on success, 1 when an error was logged.
check_network_routes() {
  local resolv_conf="${1:-/etc/resolv.conf}"

  echo "[⚙] Checking network routing..." >&2

  # `ip route show default` exits 0 even when no default route exists, so the
  # output (not the exit status) decides whether a route is configured.
  local default_route
  default_route=$(ip route show default 2>/dev/null | awk '/^default/ {print; exit}') || true
  if [[ -z "$default_route" ]]; then
    log_error "No default gateway configured"
    return 1
  fi

  # Prefer the "via <addr>" next hop; an on-link default route has only "dev".
  local -a fields=()
  read -ra fields <<<"$default_route"
  local gateway="" i
  for (( i = 0; i < ${#fields[@]} - 1; i++ )); do
    case "${fields[i]}" in
      via)
        gateway="${fields[i + 1]}"
        break
        ;;
      dev)
        [[ -n "$gateway" ]] || gateway="${fields[i + 1]} (on-link)"
        ;;
    esac
  done
  log_pass "Default gateway configured: $gateway"

  # Check for DNS servers
  if [[ ! -f "$resolv_conf" ]]; then
    log_error "${resolv_conf} missing"
    return 1
  fi

  local dns_count
  # grep -c prints 0 but exits 1 when nothing matches; normalise both cases.
  dns_count=$(grep -c '^[[:space:]]*nameserver[[:space:]]' "$resolv_conf" 2>/dev/null) || true
  [[ "$dns_count" =~ ^[0-9]+$ ]] || dns_count=0

  if (( dns_count > 0 )); then
    log_pass "DNS servers configured ($dns_count entries)"
  else
    log_warning "No DNS servers in ${resolv_conf}"
  fi

  return 0
}
|
|
|
|
# Validate that the node has a hostname, that it resolves, and that it is
# listed in the hosts file.
#   Error:   hostname is empty.
#   Warning: hostname does not resolve / has no hosts-file entry.
# Arguments: $1 - (optional) hosts file to inspect; default /etc/hosts
# Returns:   0 unless the hostname is empty (1).
check_hostname_resolution() {
  local hosts_file="${1:-/etc/hosts}"

  echo "[⚙] Checking hostname resolution..." >&2

  local node_name
  node_name=$(hostname 2>/dev/null) || true

  if [[ -z "$node_name" ]]; then
    log_error "Hostname not set"
    return 1
  fi
  log_pass "Hostname: $node_name"

  # Check if hostname resolves (system resolver first, then DNS via host)
  if getent hosts "$node_name" &>/dev/null || host "$node_name" &>/dev/null; then
    log_pass "Hostname resolves"
  else
    log_warning "Hostname '$node_name' does not resolve - may cause cluster issues"
  fi

  # Check the hosts file. The name is compared as a whole field on any
  # non-comment line: interpolating it into a regex let "pi" match "pi5", and
  # accepting only 127.0.1.1 lines missed entries bound to the node's real IP.
  if awk -v want="$node_name" '
      BEGIN { want = tolower(want) }
      { sub(/#.*/, "") }
      { for (i = 2; i <= NF; i++) if (tolower($i) == want) { found = 1; exit } }
      END { exit !found }
    ' "$hosts_file" 2>/dev/null; then
    log_pass "Hostname in /etc/hosts"
  else
    log_warning "Hostname not in /etc/hosts - adding is recommended"
  fi

  return 0
}
|
|
|
|
# --- NFS CLIENT VALIDATION ---
|
|
|
|
# Validate the NFS client: nfs-common package plus the nfs kernel module.
# If the module is not loaded, one attempt is made to load it with modprobe.
# Returns: 1 when nfs-common is not installed (logged as a warning), else 0.
check_nfs_client() {
  echo "[⚙] Checking NFS client..." >&2

  # Query the one package directly. `dpkg -l | grep -q` lists every package
  # and can fail via SIGPIPE under `set -o pipefail`; its regex also matched
  # any installed package whose name merely contains "nfs-common".
  local pkg_status
  pkg_status=$(dpkg-query -W -f='${Status}' nfs-common 2>/dev/null) || pkg_status=""
  if [[ "$pkg_status" == "install ok installed" ]]; then
    log_pass "nfs-common package installed"
  else
    log_warning "nfs-common not installed - required for NFS mounts"
    return 1
  fi

  # Capture lsmod once and match on the captured text (no early-exit pipe).
  local loaded_modules
  loaded_modules=$(lsmod 2>/dev/null) || loaded_modules=""
  if grep -q '^nfs ' <<<"$loaded_modules"; then
    log_pass "NFS kernel module loaded"
    return 0
  fi

  # Use sudo only when not already root; root-only hosts may not have sudo.
  local -a as_root=()
  (( EUID == 0 )) || as_root=(sudo)

  # ${arr[@]+...} keeps the empty array safe under `set -u` on older bash.
  if ${as_root[@]+"${as_root[@]}"} modprobe nfs 2>/dev/null; then
    log_pass "NFS kernel module loaded successfully"
  else
    log_warning "Could not load NFS kernel module"
  fi

  return 0
}
|
|
|
|
# --- DOCKER VALIDATION ---
|
|
|
|
# Validate Docker installation and daemon health (informational: only passes
# and warnings). Stops early when Docker is missing or the daemon is stopped.
# Returns: 0 always.
check_docker_daemon() {
  echo "[⚙] Checking Docker installation..." >&2

  # Check if Docker is installed
  if ! command -v docker &>/dev/null; then
    log_warning "Docker not installed (will be installed during bootstrap)"
    return 0
  fi
  log_pass "Docker binary found: $(docker --version 2>/dev/null | head -n1)"

  # Check if daemon is running
  if ! systemctl is-active docker &>/dev/null; then
    log_warning "Docker daemon not running"
    return 0
  fi
  log_pass "Docker daemon running"

  # Use sudo only when not already root; root-only hosts may not have sudo.
  local -a as_root=()
  (( EUID == 0 )) || as_root=(sudo)

  # Check Docker socket permissions
  if [[ -S /var/run/docker.sock ]]; then
    # ${arr[@]+...} keeps the empty array safe under `set -u` on older bash.
    if ${as_root[@]+"${as_root[@]}"} docker ps &>/dev/null; then
      log_pass "Docker socket accessible"
    else
      log_warning "Docker socket exists but not accessible"
    fi
  else
    log_warning "Docker socket not found"
  fi

  # Check storage driver. Run with the same privileges as `docker ps` above
  # (an unprivileged `docker info` silently skipped this check) and ask for
  # the field directly instead of parsing the human-readable report.
  local storage_driver
  storage_driver=$(${as_root[@]+"${as_root[@]}"} docker info --format '{{.Driver}}' 2>/dev/null) \
    || storage_driver=""
  if [[ -n "$storage_driver" ]]; then
    log_pass "Storage driver: $storage_driver"

    # Warn about devicemapper (deprecated)
    if [[ "$storage_driver" == "devicemapper" ]]; then
      log_warning "devicemapper storage driver is deprecated - consider overlay2"
    fi
  fi

  return 0
}
|
|
|
|
# --- PROXMOX VALIDATION ---
|
|
|
|
# Validate the Proxmox VE installation and API accessibility.
# Does nothing (and logs nothing) unless detect_hardware_type reports "proxmox".
#   Error:   pveversion is missing.
#   Warning: pvesh cannot reach the API / no-subscription repo not configured.
# Returns: 1 when pveversion is missing, otherwise 0.
check_proxmox_api() {
  local hardware_type
  hardware_type=$(detect_hardware_type) || true

  if [[ "$hardware_type" != "proxmox" ]]; then
    return 0 # Skip if not Proxmox
  fi

  echo "[⚙] Checking Proxmox VE..." >&2

  # Check for pveversion
  if ! command -v pveversion &>/dev/null; then
    log_error "Proxmox tools not found (pveversion missing)"
    return 1
  fi
  local pve_version
  pve_version=$(pveversion | head -n1)
  log_pass "Proxmox installed: $pve_version"

  # Check cluster status
  if command -v pvesh &>/dev/null; then
    # Use sudo only when not already root: on a root-only host without sudo
    # the unconditional `sudo pvesh` failed and was reported as a dead API.
    local -a as_root=()
    (( EUID == 0 )) || as_root=(sudo)

    # ${arr[@]+...} keeps the empty array safe under `set -u` on older bash.
    if ${as_root[@]+"${as_root[@]}"} pvesh get /cluster/status &>/dev/null; then
      log_pass "Proxmox API accessible"
    else
      log_warning "Proxmox API not responding"
    fi
  fi

  # Check for required repositories. Besides the conventional file name, accept
  # any uncommented pve-no-subscription entry in the APT source files
  # (one-line .list files and deb822 .sources files).
  if [[ -f /etc/apt/sources.list.d/pve-no-subscription.list ]] \
    || grep -Eqs '^[^#]*pve-no-subscription' \
      /etc/apt/sources.list \
      /etc/apt/sources.list.d/*.list \
      /etc/apt/sources.list.d/*.sources; then
    log_pass "No-subscription repository configured"
  else
    log_warning "No-subscription repository not configured"
  fi

  return 0
}
|
|
|
|
# --- SECURITY VALIDATION ---
|
|
|
|
# Print the first global value (lower-cased) of sshd keyword $1 found in the
# files given as $2..., scanned in order. Settings inside Match blocks are
# conditional and therefore ignored. Prints nothing when the keyword is absent.
_validation_sshd_option() {
  local keyword="$1"
  shift
  awk -v want="${keyword,,}" '
    FNR == 1 { in_match = 0 }
    /^[[:space:]]*#/ { next }
    { gsub(/=/, " ") }
    NF == 0 { next }
    tolower($1) == "match" { in_match = 1; next }
    !in_match && tolower($1) == want { print tolower($2); exit }
  ' "$@" 2>/dev/null
}

# Basic SSH security validation.
#   Error:   no ssh/sshd service (or ssh.socket) is active.
#   Warning: PasswordAuthentication / PermitRootLogin are not explicitly "no".
# Arguments: $1 - (optional) sshd config to inspect; default /etc/ssh/sshd_config.
#                 Drop-ins are read from "<config>.d/*.conf".
# Returns:   1 when SSH is not running, otherwise 0.
check_ssh_security() {
  local sshd_config="${1:-/etc/ssh/sshd_config}"

  echo "[⚙] Checking SSH security..." >&2

  # Check if SSH is running. ssh.socket covers socket-activated setups where
  # the service unit itself stays inactive until a connection arrives.
  if systemctl is-active ssh &>/dev/null \
    || systemctl is-active sshd &>/dev/null \
    || systemctl is-active ssh.socket &>/dev/null; then
    log_pass "SSH service running"
  else
    log_error "SSH service not running"
    return 1
  fi

  if [[ ! -f "$sshd_config" ]]; then
    return 0
  fi

  # sshd uses the first value it reads for a keyword. Drop-ins are consulted
  # only when the main file has an Include directive.
  # NOTE(review): assumes the Include sits at the top of the main file (the
  # Debian/Ubuntu layout), so drop-ins are read first - confirm elsewhere.
  local -a config_files=()
  if grep -Eiq '^[[:space:]]*Include[[:space:]=]' "$sshd_config" 2>/dev/null; then
    local dropin
    for dropin in "${sshd_config}.d"/*.conf; do
      if [[ -f "$dropin" ]]; then
        config_files+=("$dropin")
      fi
    done
  fi
  config_files+=("$sshd_config")

  # Check for password authentication (should be disabled in production).
  # Keywords are matched case-insensitively with any whitespace or "=", which
  # the previous anchored, case-sensitive grep did not tolerate.
  local password_auth
  password_auth=$(_validation_sshd_option PasswordAuthentication "${config_files[@]}") || true
  if [[ "$password_auth" == "no" ]]; then
    log_pass "SSH password authentication disabled (secure)"
  else
    log_warning "SSH password authentication may be enabled - key-only is recommended"
  fi

  # Check for root login
  local root_login
  root_login=$(_validation_sshd_option PermitRootLogin "${config_files[@]}") || true
  if [[ "$root_login" == "no" ]]; then
    log_pass "SSH root login disabled (secure)"
  else
    log_warning "SSH root login may be enabled - consider disabling"
  fi

  return 0
}
|
|
|
|
# Check firewall status (informational: only passes and warnings).
# Looks for, in order: an enabled UFW, an active firewalld, then a non-empty
# iptables ruleset.
# Returns: 0 always.
check_firewall() {
  echo "[⚙] Checking firewall..." >&2

  # Use sudo only when not already root; root-only hosts may not have sudo.
  # ${arr[@]+...} below keeps the empty array safe under `set -u` on older bash.
  local -a as_root=()
  (( EUID == 0 )) || as_root=(sudo)

  # An active ufw.service does not prove the firewall is enabled, so the
  # reported status line must agree before UFW counts as a pass.
  # LC_ALL=C keeps that status line untranslated.
  local ufw_status=""
  if systemctl is-active ufw &>/dev/null; then
    ufw_status=$(LC_ALL=C ${as_root[@]+"${as_root[@]}"} ufw status 2>/dev/null | head -n1) || true
  fi

  if [[ "$ufw_status" == "Status: active" ]]; then
    log_pass "UFW active: $ufw_status"
  elif systemctl is-active firewalld &>/dev/null; then
    log_pass "firewalld active"
  elif command -v iptables &>/dev/null; then
    local iptables_rules
    iptables_rules=$(${as_root[@]+"${as_root[@]}"} iptables -L -n | wc -l) || true
    # More than 8 lines is taken to mean rules beyond the three empty built-in
    # chains - presumably the size of an empty filter table listing.
    if (( iptables_rules > 8 )); then
      log_pass "iptables rules configured ($iptables_rules lines)"
    else
      log_warning "No firewall detected - consider enabling UFW"
    fi
  else
    log_warning "No firewall detected"
  fi

  return 0
}
|
|
|
|
# --- TIME SYNCHRONIZATION ---
|
|
|
|
# Validate that a time synchronization service is active (informational: only
# passes and warnings). Recognises systemd-timesyncd, chrony and ntp/ntpd.
# Returns: 0 always.
check_time_sync() {
  echo "[⚙] Checking time synchronization..." >&2

  if systemctl is-active systemd-timesyncd &>/dev/null; then
    local ntp_status
    # Prefer the machine-readable property.
    ntp_status=$(timedatectl show --property=NTPSynchronized --value 2>/dev/null) \
      || ntp_status=""
    if [[ -z "$ntp_status" ]]; then
      # Fall back to the human-readable report. The label is "NTP synchronized:"
      # or "System clock synchronized:" depending on the systemd release, so take
      # the value after the colon: a fixed `$3` returned "synchronized:" for the
      # longer label and sync was never confirmed.
      ntp_status=$(LC_ALL=C timedatectl status 2>/dev/null \
        | awk -F': *' '/synchronized:/ {print $2; exit}') || true
    fi

    if [[ "$ntp_status" == "yes" ]]; then
      log_pass "Time synchronized via systemd-timesyncd"
    else
      log_warning "Time sync not confirmed"
    fi
  elif systemctl is-active chrony &>/dev/null || systemctl is-active chronyd &>/dev/null; then
    # chrony was previously reported as "no time synchronization service"
    log_pass "chrony service running"
  elif systemctl is-active ntp &>/dev/null || systemctl is-active ntpd &>/dev/null; then
    log_pass "NTP service running"
  else
    log_warning "No time synchronization service detected - critical for clusters"
  fi

  return 0
}
|
|
|
|
# --- COMPREHENSIVE VALIDATION SUITE ---
|
|
|
|
# Run every validation check in a fixed order, then print a summary to stderr.
# A failing check never stops the run; the verdict comes from the counters.
# Globals: VALIDATION_PASSED, VALIDATION_WARNINGS, VALIDATION_ERRORS (reset,
#          then read after the checks have run)
# Returns: 1 if any error was counted, otherwise 0 (warnings included).
run_validation_suite() {
  local rule="======================================"
  local -a suite=(
    check_disk_space
    check_disk_performance
    check_memory
    check_swap
    check_network_routes
    check_hostname_resolution
    check_nfs_client
    check_docker_daemon
    check_proxmox_api
    check_ssh_security
    check_firewall
    check_time_sync
  )
  local step

  reset_validation_counters

  printf '%s\n' "$rule" "SYSTEM VALIDATION SUITE" "$rule" >&2

  # Individual failures are swallowed on purpose so every check gets to run.
  for step in "${suite[@]}"; do
    "$step" || true
  done

  printf '%s\n' \
    "$rule" \
    "VALIDATION SUMMARY" \
    " Passed: $VALIDATION_PASSED" \
    " Warnings: $VALIDATION_WARNINGS" \
    " Errors: $VALIDATION_ERRORS" \
    "$rule" >&2

  if (( VALIDATION_ERRORS > 0 )); then
    echo "[✗] CRITICAL: $VALIDATION_ERRORS validation errors - manual intervention required" >&2
    return 1
  fi

  if (( VALIDATION_WARNINGS > 0 )); then
    echo "[!] WARNINGS: $VALIDATION_WARNINGS issues detected - review recommended" >&2
  else
    echo "[✓] ALL CHECKS PASSED" >&2
  fi
  return 0
}
|
|
|
|
# Export functions
|
|
# Make the library's functions available to child bash processes.
export -f \
  reset_validation_counters \
  log_pass \
  log_warning \
  log_error \
  check_disk_space \
  check_disk_performance \
  check_memory \
  check_swap \
  check_network_routes \
  check_hostname_resolution \
  check_nfs_client \
  check_docker_daemon \
  check_proxmox_api \
  check_ssh_security \
  check_firewall \
  check_time_sync \
  run_validation_suite
|