BREAKING CHANGE: day0bootstrap.sh deprecated in favor of bootstrap.sh - Add scripts/bootstrap.sh (488 lines): Unified entrypoint supporting multiple hardware types (Proxmox/Docker VMs/Pi) - Create scripts/lib/ modular library system: - detection.sh: OS/hardware/container detection (362 lines) - fingerprint.sh: System fingerprinting and inventory (494 lines) - network.sh: IP configuration and VLAN placement (356 lines) - proxmox.sh: PVE post-install automation (453 lines) - validation.sh: Comprehensive pre-flight checks (510 lines) - Add validation tools: validate-node.sh, onboarding.sh, pi_init.sh - Deprecate scripts/day0bootstrap.sh with graceful redirect wrapper - Document architecture in scripts/README.md (495 lines) and PROXMOX-COMPARISON.md - Update SOP-002 with new bootstrap workflow - Add nodes/watchtower/compose.yaml (Raspberry Pi 5 stack) Migration: Existing day0bootstrap.sh users automatically redirected to new system after 5-second warning. No manual intervention required. Ref: Infrastructure automation modernization per active-tasks.md
357 lines
10 KiB
Bash
357 lines
10 KiB
Bash
#!/bin/bash

# ==============================================================================
# NETWORK LIBRARY: Network Configuration and Validation
# ==============================================================================
# Part of the unified bootstrap system for homelab infrastructure.
# Handles static IP configuration via netplan, network validation, and
# VLAN capability detection for future network segmentation.
# ==============================================================================

# Load the detection library unless it is already loaded.
# "Already loaded" is judged by detect_primary_interface being resolvable.
# detection.sh is looked up next to this file; the directory is derived from
# BASH_SOURCE so the lookup does not depend on the caller's working directory.
# NOTE: SCRIPT_DIR is a plain global and is (re)assigned here when the
# library has to be loaded.
if ! type -t detect_primary_interface &>/dev/null; then
    SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
    if [ ! -r "${SCRIPT_DIR}/detection.sh" ]; then
        echo "ERROR: Required library not found: ${SCRIPT_DIR}/detection.sh" >&2
        # Stop loading when sourced; "return" is rejected (and ignored here)
        # when the file is executed directly.
        return 1 2>/dev/null || true
    fi
    # shellcheck source=./detection.sh
    source "${SCRIPT_DIR}/detection.sh"
fi

# --- NETWORK CONFIGURATION ---

apply_static_ip() {
    # Write a static-IP netplan configuration (Ubuntu/Debian) for the
    # primary network interface. The file is only written here; nothing is
    # activated by this function.
    # Args: $1 = Target IP (required, dotted-quad IPv4; a /24 prefix is used)
    #       $2 = Gateway (default: 10.0.0.1)
    #       $3 = DNS (default: 10.0.0.2; 8.8.8.8 is always listed second)
    # Returns: 0 when /etc/netplan/01-netcfg.yaml was written,
    #          1 on invalid input, unknown interface or write failure.

    local target_ip="$1"
    local gateway="${2:-10.0.0.1}"
    local dns="${3:-10.0.0.2}"

    if [ -z "$target_ip" ]; then
        echo "ERROR: Target IP address required" >&2
        return 1
    fi

    # Validate every address that ends up in the YAML. The shape check alone
    # would accept out-of-range octets such as 999.1.1.1.
    local addr octet
    local -a octets
    for addr in "$target_ip" "$gateway" "$dns"; do
        if ! [[ "$addr" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
            echo "ERROR: Invalid IP address format: $addr" >&2
            return 1
        fi
        IFS=. read -r -a octets <<<"$addr"
        for octet in "${octets[@]}"; do
            # 10# forces base 10 so an octet like "08" is not read as octal
            if ((10#$octet > 255)); then
                echo "ERROR: Invalid IP address format: $addr" >&2
                return 1
            fi
        done
    done

    # "|| true": a failing detector is handled by the value check below
    # instead of tripping a caller's "set -e".
    local interface
    interface=$(detect_primary_interface) || true
    if [ -z "$interface" ] || [ "$interface" == "unknown" ]; then
        echo "ERROR: Could not detect primary network interface" >&2
        return 1
    fi

    echo "[⚙] Configuring static IP: $target_ip on $interface..." >&2

    # Fix permissions on existing netplan files (common issue).
    # Best effort: the globs may match nothing.
    sudo chmod 600 /lib/netplan/*.yaml 2>/dev/null || true
    sudo chmod 600 /etc/netplan/*.yaml 2>/dev/null || true

    # Create netplan directory if missing
    if ! sudo mkdir -p /etc/netplan; then
        echo "ERROR: Could not create /etc/netplan" >&2
        return 1
    fi

    # Generate netplan configuration. The heredoc body must start in column 0
    # and keep its two-space YAML nesting.
    if ! sudo tee /etc/netplan/01-netcfg.yaml >/dev/null <<EOF
network:
  version: 2
  renderer: networkd
  ethernets:
    $interface:
      addresses:
        - ${target_ip}/24
      nameservers:
        addresses: [${dns}, 8.8.8.8]
      routes:
        - to: default
          via: ${gateway}
EOF
    then
        echo "ERROR: Failed to write /etc/netplan/01-netcfg.yaml" >&2
        return 1
    fi

    # Fix permissions (netplan requires 600)
    sudo chmod 600 /etc/netplan/01-netcfg.yaml

    echo "[✓] Netplan configuration created" >&2
    return 0
}
apply_network_changes() {
    # Validate and apply the netplan configuration.
    # WARNING: may cause SSH disconnect.
    # The apply runs as a background job so the caller is not blocked. The
    # job ignores SIGHUP so that a hang-up of the calling session is not
    # supposed to interrupt it half-way.
    # Returns: 0 once the apply has been started (its outcome is not awaited),
    #          1 if "netplan generate" rejects the configuration.

    echo "[⚙] Applying network configuration (SSH may disconnect)..." >&2

    # Test configuration first; keep the diagnostics so a failure is actionable
    local generate_output
    if ! generate_output=$(sudo netplan generate 2>&1); then
        echo "ERROR: netplan configuration validation failed" >&2
        if [ -n "$generate_output" ]; then
            printf '%s\n' "$generate_output" >&2
        fi
        return 1
    fi

    # Apply in background to avoid blocking SSH
    (
        trap '' HUP
        sudo netplan apply
    ) &
    local apply_pid=$!

    echo "[✓] Network apply started (PID: $apply_pid)" >&2
    echo "[!] SSH connection will drop. Reconnect to new IP address." >&2

    # Give it a moment to start
    sleep 2

    return 0
}
configure_network_safe() {
    # Safe wrapper: write the static-IP configuration, apply it and print
    # reconnection instructions. Does nothing when the current IP already
    # equals the target.
    # Args: $1 = Target IP, $2 = Gateway (optional), $3 = DNS (optional)
    #       All arguments are forwarded unchanged to apply_static_ip.
    # Returns: 0 if already configured or the apply was started,
    #          1 if the target is missing or either step fails.

    local target_ip="$1"

    # Without this guard an empty target compared against an empty current
    # IP would be reported as "already configured".
    if [ -z "$target_ip" ]; then
        echo "ERROR: Target IP address required" >&2
        return 1
    fi

    # get_current_ip is defined outside this section of the file
    local current_ip
    current_ip=$(get_current_ip) || true

    # Check if already configured
    if [ "$current_ip" == "$target_ip" ]; then
        echo "[✓] IP already configured as $target_ip, skipping" >&2
        return 0
    fi

    # Configure
    if ! apply_static_ip "$@"; then
        return 1
    fi

    # Apply; do not print the success banner when this step fails
    if ! apply_network_changes; then
        echo "ERROR: Failed to apply network configuration" >&2
        return 1
    fi

    echo "" >&2
    echo "=========================================" >&2
    echo "Network configuration applied" >&2
    echo "Old IP: $current_ip" >&2
    echo "New IP: $target_ip" >&2
    echo "Reconnect with: ssh user@$target_ip" >&2
    echo "=========================================" >&2

    return 0
}

# --- VLAN CONFIGURATION (PLACEHOLDER) ---

get_desired_vlan_ip() {
    # Print the desired IP address for this node based on its hardware type,
    # as reported by detect_hardware_type.
    # Addresses follow the topology in environment-constraints.md.
    # Outputs: one IPv4 address on stdout.
    # TODO: Enable when VLAN segmentation is live. Planned placement:
    #   - Proxmox hosts → 10.0.10.x (infra VLAN)
    #   - Swarm VMs     → 10.0.200.x (compute VLAN)
    #   - Control nodes → 10.0.0.x (main VLAN)

    # "|| true": an unrecognised or failed detection falls through to the
    # default arm below instead of tripping a caller's "set -e".
    local hardware_type
    hardware_type=$(detect_hardware_type) || true

    # For now, return flat network assignment
    case "$hardware_type" in
        proxmox)
            # Currently: 10.0.0.200-209
            # Desired: 10.0.10.11-13 (future VLAN)
            echo "10.0.0.201" # Placeholder
            ;;
        docker-vm)
            # Currently: 10.0.0.210-229
            # Desired: 10.0.200.11+ (future VLAN)
            echo "10.0.0.211" # Placeholder
            ;;
        pi | physical-docker)
            # Control nodes stay on main VLAN
            echo "10.0.0.200"
            ;;
        ai-workstation)
            # Currently: 10.0.0.230-239
            # Desired: 10.0.200.x (future VLAN)
            echo "10.0.0.230" # Placeholder
            ;;
        *)
            echo "10.0.0.200" # Safe default
            ;;
    esac
}
check_vlan_support() {
    # Check whether 802.1Q VLAN tagging is available on this host.
    # Side effect: if the 8021q module is not listed by lsmod, this tries to
    # load it with "sudo modprobe", and it stays loaded on success.
    # Returns 0 if supported, 1 otherwise (including when no primary
    # interface can be detected).

    local interface
    interface=$(detect_primary_interface) || true

    if [ -z "$interface" ] || [ "$interface" == "unknown" ]; then
        return 1
    fi

    # Check for 802.1Q VLAN support in kernel modules.
    # The listing is captured first rather than piped into "grep -q": with
    # pipefail enabled, grep exiting early can make the pipeline report failure.
    local loaded_modules
    loaded_modules=$(lsmod 2>/dev/null) || true
    if grep -q '^8021q[[:space:]]' <<<"$loaded_modules"; then
        return 0
    fi

    # Check if module can be loaded
    if sudo modprobe 8021q 2>/dev/null; then
        return 0
    fi

    return 1
}

# --- NETWORK VALIDATION ---

validate_connectivity() {
    # Test basic network connectivity: default gateway, internet reachability
    # (ICMP to 8.8.8.8) and DNS resolution.
    # Gateway and internet failures count as errors; a DNS failure is only
    # reported as a warning.
    # Returns 0 if healthy, 1 otherwise

    local errors=0

    echo "[⚙] Validating network connectivity..." >&2

    # Test default gateway
    local gateway
    gateway=$(ip route show default 2>/dev/null | awk '/^default/ {print $3; exit}') || true
    if [ -n "$gateway" ]; then
        if ping -c 2 -W 3 "$gateway" &>/dev/null; then
            echo " [✓] Gateway reachable: $gateway" >&2
        else
            echo " [✗] Gateway unreachable: $gateway" >&2
            # Plain assignment: "((errors++))" returns status 1 when the old
            # value is 0, which aborts callers running under "set -e".
            errors=$((errors + 1))
        fi
    else
        echo " [✗] No default gateway configured" >&2
        errors=$((errors + 1))
    fi

    # Test internet connectivity
    if ping -c 2 -W 3 8.8.8.8 &>/dev/null; then
        echo " [✓] Internet connectivity (8.8.8.8)" >&2
    else
        echo " [✗] No internet connectivity" >&2
        errors=$((errors + 1))
    fi

    # Test DNS resolution. "host" is not installed everywhere, so fall back
    # to getent rather than reporting a missing tool as a DNS problem.
    local dns_ok=0
    if command -v host &>/dev/null; then
        if host google.com &>/dev/null; then
            dns_ok=1
        fi
    elif getent hosts google.com &>/dev/null; then
        dns_ok=1
    fi

    if [ "$dns_ok" -eq 1 ]; then
        echo " [✓] DNS resolution working" >&2
    else
        echo " [!] DNS resolution issue (warning)" >&2
    fi

    if [ "$errors" -eq 0 ]; then
        echo "[✓] Network validation passed" >&2
        return 0
    else
        echo "[✗] Network validation failed ($errors errors)" >&2
        return 1
    fi
}
check_nfs_accessibility() {
    # Test NFS server accessibility (TerraMaster NAS)
    # Args: $1 = NFS server IP (default: 10.0.0.250)
    # Checks ICMP reachability first, then TCP port 2049.
    # Returns: 0 if the server answers ping and port 2049 accepts connections
    #          (or no tool is available to probe the port), 1 otherwise.

    local nfs_server="${1:-10.0.0.250}"

    echo "[⚙] Checking NFS server accessibility ($nfs_server)..." >&2

    # Test basic connectivity via ping
    if ! ping -c 2 -W 3 "$nfs_server" &>/dev/null; then
        echo " [✗] NFS server unreachable: $nfs_server" >&2
        return 1
    fi

    echo " [✓] NFS server reachable" >&2

    # Test the NFS port (2049). Prefer nc; without it, fall back to bash's
    # /dev/tcp redirection so the port check is not skipped silently.
    local port_open
    if command -v nc &>/dev/null; then
        if nc -z -w 3 "$nfs_server" 2049 2>/dev/null; then
            port_open=yes
        else
            port_open=no
        fi
    elif command -v timeout &>/dev/null; then
        # The server is passed as a positional argument, never interpolated
        # into the command string.
        if timeout 3 bash -c 'exec 3<>"/dev/tcp/$1/2049"' _ "$nfs_server" 2>/dev/null; then
            port_open=yes
        else
            port_open=no
        fi
    else
        echo " [!] No tool available to probe port 2049, skipping port check" >&2
        return 0
    fi

    if [ "$port_open" == "yes" ]; then
        echo " [✓] NFS service responding (port 2049)" >&2
    else
        echo " [✗] NFS port 2049 closed" >&2
        return 1
    fi

    return 0
}
test_internal_hairpin_nat() {
    # Test for hairpin NAT issues (lessons-learned.md #3)
    # Internal hosts should NOT route through public DNS
    # Resolves the domain through 8.8.8.8 and pings the resulting public
    # address from inside the network.
    # Args: $1 = domain to test (default: castaldifamily.com)
    # Returns: always 0 - the outcome is advisory and only reported on stderr.

    local test_domain="${1:-castaldifamily.com}"

    echo "[⚙] Testing for hairpin NAT issues..." >&2

    # Get public IP of domain (first A record; CNAME lines are filtered out).
    # "|| true": a missing dig or an empty answer is handled by the check below.
    local public_ip
    public_ip=$(dig +short "$test_domain" @8.8.8.8 2>/dev/null | grep -E '^[0-9.]+$' | head -n1) || true

    if [ -z "$public_ip" ]; then
        echo " [!] Could not resolve $test_domain, skipping test" >&2
        return 0
    fi

    # Try to ping public IP from inside network (should fail on hairpin NAT routers)
    if ping -c 2 -W 2 "$public_ip" &>/dev/null; then
        echo " [✓] No hairpin NAT issue detected" >&2
    else
        echo " [!] Possible hairpin NAT - use internal IPs (10.0.0.x) for node-to-node" >&2
    fi

    return 0 # Warning, not error
}

# --- NETWORK RENDERER DETECTION ---

detect_network_renderer() {
    # Detect network configuration system: netplan, networkd, NetworkManager
    # Checked in that order; the first match wins.
    # Outputs: "netplan", "networkd", "NetworkManager" or "unknown" on stdout.
    # Returns: 0 when a renderer was identified, 1 for "unknown".

    # netplan needs both its config directory and its CLI
    if [ -d /etc/netplan ] && command -v netplan &>/dev/null; then
        echo "netplan"
        return 0
    fi

    if systemctl is-active systemd-networkd &>/dev/null; then
        echo "networkd"
        return 0
    fi

    if systemctl is-active NetworkManager &>/dev/null; then
        echo "NetworkManager"
        return 0
    fi

    echo "unknown"
    return 1
}

# --- WAIT FOR NETWORK ---

wait_for_network() {
    # Wait for network to stabilize after configuration change.
    # Probes 8.8.8.8 with a single ping once per attempt, sleeping one second
    # between attempts.
    # Args: $1 = timeout in seconds (default: 10); counts attempts, so the
    #            wall-clock time also includes each ping's own wait.
    # Returns: 0 as soon as a ping succeeds, 1 if none did.

    local timeout="${1:-10}"
    local elapsed=0

    # A non-numeric value would make the loop test itself error out
    if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then
        echo "[!] Invalid timeout '$timeout', using 10s" >&2
        timeout=10
    fi

    echo "[⚙] Waiting for network to stabilize (timeout: ${timeout}s)..." >&2

    while [ "$elapsed" -lt "$timeout" ]; do
        if ping -c 1 -W 1 8.8.8.8 &>/dev/null; then
            echo "[✓] Network ready after ${elapsed}s" >&2
            return 0
        fi
        sleep 1
        # Plain assignment: "((elapsed++))" returns status 1 when the old
        # value is 0, which aborts callers running under "set -e".
        elapsed=$((elapsed + 1))
    done

    echo "[!] Network not ready after ${timeout}s, continuing anyway" >&2
    return 1
}
# Export functions so they stay available to child bash processes
export -f \
    apply_static_ip \
    apply_network_changes \
    configure_network_safe \
    get_desired_vlan_ip \
    check_vlan_support \
    validate_connectivity \
    check_nfs_accessibility \
    test_internal_hairpin_nat \
    detect_network_renderer \
    wait_for_network