homelab/scripts/lib/network.sh
nathan e16f98a183 feat(bootstrap)!: introduce unified bootstrap system with modular libraries
BREAKING CHANGE: day0bootstrap.sh deprecated in favor of bootstrap.sh

- Add scripts/bootstrap.sh (488 lines): Unified entrypoint supporting multiple hardware types (Proxmox/Docker VMs/Pi)
- Create scripts/lib/ modular library system:
  - detection.sh: OS/hardware/container detection (362 lines)
  - fingerprint.sh: System fingerprinting and inventory (494 lines)
  - network.sh: IP configuration and VLAN placement (356 lines)
  - proxmox.sh: PVE post-install automation (453 lines)
  - validation.sh: Comprehensive pre-flight checks (510 lines)
- Add validation tools: validate-node.sh, onboarding.sh, pi_init.sh
- Deprecate scripts/day0bootstrap.sh with graceful redirect wrapper
- Document architecture in scripts/README.md (495 lines) and PROXMOX-COMPARISON.md
- Update SOP-002 with new bootstrap workflow
- Add nodes/watchtower/compose.yaml (Raspberry Pi 5 stack)

Migration: Existing day0bootstrap.sh users automatically redirected to new system after 5-second warning. No manual intervention required.

Ref: Infrastructure automation modernization per active-tasks.md
2026-04-12 22:48:19 -04:00

357 lines
10 KiB
Bash

#!/bin/bash
# ==============================================================================
# NETWORK LIBRARY: Network Configuration and Validation
# ==============================================================================
# Part of unified bootstrap system for homelab infrastructure
# Handles static IP configuration via netplan, network validation, and
# VLAN capability detection for future network segmentation.
# ==============================================================================
# Source detection library if not already loaded
# `type -t` fails when the name is unknown to the shell, so detection.sh is
# only sourced when detect_primary_interface has not been defined yet.
if ! type -t detect_primary_interface &>/dev/null; then
# Resolve the directory that contains this file (via BASH_SOURCE) so the
# sibling library is located relative to this file, not the caller's cwd.
# NOTE(review): SCRIPT_DIR is assigned at global scope and would overwrite a
# variable of the same name in the sourcing script - confirm callers tolerate this.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
# shellcheck source=./detection.sh
source "${SCRIPT_DIR}/detection.sh"
fi
# --- NETWORK CONFIGURATION ---
apply_static_ip() {
  # Configure static IP via netplan (Ubuntu/Debian)
  #
  # Writes /etc/netplan/01-netcfg.yaml for the primary interface. The file is
  # only written here; it takes effect once netplan is applied.
  #
  # Args:
  #   $1 = Target IPv4 address (required; dotted quad, every octet 0-255)
  #   $2 = Gateway (default: 10.0.0.1)
  #   $3 = DNS (default: 10.0.0.2); 8.8.8.8 is always added as second resolver
  # Outputs: progress and error messages on stderr
  # Returns: 0 when the file was written with mode 600,
  #          1 on invalid input, undetected interface, or a failed write
  local target_ip="$1"
  local gateway="${2:-10.0.0.1}"
  local dns="${3:-10.0.0.2}"
  local netplan_file="/etc/netplan/01-netcfg.yaml"
  local interface octet
  local -a octets

  if [ -z "$target_ip" ]; then
    echo "ERROR: Target IP address required" >&2
    return 1
  fi

  # Validate IP format
  if ! [[ "$target_ip" =~ ^[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}\.[0-9]{1,3}$ ]]; then
    echo "ERROR: Invalid IP address format: $target_ip" >&2
    return 1
  fi

  # The regex only checks the shape; also reject octets above 255
  IFS=. read -r -a octets <<<"$target_ip"
  for octet in "${octets[@]}"; do
    # 10# forces base 10 so a leading zero (e.g. "08") is not parsed as octal
    if ((10#$octet > 255)); then
      echo "ERROR: Invalid IP address (octet out of range): $target_ip" >&2
      return 1
    fi
  done

  # '|| true' keeps the captured output but stops a non-zero status from
  # aborting callers that run under 'set -e'; the value is checked below.
  interface=$(detect_primary_interface) || true
  if [ -z "$interface" ] || [ "$interface" == "unknown" ]; then
    echo "ERROR: Could not detect primary network interface" >&2
    return 1
  fi

  echo "[⚙] Configuring static IP: $target_ip on $interface..." >&2

  # Fix permissions on existing netplan files (common issue).
  # Best effort: the globs may match nothing, so failures are ignored on purpose.
  sudo chmod 600 /lib/netplan/*.yaml 2>/dev/null || true
  sudo chmod 600 /etc/netplan/*.yaml 2>/dev/null || true

  # Create netplan directory if missing
  if ! sudo mkdir -p /etc/netplan; then
    echo "ERROR: Could not create /etc/netplan" >&2
    return 1
  fi

  # Generate netplan configuration (YAML nesting is significant)
  if ! sudo tee "$netplan_file" >/dev/null <<EOF
network:
  version: 2
  renderer: networkd
  ethernets:
    ${interface}:
      addresses:
        - ${target_ip}/24
      nameservers:
        addresses: [${dns}, 8.8.8.8]
      routes:
        - to: default
          via: ${gateway}
EOF
  then
    echo "ERROR: Could not write $netplan_file" >&2
    return 1
  fi

  # Fix permissions (netplan requires 600)
  if ! sudo chmod 600 "$netplan_file"; then
    echo "ERROR: Could not set permissions on $netplan_file" >&2
    return 1
  fi

  echo "[✓] Netplan configuration created" >&2
  return 0
}
apply_network_changes() {
  # Activate the netplan configuration that is currently on disk.
  # WARNING: changing the address may drop the SSH session running this.
  #
  # Outputs: status messages on stderr
  # Returns: 1 if `netplan generate` rejects the configuration, 0 otherwise
  #          (the apply itself is backgrounded and its result is not checked)
  local bg_pid

  printf '%s\n' "[⚙] Applying network configuration (SSH may disconnect)..." >&2

  # Run the generator first; a non-zero exit is treated as an invalid config
  sudo netplan generate 2>/dev/null || {
    printf '%s\n' "ERROR: netplan configuration validation failed" >&2
    return 1
  }

  # Background the apply so this shell is not blocked while the link changes
  sudo netplan apply &
  bg_pid=$!

  {
    printf '%s\n' "[✓] Network apply started (PID: ${bg_pid})"
    printf '%s\n' "[!] SSH connection will drop. Reconnect to new IP address."
  } >&2

  # Short pause so the background job gets a chance to start
  sleep 2
  return 0
}
configure_network_safe() {
  # Safe wrapper: configure IP + apply with reconnection instructions
  #
  # Skips all work when the current IP already equals the target. Otherwise
  # writes the netplan file (apply_static_ip) and activates it
  # (apply_network_changes), then prints old/new addresses.
  #
  # Args: $1 = Target IP, $2 = Gateway (optional), $3 = DNS (optional)
  # Outputs: status messages on stderr
  # Returns: 0 if already configured or the apply was started,
  #          1 if the target is missing or either step fails
  local target_ip="$1"
  local current_ip

  # Guard first: an empty target would otherwise compare equal to an empty
  # current IP and be reported as "already configured".
  if [ -z "$target_ip" ]; then
    echo "ERROR: Target IP address required" >&2
    return 1
  fi

  # '|| true' keeps a lookup failure from aborting callers under 'set -e'
  current_ip=$(get_current_ip) || true

  # Check if already configured
  if [ "$current_ip" == "$target_ip" ]; then
    echo "[✓] IP already configured as $target_ip, skipping" >&2
    return 0
  fi

  # Configure
  if ! apply_static_ip "$@"; then
    return 1
  fi

  # Apply; do not claim success when validation or apply startup failed
  if ! apply_network_changes; then
    echo "ERROR: Network configuration was written but could not be applied" >&2
    return 1
  fi

  echo "" >&2
  echo "=========================================" >&2
  echo "Network configuration applied" >&2
  echo "Old IP: $current_ip" >&2
  echo "New IP: $target_ip" >&2
  echo "Reconnect with: ssh user@$target_ip" >&2
  echo "=========================================" >&2
  return 0
}
# --- VLAN CONFIGURATION (PLACEHOLDER) ---
get_desired_vlan_ip() {
  # Determine desired VLAN IP based on hardware type
  # Returns IP address from environment-constraints.md topology
  # TODO: Enable when VLAN segmentation is live
  #
  # Outputs: one IPv4 address on stdout, selected by detect_hardware_type
  # Note: every branch currently prints a fixed flat-network placeholder.
  local hardware_type

  # '|| true' keeps a detection failure from aborting callers under 'set -e';
  # an unrecognised or empty type falls through to the default branch.
  hardware_type=$(detect_hardware_type) || true

  # TODO: Implement VLAN placement logic based on:
  # - Proxmox hosts → 10.0.10.x (infra VLAN)
  # - Swarm VMs → 10.0.200.x (compute VLAN)
  # - Control nodes → 10.0.0.x (main VLAN)
  # For now, return flat network assignment
  case "$hardware_type" in
    proxmox)
      # Currently: 10.0.0.200-209
      # Desired: 10.0.10.11-13 (future VLAN)
      echo "10.0.0.201" # Placeholder
      ;;
    docker-vm)
      # Currently: 10.0.0.210-229
      # Desired: 10.0.200.11+ (future VLAN)
      echo "10.0.0.211" # Placeholder
      ;;
    pi | physical-docker)
      # Control nodes stay on main VLAN
      echo "10.0.0.200"
      ;;
    ai-workstation)
      # Currently: 10.0.0.230-239
      # Desired: 10.0.200.x (future VLAN)
      echo "10.0.0.230" # Placeholder
      ;;
    *)
      echo "10.0.0.200" # Safe default
      ;;
  esac
}
check_vlan_support() {
  # Report whether 802.1Q VLAN tagging is available on this host.
  # Returns 0 when the 8021q module is loaded or can be loaded, 1 otherwise
  # (including when no primary interface could be detected).
  local nic
  nic=$(detect_primary_interface) || true

  # Without a usable interface there is nothing to tag
  [ "$nic" != "unknown" ] || return 1

  # Already-loaded module first; otherwise try loading it via modprobe
  if lsmod | grep -q "^8021q" || sudo modprobe 8021q 2>/dev/null; then
    return 0
  fi

  return 1
}
# --- NETWORK VALIDATION ---
validate_connectivity() {
  # Test basic network connectivity
  #
  # Checks, in order: default gateway reachable, internet reachable by IP
  # (8.8.8.8), DNS resolution of google.com. The first two count as errors;
  # a DNS failure is only a warning.
  #
  # Outputs: one status line per check on stderr
  # Returns 0 if healthy, 1 otherwise
  local errors=0
  local gateway
  local -a resolver

  echo "[⚙] Validating network connectivity..." >&2

  # Test default gateway
  gateway=$(ip route show default 2>/dev/null | awk '/^default/ {print $3; exit}') || gateway=""
  if [ -n "$gateway" ]; then
    if ping -c 2 -W 3 "$gateway" &>/dev/null; then
      echo " [✓] Gateway reachable: $gateway" >&2
    else
      echo " [✗] Gateway unreachable: $gateway" >&2
      # Not ((errors++)): that returns status 1 when errors is 0, which
      # aborts any caller running under 'set -e'.
      errors=$((errors + 1))
    fi
  else
    echo " [✗] No default gateway configured" >&2
    errors=$((errors + 1))
  fi

  # Test internet reachability by IP (no DNS involved)
  if ping -c 2 -W 3 8.8.8.8 &>/dev/null; then
    echo " [✓] Internet connectivity (8.8.8.8)" >&2
  else
    echo " [✗] No internet connectivity" >&2
    errors=$((errors + 1))
  fi

  # Test DNS resolution. 'host' is not installed on every system, so fall
  # back to getent (which resolves through the system's NSS configuration).
  if command -v host &>/dev/null; then
    resolver=(host google.com)
  else
    resolver=(getent hosts google.com)
  fi
  if "${resolver[@]}" &>/dev/null; then
    echo " [✓] DNS resolution working" >&2
  else
    echo " [!] DNS resolution issue (warning)" >&2
  fi

  if [ "$errors" -eq 0 ]; then
    echo "[✓] Network validation passed" >&2
    return 0
  else
    echo "[✗] Network validation failed ($errors errors)" >&2
    return 1
  fi
}
check_nfs_accessibility() {
  # Test NFS server accessibility (TerraMaster NAS)
  #
  # Args: $1 = NFS server IP (default: 10.0.0.250)
  # Outputs: status lines on stderr
  # Returns: 1 if the server does not answer ping or port 2049 is closed,
  #          0 otherwise (including when the port check had to be skipped)
  local nfs_server="${1:-10.0.0.250}"

  echo "[⚙] Checking NFS server accessibility ($nfs_server)..." >&2

  # Test basic connectivity via ping
  if ! ping -c 2 -W 3 "$nfs_server" &>/dev/null; then
    echo " [✗] NFS server unreachable: $nfs_server" >&2
    return 1
  fi
  echo " [✓] NFS server reachable" >&2

  # Test the NFS port (2049); only possible when nc is installed
  if command -v nc &>/dev/null; then
    if nc -z -w 3 "$nfs_server" 2049 2>/dev/null; then
      echo " [✓] NFS service responding (port 2049)" >&2
    else
      echo " [✗] NFS port 2049 closed" >&2
      return 1
    fi
  else
    # Say so explicitly, so a passing result is not read as "port verified"
    echo " [!] nc not installed, skipping NFS port check (2049)" >&2
  fi

  return 0
}
test_internal_hairpin_nat() {
  # Probe for hairpin NAT trouble (lessons-learned.md #3): resolve the
  # public address of the domain through 8.8.8.8 and ping it from inside.
  # Purely advisory - every path returns 0; findings are printed on stderr.
  local probe_domain="castaldifamily.com"
  local wan_ip

  printf '%s\n' "[⚙] Testing for hairpin NAT issues..." >&2

  # First answer line made up only of digits and dots (skips CNAME lines)
  wan_ip=$(dig +short "$probe_domain" @8.8.8.8 2>/dev/null | grep -E '^[0-9.]+$' | head -n1) || true

  if [ -z "$wan_ip" ]; then
    printf '%s\n' " [!] Could not resolve $probe_domain, skipping test" >&2
    return 0
  fi

  # A failed ping to our own public address is reported as a warning only
  if ping -c 2 -W 2 "$wan_ip" &>/dev/null; then
    printf '%s\n' " [✓] No hairpin NAT issue detected" >&2
  else
    printf '%s\n' " [!] Possible hairpin NAT - use internal IPs (10.0.0.x) for node-to-node" >&2
  fi
  return 0
}
# --- NETWORK RENDERER DETECTION ---
detect_network_renderer() {
  # Identify which system manages network configuration.
  # Prints exactly one of: netplan, networkd, NetworkManager, unknown.
  # Returns 0 when a renderer was identified, 1 for "unknown".
  # Precedence: netplan (directory + binary present), then an active
  # systemd-networkd, then an active NetworkManager.
  local renderer="unknown"

  if [ -d /etc/netplan ] && command -v netplan &>/dev/null; then
    renderer="netplan"
  elif systemctl is-active systemd-networkd &>/dev/null; then
    renderer="networkd"
  elif systemctl is-active NetworkManager &>/dev/null; then
    renderer="NetworkManager"
  fi

  echo "$renderer"
  # Exit status mirrors the result: non-zero only for "unknown"
  [ "$renderer" != "unknown" ]
}
# --- WAIT FOR NETWORK ---
wait_for_network() {
  # Wait for network to stabilize after configuration change
  #
  # Pings 8.8.8.8 once per attempt until it answers or the attempts run out.
  #
  # Args: $1 = timeout in seconds (default: 10); must be a non-negative integer
  # Outputs: status messages on stderr
  # Returns: 0 as soon as a ping succeeds, 1 on timeout or invalid argument
  # Note: the counter advances once per attempt; a failed attempt can take up
  #       to a 1s ping wait plus a 1s sleep, so wall-clock time may exceed $1.
  local timeout="${1:-10}"
  local elapsed=0

  # A non-numeric value would make the loop test below fail with an error
  if ! [[ "$timeout" =~ ^[0-9]+$ ]]; then
    echo "ERROR: Timeout must be a non-negative integer: $timeout" >&2
    return 1
  fi

  echo "[⚙] Waiting for network to stabilize (timeout: ${timeout}s)..." >&2

  while [ "$elapsed" -lt "$timeout" ]; do
    if ping -c 1 -W 1 8.8.8.8 &>/dev/null; then
      echo "[✓] Network ready after ${elapsed}s" >&2
      return 0
    fi
    sleep 1
    # Not ((elapsed++)): that returns status 1 when elapsed is 0, which
    # aborts any caller running under 'set -e' on the first retry.
    elapsed=$((elapsed + 1))
  done

  echo "[!] Network not ready after ${timeout}s, continuing anyway" >&2
  return 1
}
# Export the library's public functions so child bash processes inherit them
export -f \
  apply_static_ip \
  apply_network_changes \
  configure_network_safe \
  get_desired_vlan_ip \
  check_vlan_support \
  validate_connectivity \
  check_nfs_accessibility \
  test_internal_hairpin_nat \
  detect_network_renderer \
  wait_for_network