homelab/ansible/archive/playbooks/proxmox/provision_swarm_vms.yml

543 lines
19 KiB
YAML

---
# playbooks/proxmox/provision_swarm_vms.yml
# Provisions Ubuntu 24.04 VMs on Proxmox hosts for Docker Swarm
#
# Prerequisites:
# - community.general collection installed (ansible-galaxy collection install community.general)
# - Ubuntu 24.04 cloud image on Proxmox storage (Section 1 downloads it automatically if it is missing)
# - API token or root SSH access to Proxmox host
#
# Usage:
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/provision_swarm_vms.yml -e target_host=pve01
#
# Variables (can be overridden via -e or group_vars):
# - pve_node_id: extracted from inventory (1-5)
# - vm_template_name: base cloud-init template
# - vm_storage: storage pool for VM disks
# - vm_bridge: network bridge for VM NICs
- name: Provision Swarm VMs on Proxmox
  # Runs on the Proxmox node(s) selected by `target_host` (default: the whole
  # `proxmox_cluster` group). Every `qm` command below executes on that node.
  # Per node it builds one cloud-init template, one manager VM and one worker
  # VM, then registers the VMs of every cluster node in the in-memory
  # inventory for the plays that follow.
  hosts: "{{ target_host | default('proxmox_cluster') }}"
  gather_facts: true
  vars:
    # VM specifications (from standards doc)
    vm_disk_size: "32G"
    vm_memory_mb: 4096
    vm_cores: 2
    vm_storage: "local-lvm"
    vm_bridge: "vmbr0"
    # Cloud image settings
    cloud_image_url: "https://cloud-images.ubuntu.com/noble/current/noble-server-cloudimg-amd64.img"
    cloud_image_name: "noble-server-cloudimg-amd64.img"
    cloud_image_path: "/var/lib/vz/template/iso/{{ cloud_image_name }}"
    vm_template_vmid: "{{ 9000 + (node_index | int) }}"
    vm_template_name: "ubuntu-24.04-cloud-template-{{ node_index }}"
    # Derive node index from hostname (pve01 -> 1, pve02 -> 2, etc.) and coerce to integer
    node_index: "{{ (pve_node_id | default(inventory_hostname | regex_replace('[^0-9]', ''))) | int }}"
    # VM IDs (unique per node: 101/102 for node 1, 201/202 for node 2, etc.)
    manager_vmid: "{{ (node_index | int) * 100 + 1 }}"
    worker_vmid: "{{ (node_index | int) * 100 + 2 }}"
    # VM names
    manager_name: "swarm-manager-{{ node_index }}"
    worker_name: "swarm-worker-{{ node_index }}"
    # Static IPs (from inventory scheme: managers .211-.215, workers .221-.225)
    manager_ip: "10.0.0.{{ 210 + (node_index | int) }}"
    worker_ip: "10.0.0.{{ 220 + (node_index | int) }}"
    network_cidr: "24"
    gateway_ip: "10.0.0.2"
    dns_primary: "10.0.0.2"
    dns_secondary: "8.8.8.8"
    # Cloud-init user
    vm_user: "chester"
    # Public key is read on the control node from $HOME/.ssh/id_ed25519.pub
    vm_ssh_key: "{{ lookup('file', lookup('env', 'HOME') + '/.ssh/id_ed25519.pub') }}"
  tasks:
    # ========================================
    # SECTION 1: Download Cloud Image
    # ========================================
    - name: Check if cloud image already exists
      ansible.builtin.stat:
        path: "{{ cloud_image_path }}"
      register: cloud_image_stat
      tags: ['template', 'download']

    - name: Download Ubuntu 24.04 cloud image
      ansible.builtin.get_url:
        url: "{{ cloud_image_url }}"
        dest: "{{ cloud_image_path }}"
        mode: '0644'
      when: not cloud_image_stat.stat.exists
      tags: ['template', 'download']

    # ========================================
    # SECTION 2: Create VM Template
    # ========================================
    # `qm status` exits non-zero when the VMID is unknown; the return code is
    # used as the existence probe, so the task itself must never fail.
    - name: Check if VM template already exists
      ansible.builtin.command:
        cmd: qm status {{ vm_template_vmid }}
      register: template_check
      changed_when: false
      failed_when: false
      tags: ['template']

    - name: Create VM template from cloud image
      when: template_check.rc != 0
      tags: ['template']
      block:
        - name: Create base VM for template
          ansible.builtin.command:
            cmd: >-
              qm create {{ vm_template_vmid }}
              --name {{ vm_template_name }}
              --memory 2048
              --cores 2
              --net0 virtio,bridge={{ vm_bridge }}
              --scsihw virtio-scsi-pci
          register: create_vm
          changed_when: true

        - name: Import cloud image as disk
          ansible.builtin.command:
            cmd: qm importdisk {{ vm_template_vmid }} {{ cloud_image_path }} {{ vm_storage }}
          register: import_disk
          changed_when: true

        - name: Attach imported disk to VM
          ansible.builtin.command:
            cmd: >-
              qm set {{ vm_template_vmid }}
              --scsi0 {{ vm_storage }}:vm-{{ vm_template_vmid }}-disk-0
              --boot c
              --bootdisk scsi0
          changed_when: true

        - name: Add cloud-init drive
          ansible.builtin.command:
            cmd: qm set {{ vm_template_vmid }} --ide2 {{ vm_storage }}:cloudinit
          changed_when: true

        - name: Configure serial console for cloud-init
          ansible.builtin.command:
            cmd: qm set {{ vm_template_vmid }} --serial0 socket --vga serial0
          changed_when: true

        - name: Convert VM to template
          ansible.builtin.command:
            cmd: qm template {{ vm_template_vmid }}
          changed_when: true

    # ========================================
    # SECTION 3: Clone and Configure Manager VM
    # ========================================
    - name: Check if manager VM already exists
      ansible.builtin.command:
        cmd: qm status {{ manager_vmid }}
      register: manager_check
      changed_when: false
      failed_when: false
      tags: ['provision', 'manager']

    - name: Provision Swarm Manager VM
      when: manager_check.rc != 0
      tags: ['provision', 'manager']
      block:
        - name: Clone template to manager VM
          ansible.builtin.command:
            cmd: >-
              qm clone {{ vm_template_vmid }} {{ manager_vmid }}
              --name {{ manager_name }}
              --full
          changed_when: true

        - name: Resize manager disk to {{ vm_disk_size }}
          ansible.builtin.command:
            cmd: qm resize {{ manager_vmid }} scsi0 {{ vm_disk_size }}
          changed_when: true

        - name: Configure manager VM resources
          ansible.builtin.command:
            cmd: >-
              qm set {{ manager_vmid }}
              --memory {{ vm_memory_mb }}
              --cores {{ vm_cores }}
              --onboot 1
              --agent enabled=1
          changed_when: true

        - name: Write SSH public key for manager
          ansible.builtin.copy:
            content: "{{ vm_ssh_key }}"
            dest: "/tmp/sshkey_{{ manager_vmid }}.pub"
            mode: '0644'

        # Both DNS servers are passed as one quoted, space-separated value so
        # that dns_secondary (previously defined but unused) is applied too.
        - name: Configure manager cloud-init
          ansible.builtin.command:
            cmd: >-
              qm set {{ manager_vmid }}
              --ciuser {{ vm_user }}
              --sshkeys /tmp/sshkey_{{ manager_vmid }}.pub
              --ipconfig0 ip={{ manager_ip }}/{{ network_cidr }},gw={{ gateway_ip }}
              --nameserver "{{ dns_primary }} {{ dns_secondary }}"
              --searchdomain local
          changed_when: true

        # The key file is only an input to the `qm set --sshkeys` call above.
        - name: Remove temporary SSH public key file for manager
          ansible.builtin.file:
            path: "/tmp/sshkey_{{ manager_vmid }}.pub"
            state: absent

        - name: Start manager VM
          ansible.builtin.command:
            cmd: qm start {{ manager_vmid }}
          changed_when: true

    - name: Display manager VM info
      ansible.builtin.debug:
        msg: "Manager VM {{ manager_name }} (ID: {{ manager_vmid }}) configured with IP {{ manager_ip }}"
      tags: ['provision', 'manager']

    # ========================================
    # SECTION 4: Clone and Configure Worker VM
    # ========================================
    - name: Check if worker VM already exists
      ansible.builtin.command:
        cmd: qm status {{ worker_vmid }}
      register: worker_check
      changed_when: false
      failed_when: false
      tags: ['provision', 'worker']

    - name: Provision Swarm Worker VM
      when: worker_check.rc != 0
      tags: ['provision', 'worker']
      block:
        - name: Clone template to worker VM
          ansible.builtin.command:
            cmd: >-
              qm clone {{ vm_template_vmid }} {{ worker_vmid }}
              --name {{ worker_name }}
              --full
          changed_when: true

        - name: Resize worker disk to {{ vm_disk_size }}
          ansible.builtin.command:
            cmd: qm resize {{ worker_vmid }} scsi0 {{ vm_disk_size }}
          changed_when: true

        - name: Configure worker VM resources
          ansible.builtin.command:
            cmd: >-
              qm set {{ worker_vmid }}
              --memory {{ vm_memory_mb }}
              --cores {{ vm_cores }}
              --onboot 1
              --agent enabled=1
          changed_when: true

        - name: Write SSH public key for worker
          ansible.builtin.copy:
            content: "{{ vm_ssh_key }}"
            dest: "/tmp/sshkey_{{ worker_vmid }}.pub"
            mode: '0644'

        # Both DNS servers are passed as one quoted, space-separated value so
        # that dns_secondary (previously defined but unused) is applied too.
        - name: Configure worker cloud-init
          ansible.builtin.command:
            cmd: >-
              qm set {{ worker_vmid }}
              --ciuser {{ vm_user }}
              --sshkeys /tmp/sshkey_{{ worker_vmid }}.pub
              --ipconfig0 ip={{ worker_ip }}/{{ network_cidr }},gw={{ gateway_ip }}
              --nameserver "{{ dns_primary }} {{ dns_secondary }}"
              --searchdomain local
          changed_when: true

        # The key file is only an input to the `qm set --sshkeys` call above.
        - name: Remove temporary SSH public key file for worker
          ansible.builtin.file:
            path: "/tmp/sshkey_{{ worker_vmid }}.pub"
            state: absent

        - name: Start worker VM
          ansible.builtin.command:
            cmd: qm start {{ worker_vmid }}
          changed_when: true

    - name: Display worker VM info
      ansible.builtin.debug:
        msg: "Worker VM {{ worker_name }} (ID: {{ worker_vmid }}) configured with IP {{ worker_ip }}"
      tags: ['provision', 'worker']

    # ========================================
    # SECTION 5: Idempotent Proxmox disk resize
    # WHY unconditional: the Provision blocks only run when a VM is absent.
    # An existing VM that predates vm_disk_size being set would be left
    # undersized. These tasks run on every invocation and are no-ops when
    # the disk is already at or above the target size.
    # WHY byte comparison: qm resize cannot shrink. Both sizes are converted
    # with human_to_bytes so that differing unit suffixes (e.g. 3584M or 1T
    # against a 32G target) compare correctly instead of by bare digits.
    # ========================================
    - name: Get current manager VM disk size
      ansible.builtin.shell:
        # pipefail makes a failing `qm config` fail the task instead of being
        # masked by the exit status of the trailing grep.
        cmd: |
          set -o pipefail
          qm config {{ manager_vmid }} | grep "^scsi0:" | grep -oP 'size=\K[^,\s]+'
        executable: /bin/bash
      register: disk_grow_manager_current
      changed_when: false
      tags: ['provision', 'disks']

    - name: Resize manager disk to {{ vm_disk_size }} if below target
      ansible.builtin.command:
        cmd: qm resize {{ manager_vmid }} scsi0 {{ vm_disk_size }}
      when: >-
        (disk_grow_manager_current.stdout | trim | human_to_bytes)
        < (vm_disk_size | human_to_bytes)
      changed_when: true
      tags: ['provision', 'disks']

    - name: Get current worker VM disk size
      ansible.builtin.shell:
        cmd: |
          set -o pipefail
          qm config {{ worker_vmid }} | grep "^scsi0:" | grep -oP 'size=\K[^,\s]+'
        executable: /bin/bash
      register: disk_grow_worker_current
      changed_when: false
      tags: ['provision', 'disks']

    - name: Resize worker disk to {{ vm_disk_size }} if below target
      ansible.builtin.command:
        cmd: qm resize {{ worker_vmid }} scsi0 {{ vm_disk_size }}
      when: >-
        (disk_grow_worker_current.stdout | trim | human_to_bytes)
        < (vm_disk_size | human_to_bytes)
      changed_when: true
      tags: ['provision', 'disks']

    # ========================================
    # SECTION 6: Wait for VMs to be ready
    # ========================================
    - name: Wait for manager VM to be reachable via SSH
      ansible.builtin.wait_for:
        host: "{{ manager_ip }}"
        port: 22
        delay: 30
        timeout: 300
        state: started
      tags: ['provision', 'wait']

    - name: Wait for worker VM to be reachable via SSH
      ansible.builtin.wait_for:
        host: "{{ worker_ip }}"
        port: 22
        delay: 30
        timeout: 300
        state: started
      tags: ['provision', 'wait']

    - name: VM provisioning complete
      ansible.builtin.debug:
        msg: |
          ✅ VMs provisioned successfully on {{ inventory_hostname }}:
          - {{ manager_name }}: {{ manager_ip }} (VMID {{ manager_vmid }})
          - {{ worker_name }}: {{ worker_ip }} (VMID {{ worker_vmid }})
          Next steps: add VMs to in-memory inventory, install Docker, initialize Docker Swarm, and verify connectivity
      tags: ['provision']

    # The loops below cover every host in `proxmox_cluster`, not only the
    # hosts targeted by this run. peer_index mirrors node_index (an explicit
    # pve_node_id wins over digits in the hostname) so the registered names
    # and addresses match what the provisioning tasks above assign.
    - name: Add manager VMs to in-memory inventory
      vars:
        peer_index: "{{ (hostvars[item]['pve_node_id'] | default(item | regex_replace('[^0-9]', ''))) | int }}"
      ansible.builtin.add_host:
        name: "swarm-manager-{{ peer_index }}"
        ansible_host: "10.0.0.{{ 210 + (peer_index | int) }}"
        ansible_user: "{{ vm_user }}"
        groups: "swarm_managers,swarm_hosts"
      loop: "{{ groups['proxmox_cluster'] }}"
      run_once: true
      tags: ['provision']

    - name: Add worker VMs to in-memory inventory
      vars:
        peer_index: "{{ (hostvars[item]['pve_node_id'] | default(item | regex_replace('[^0-9]', ''))) | int }}"
      ansible.builtin.add_host:
        name: "swarm-worker-{{ peer_index }}"
        ansible_host: "10.0.0.{{ 220 + (peer_index | int) }}"
        ansible_user: "{{ vm_user }}"
        groups: "swarm_workers,swarm_hosts"
      loop: "{{ groups['proxmox_cluster'] }}"
      run_once: true
      tags: ['provision']
# ========================================
# SECTION 6: Install Docker on VMs
# ========================================
- name: Install Docker Engine (Docker CE) from official repo
  # Targets the `swarm_hosts` group and installs Docker CE from Docker's own
  # APT repository, then prepares the cloud-init user for running stacks.
  hosts: swarm_hosts
  become: true
  gather_facts: true
  vars:
    vm_user: chester
  tasks:
    - name: Install prerequisites for Docker
      ansible.builtin.apt:
        name:
          - ca-certificates
          - curl
          - gnupg
          - lsb-release
          - python3-jsondiff
        state: present
        update_cache: true
      tags: ['docker']

    # The key is dearmored into a temporary file and only installed at the
    # final path on success. Writing straight to the `creates:` path could
    # leave a truncated keyring behind after a failed download, which would
    # then make every later run skip this task.
    - name: Add Docker GPG key (dearmored)
      ansible.builtin.shell:
        cmd: |
          set -euo pipefail
          tmp_key="$(mktemp)"
          trap 'rm -f "$tmp_key"' EXIT
          curl -fsSL https://download.docker.com/linux/ubuntu/gpg | gpg --dearmor > "$tmp_key"
          install -m 0644 "$tmp_key" /usr/share/keyrings/docker-archive-keyring.gpg
        executable: /bin/bash
        creates: /usr/share/keyrings/docker-archive-keyring.gpg
      tags: ['docker']

    # update_cache refreshes the package index when the repository is added,
    # so no separate cache-update task is needed before installing.
    - name: Add Docker APT repository
      ansible.builtin.apt_repository:
        repo: "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu {{ ansible_distribution_release }} stable"
        filename: docker
        state: present
        update_cache: true
      tags: ['docker']

    - name: Install Docker CE, CLI, containerd and compose plugin
      ansible.builtin.apt:
        name:
          - docker-ce
          - docker-ce-cli
          - containerd.io
          - docker-compose-plugin
        state: present
        update_cache: false
      tags: ['docker']

    - name: Ensure Docker service is started and enabled
      ansible.builtin.systemd:
        name: docker
        state: started
        enabled: true
      tags: ['docker']

    # append keeps the user's existing supplementary groups.
    - name: Add '{{ vm_user }}' to docker group
      ansible.builtin.user:
        name: "{{ vm_user }}"
        groups: docker
        append: true
      tags: ['docker']

    - name: Ensure /opt/stacks exists and is owned by '{{ vm_user }}'
      ansible.builtin.file:
        path: /opt/stacks
        state: directory
        owner: "{{ vm_user }}"
        group: "{{ vm_user }}"
        mode: '0755'
      tags: ['docker']
# ========================================
# SECTION 7: Initialize Docker Swarm and Join Nodes
# ========================================
- name: Initialize Docker Swarm on manager VMs
  # The first host of `swarm_managers` acts as leader: it is initialised
  # (only if it is not in a swarm yet) and hands out the manager join token
  # that the remaining managers use. Membership is checked before every
  # mutating command so the play can be re-run safely.
  hosts: swarm_managers
  become: true
  gather_facts: false
  vars:
    swarm_leader: "{{ groups['swarm_managers'][0] }}"
    swarm_leader_addr: "{{ hostvars[groups['swarm_managers'][0]]['ansible_host'] }}"
  tasks:
    - name: Check swarm state on primary manager
      ansible.builtin.command:
        cmd: docker info --format '{{"{{.Swarm.LocalNodeState}}"}}'
      delegate_to: "{{ swarm_leader }}"
      run_once: true
      register: leader_swarm_state
      changed_when: false

    # Guarded by the state check above rather than by ignoring errors, so a
    # genuine `swarm init` failure stops the play.
    - name: Initialize swarm on primary manager (run once on first manager)
      ansible.builtin.command:
        cmd: docker swarm init --advertise-addr {{ swarm_leader_addr }}
      delegate_to: "{{ swarm_leader }}"
      run_once: true
      register: swarm_init
      when: (leader_swarm_state.stdout | trim) not in ['active', 'pending']
      changed_when: true

    - name: Get manager join token from leader
      ansible.builtin.command:
        cmd: docker swarm join-token -q manager
      delegate_to: "{{ swarm_leader }}"
      run_once: true
      register: swarm_manager_token
      changed_when: false
      # The join token is a credential; keep it out of task output.
      no_log: true

    - name: Check if this manager is already part of a swarm
      ansible.builtin.command:
        cmd: docker info --format '{{"{{.Swarm.LocalNodeState}}"}}'
      register: manager_swarm_state
      changed_when: false

    - name: Join secondary managers as managers
      ansible.builtin.command:
        cmd: docker swarm join --token {{ swarm_manager_token.stdout }} {{ swarm_leader_addr }}:2377
      when:
        - inventory_hostname != swarm_leader
        - (manager_swarm_state.stdout | trim) not in ['active', 'pending']
      changed_when: true
      # The command line contains the join token.
      no_log: true
# Join workers (use tokens fetched from leader)
- name: Join worker VMs to Docker Swarm
  # Each worker fetches nothing itself: the worker token is read once from
  # the first host of `swarm_managers` and shared with every worker, which
  # then joins only if it is not already a swarm member.
  hosts: swarm_workers
  become: true
  gather_facts: false
  tasks:
    - name: Fetch worker token from leader (delegated)
      ansible.builtin.command:
        cmd: docker swarm join-token -q worker
      delegate_to: "{{ groups['swarm_managers'][0] }}"
      run_once: true
      register: swarm_worker_token
      changed_when: false
      # The join token is a credential; keep it out of task output.
      no_log: true

    # failed_when is disabled so an error from `docker info` is treated as
    # "not a member" and the join below is still attempted.
    - name: Check if node is already part of a swarm
      ansible.builtin.command:
        cmd: docker info --format '{{"{{.Swarm.LocalNodeState}}"}}'
      register: swarm_state
      failed_when: false
      changed_when: false

    - name: Join this VM to swarm as worker
      ansible.builtin.command:
        cmd: >-
          docker swarm join --token {{ swarm_worker_token.stdout }}
          {{ hostvars[groups['swarm_managers'][0]]['ansible_host'] }}:2377
      when: swarm_state.stdout not in ['active','pending']
      # Joining alters cluster membership, so report it as a change.
      changed_when: true
      # The command line contains the join token.
      no_log: true
- name: Verify Swarm Cluster from leader
  # Targets the first host of `swarm_managers`, falling back to localhost
  # when that group does not exist. Both tasks are skipped on the fallback
  # host because it is not a member of `swarm_managers`.
  hosts: "{{ groups.get('swarm_managers', ['localhost'])[0] }}"
  become: true
  gather_facts: false
  vars:
    running_on_manager: "{{ inventory_hostname in groups.get('swarm_managers', []) }}"
  tasks:
    # Purely informational: a failing `docker node ls` never fails the play.
    - name: Show docker nodes on leader
      ansible.builtin.command:
        cmd: docker node ls
      register: node_list
      when: running_on_manager | bool
      changed_when: false
      failed_when: false

    - name: Debug node list
      ansible.builtin.debug:
        var: node_list.stdout_lines
      when: running_on_manager | bool
# ========================================
# SECTION 8: Connectivity Verification (All permutations)
# ========================================
- name: Verify network connectivity between all Proxmox hosts and VMs
  # Every host in the play pings every Proxmox host and every swarm VM once.
  # Failures are reported per source/target pair. They abort the host only
  # when `connectivity_check_fatal` is set to true (default: report only).
  hosts: proxmox_cluster,swarm_hosts
  gather_facts: false
  become: true
  tasks:
    # `>-` strips the trailing newline so the value is a single template
    # expression and stays a list rather than being rendered as text.
    # A missing swarm_hosts group is treated as an empty list.
    - name: Build list of target IPs (run once)
      run_once: true
      ansible.builtin.set_fact:
        all_targets: >-
          {{ (groups['proxmox_cluster'] | map('extract', hostvars, 'ansible_host') | list)
          + (groups['swarm_hosts'] | default([]) | map('extract', hostvars, 'ansible_host') | list) }}

    # One echo request with a one-second timeout per target; the return code
    # is evaluated by the reporting tasks below, so the ping itself never fails.
    - name: Check connectivity to all targets
      ansible.builtin.command:
        cmd: ping -c 1 -W 1 {{ item }}
      register: ping_result
      failed_when: false
      changed_when: false
      loop: "{{ all_targets }}"

    - name: Report connectivity failures
      ansible.builtin.debug:
        msg: |
          From {{ inventory_hostname }} -> {{ item.item }} : rc={{ item.rc }}
      loop: "{{ ping_result.results }}"
      loop_control:
        label: "{{ item.item }}"
      when: item.rc != 0

    # Opt-in: run with `-e connectivity_check_fatal=true` to make unreachable
    # targets fail the host. With the default the task is skipped, which
    # keeps the check non-fatal.
    - name: Fail if any critical connectivity missing (optional)
      ansible.builtin.fail:
        msg: "Connectivity failures detected from {{ inventory_hostname }}"
      when:
        - connectivity_check_fatal | default(false) | bool
        - ping_result.results | selectattr('rc', 'ne', 0) | list | length > 0