189 lines
7.8 KiB
YAML

---
# playbooks/proxmox/grow_vm_disks.yml
#
# Purpose:
# Idempotently ensures all Swarm VM disks are sized to vm_disk_target on the
# Proxmox layer and reboots every VM whose volume was just extended so the
# guest kernel reads the new block device geometry (Play 1), waits for the
# guests to come back over SSH (Play 2), then grows the in-guest partition
# and filesystem to match (Play 3).
#
# Architecture:
# Play 1 — proxmox_cluster: checks the actual LVM volume size via `lvs` (NOT
# `qm config`, which can be out of sync) and uses `lvextend` if below target,
# then runs `qm reboot` for only the VMs whose LVs were just extended.
# WHY lvs not qm config: qm resize updates Proxmox metadata but can silently
# fail to grow the LVM when the VM is running. lvs shows ground truth.
# Play 2 — swarm_hosts: waits for SSH to become available again (up to 2 minutes).
# Play 3 — swarm_hosts: runs the disk_grow role (growpart + resize2fs).
# WHY reboot required: virtio-scsi guests on this kernel do not
# honour /sys/class/block/sda/device/rescan or scsi_host scans while running.
# Only a cold re-read of block device geometry at boot is reliable.
#
# VMID scheme: manager = (node_index * 100) + 1, worker = (node_index * 100) + 2
# pve01 → 101/102, pve02 → 201/202, pve03 → 301/302
# LV path: /dev/pve/vm-{vmid}-disk-0 (standard local-lvm layout)
#
# Pre-requisites:
# - SSH access to proxmox_cluster and swarm_hosts
# - LVM tools available on Proxmox nodes (standard PVE install)
# - cloud-guest-utils will be installed by disk_grow role if absent
#
# Usage:
# Fix all Swarm VMs across all PVE nodes:
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/grow_vm_disks.yml
#
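# Fix a single node end-to-end (all three plays, one guest):
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/grow_vm_disks.yml \
# -e "target_vmids=101" --limit pve01 # Play 1 on pve01, Plays 2+3 on swarm-manager-1
# NOTE(review): target_vmids does not appear to be read by any play in this
# file, so Play 1 would process both VMs derived for pve01 (101 and 102) —
# confirm whether the filter is still intended.
# NOTE(review): --limit applies to every play; unless `pve01` also matches the
# guest in the inventory, Plays 2 and 3 will match no hosts. If so, use
# --limit 'pve01,swarm-manager-1' instead — verify against inventory/hosts.ini.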
#
# In-guest grow only (disk already extended, VM already rebooted):
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/grow_vm_disks.yml \
# --limit swarm-manager-1 --tags in_guest
#
# Validate only (no changes):
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/grow_vm_disks.yml \
# --check
#
# Verification after run:
# ansible swarm_hosts -i inventory/hosts.ini -m shell -a "df -h /" --become
# ============================================================
# PLAY 1: Proxmox layer — extend LVM volume for each Swarm VM
# Source of truth: lvs (actual LVM size), NOT qm config (metadata only)
# ============================================================
- name: Extend Swarm VM LVM volumes on Proxmox hosts
  hosts: proxmox_cluster
  become: false
  gather_facts: false
  tags: [proxmox_resize]
  vars:
    # Target size handed verbatim to `lvextend -L`.
    vm_disk_target: "32G"
    # Numeric part of vm_disk_target, compared against the integer GiB that
    # the lvs probes below report.
    vm_disk_target_gb: "{{ vm_disk_target | regex_replace('[^0-9]', '') | int }}"
    # Volume group holding the VM disks (LV path: /dev/<vg>/vm-<vmid>-disk-0).
    vm_lv_vg: "pve"
  tasks:
    # VMIDs are derived from the digits in the PVE hostname:
    # manager = N * 100 + 1, worker = N * 100 + 2 (pve01 → 101/102).
    - name: Derive VM IDs and LV names for this PVE node
      ansible.builtin.set_fact:
        disk_grow_manager_vmid: "{{ (inventory_hostname | regex_replace('[^0-9]', '') | int) * 100 + 1 }}"
        disk_grow_worker_vmid: "{{ (inventory_hostname | regex_replace('[^0-9]', '') | int) * 100 + 2 }}"

    # Manager VM -------------------------------------------------------
    # WHY lvs not qm config: qm resize updates Proxmox metadata but silently
    # fails to grow the LVM when the VM is already running. lvs is ground truth.
    #
    # Prints the LV size as a whole number of GiB, or the literal "absent"
    # when lvs cannot find the volume.
    # WHY pipefail: without it the pipeline's exit status is that of `cut`
    # (always 0), so the `|| echo "absent"` fallback would never fire and a
    # missing LV would yield empty output, which the size comparison below
    # treats as 0 GiB.
    # WHY check_mode: false: the probe is read-only, so it is safe to run under
    # --check and lets a dry run report real sizes.
    - name: Get actual LVM size for manager VM {{ disk_grow_manager_vmid }}
      ansible.builtin.shell: |
        set -o pipefail
        lvs --noheadings --units g -o lv_size \
          /dev/{{ vm_lv_vg }}/vm-{{ disk_grow_manager_vmid }}-disk-0 2>/dev/null \
          | tr -d ' ' | sed 's/g$//' | cut -d. -f1 \
          || echo "absent"
      args:
        executable: /bin/bash
      register: disk_grow_manager_lv_size
      changed_when: false
      check_mode: false

    # No shell features are needed here, so the command module with an argv
    # list is used instead of a shell string.
    - name: Extend manager VM LV to {{ vm_disk_target }} if below target
      ansible.builtin.command:
        argv:
          - lvextend
          - "-L"
          - "{{ vm_disk_target }}"
          - "/dev/{{ vm_lv_vg }}/vm-{{ disk_grow_manager_vmid }}-disk-0"
      when:
        - disk_grow_manager_lv_size.stdout | trim != 'absent'
        - (disk_grow_manager_lv_size.stdout | trim | int) < (vm_disk_target_gb | int)
      register: disk_grow_manager_extend_result
      changed_when: disk_grow_manager_extend_result.rc == 0

    # The middle branch covers the case where the LV is below target but the
    # extend task was skipped anyway (i.e. a --check run), so the report does
    # not claim the disk is already at target.
    - name: Report manager VM LV state
      ansible.builtin.debug:
        msg: >-
          Manager VM {{ disk_grow_manager_vmid }} LV:
          {{ disk_grow_manager_lv_size.stdout | trim }}G
          → {{ vm_disk_target }}
          ({{ 'extended — reboot required'
          if (disk_grow_manager_extend_result is not skipped)
          else ('below target — not extended (check mode)'
          if (disk_grow_manager_lv_size.stdout | trim | int) < (vm_disk_target_gb | int)
          else 'already at target or absent') }})
      when: disk_grow_manager_lv_size.stdout | trim != 'absent'

    # Worker VM --------------------------------------------------------
    # Same probe as for the manager VM: whole GiB, or "absent" if lvs fails.
    # pipefail makes the "absent" fallback reachable; check_mode: false keeps
    # the read-only probe running under --check.
    - name: Get actual LVM size for worker VM {{ disk_grow_worker_vmid }}
      ansible.builtin.shell: |
        set -o pipefail
        lvs --noheadings --units g -o lv_size \
          /dev/{{ vm_lv_vg }}/vm-{{ disk_grow_worker_vmid }}-disk-0 2>/dev/null \
          | tr -d ' ' | sed 's/g$//' | cut -d. -f1 \
          || echo "absent"
      args:
        executable: /bin/bash
      register: disk_grow_worker_lv_size
      changed_when: false
      check_mode: false

    - name: Extend worker VM LV to {{ vm_disk_target }} if below target
      ansible.builtin.command:
        argv:
          - lvextend
          - "-L"
          - "{{ vm_disk_target }}"
          - "/dev/{{ vm_lv_vg }}/vm-{{ disk_grow_worker_vmid }}-disk-0"
      when:
        - disk_grow_worker_lv_size.stdout | trim != 'absent'
        - (disk_grow_worker_lv_size.stdout | trim | int) < (vm_disk_target_gb | int)
      register: disk_grow_worker_extend_result
      changed_when: disk_grow_worker_extend_result.rc == 0

    - name: Report worker VM LV state
      ansible.builtin.debug:
        msg: >-
          Worker VM {{ disk_grow_worker_vmid }} LV:
          {{ disk_grow_worker_lv_size.stdout | trim }}G
          → {{ vm_disk_target }}
          ({{ 'extended — reboot required'
          if (disk_grow_worker_extend_result is not skipped)
          else ('below target — not extended (check mode)'
          if (disk_grow_worker_lv_size.stdout | trim | int) < (vm_disk_target_gb | int)
          else 'already at target or absent') }})
      when: disk_grow_worker_lv_size.stdout | trim != 'absent'

    # Reboot any VMs whose LV was just extended ---------------------------
    # WHY in this play: `qm reboot` is a PVE host command, so it has to run on
    # proxmox_cluster rather than in the plays that target the guests.
    # Only VMs whose extend task actually ran and reported a change are
    # rebooted; a skipped extend (at target, absent, or --check) never reboots.
    # NOTE(review): lvextend bypasses Proxmox's own disk-size metadata; consider
    # following up with `qm rescan --vmid <id>` so `qm config` matches — confirm.
    - name: Reboot manager VM {{ disk_grow_manager_vmid }} to expose new disk size to guest kernel
      ansible.builtin.command:
        argv:
          - qm
          - reboot
          - "{{ disk_grow_manager_vmid }}"
      when:
        - disk_grow_manager_extend_result is not skipped
        - disk_grow_manager_extend_result.changed
      changed_when: true

    - name: Reboot worker VM {{ disk_grow_worker_vmid }} to expose new disk size to guest kernel
      ansible.builtin.command:
        argv:
          - qm
          - reboot
          - "{{ disk_grow_worker_vmid }}"
      when:
        - disk_grow_worker_extend_result is not skipped
        - disk_grow_worker_extend_result.changed
      changed_when: true
# ============================================================
# PLAY 2: Wait for rebooted Swarm nodes to come back
# ============================================================
- name: Wait for Swarm nodes to return after reboot
  hosts: swarm_hosts
  # No facts are gathered: the guest may still be booting when this play starts.
  gather_facts: false
  become: false
  # Tagged for both phases so the wait runs with either tag selection.
  tags:
    - proxmox_resize
    - in_guest
  tasks:
    # Pause 10 seconds before the first attempt, then give up after
    # 120 seconds in total.
    - name: Wait for SSH to become available (up to 2 minutes)
      ansible.builtin.wait_for_connection:
        timeout: 120
        delay: 10
# ============================================================
# PLAY 3: In-guest layer — grow partition and filesystem
# ============================================================
- name: Grow in-guest root partition and filesystem on all Swarm nodes
  hosts: swarm_hosts
  # Facts are gathered and privilege escalation is enabled for the role below.
  gather_facts: true
  become: true
  tags:
    - in_guest
  roles:
    # All in-guest work is delegated to the disk_grow role.
    - role: disk_grow