112 lines
4.7 KiB
YAML

---
# playbooks/proxmox/pve_update.yml
# Rolling Proxmox cluster package update with conditional kernel reboot.
#
# ─────────────────────────────────────────────────────────────────────────────
# ⚠️ HUMAN-TRIGGERED ONLY — do not automate or schedule.
# serial: 1 ensures one node is updated at a time to protect cluster quorum.
# ─────────────────────────────────────────────────────────────────────────────
#
# What this does:
#   1. Pre-checks cluster quorum — fails fast if quorum is degraded
#   2. Runs apt dist-upgrade on the target node
#   3. Reboots if the newest installed kernel is not the running one (tags: reboot)
#   4. Waits for the node to return online (tags: reboot)
#   5. Re-verifies cluster quorum before proceeding to the next node
#
# Usage:
#   # All nodes (rolling — pve01 → pve02 → pve03):
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_update.yml
#
#   # Single node:
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_update.yml --limit pve01
#
#   # Dry-run (confirms serial order and reboot conditions without modifying):
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_update.yml --check
#
#   # Update packages but skip reboot even if kernel changed:
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_update.yml --skip-tags reboot
- name: Rolling Proxmox cluster update
  hosts: proxmox_cluster
  become: true
  # One host per batch: the next node is only touched after every task below
  # has succeeded on the current one.
  serial: 1

  tasks:
    # ── Pre-flight ───────────────────────────────────────────────────────────
    - name: "Pre-flight: verify cluster quorum before updating this node"
      block:
        - name: Check cluster quorum status
          ansible.builtin.command: pvecm status
          register: pvecm_pre
          changed_when: false
          # Read-only query, so it is executed for real even under --check.
          check_mode: false

        - name: Fail if cluster is not quorate before touching this node
          ansible.builtin.assert:
            that:
              # Conditions are evaluated in order; the first failure stops the assert.
              - "'Quorate:' in pvecm_pre.stdout"
              # default('', true) also replaces a None result from regex_search.
              - "'Yes' in (pvecm_pre.stdout | regex_search('Quorate:.*') | default('', true))"
            fail_msg: |
              ⛔ Cluster quorum is NOT healthy before updating {{ inventory_hostname }}.
              Fix quorum before proceeding.
              pvecm status:
              {{ pvecm_pre.stdout }}
            success_msg: "✅ Cluster quorate — safe to update {{ inventory_hostname }}"

    # ── Package update ───────────────────────────────────────────────────────
    - name: "Update packages"
      tags: [update]
      block:
        - name: Update apt cache
          ansible.builtin.apt:
            update_cache: true
            cache_valid_time: 0

        - name: Run apt dist-upgrade
          ansible.builtin.apt:
            upgrade: dist
            # The cache was refreshed by the previous task.
            update_cache: false
          register: dist_upgrade_result

    # ── Conditional reboot ───────────────────────────────────────────────────
    # The check prints the version of the newest kernel image in /boot when it
    # differs from the running kernel, and prints nothing otherwise. An empty
    # LATEST (no /boot/vmlinuz-* found) is treated as "no reboot needed" rather
    # than as a mismatch, so a missing image never triggers a reboot.
    # NOTE(review): a node deliberately pinned to an older kernel would match on
    # every run — confirm no node in this cluster pins its kernel.
    - name: Check if a newer kernel is installed but not yet booted
      ansible.builtin.shell: |
        LATEST=$(ls /boot/vmlinuz-* 2>/dev/null | sort -V | tail -1 | sed 's|/boot/vmlinuz-||')
        RUNNING=$(uname -r)
        if [ -n "$LATEST" ] && [ "$LATEST" != "$RUNNING" ]; then echo "$LATEST"; fi
      register: reboot_check
      changed_when: false
      # Read-only comparison, so it is executed for real even under --check.
      check_mode: false
      tags: [reboot]

    - name: Reboot if a newer kernel is installed
      ansible.builtin.reboot:
        # reboot_check.stdout holds the target kernel version at this point.
        msg: "Rebooting into kernel {{ reboot_check.stdout | trim }} — initiated by pve_update.yml"
        reboot_timeout: 600
      when: reboot_check.stdout | trim | length > 0
      tags: [reboot]

    # The reboot module already waits for the host to answer again; this is an
    # additional connection check before the quorum re-verification.
    - name: Wait for node to return post-reboot
      ansible.builtin.wait_for_connection:
        delay: 10
        timeout: 600
      when: reboot_check.stdout | trim | length > 0
      tags: [reboot]

    # ── Post-flight ──────────────────────────────────────────────────────────
    - name: "Post-flight: re-verify cluster quorum after node returns"
      block:
        # The node may be reachable before its cluster services have rejoined
        # the other members, so poll for up to 2 minutes (12 × 10 s) instead of
        # judging quorum from a single sample taken right after the reboot.
        - name: Check cluster quorum status post-update
          ansible.builtin.command: pvecm status
          register: pvecm_post
          changed_when: false
          check_mode: false
          retries: 12
          delay: 10
          until: >-
            (pvecm_post.rc | default(1)) == 0 and
            'Yes' in ((pvecm_post.stdout | default(''))
            | regex_search('Quorate:.*') | default('', true))
          # Exhausted retries are not fatal here: the assert below is the gate
          # and reports the failure together with the captured pvecm output.
          ignore_errors: true

        - name: Assert cluster is quorate after update
          ansible.builtin.assert:
            that:
              - "(pvecm_post.rc | default(1)) == 0"
              - "'Quorate:' in pvecm_post.stdout"
              - "'Yes' in (pvecm_post.stdout | regex_search('Quorate:.*') | default('', true))"
            fail_msg: |
              ⛔ Cluster quorum is degraded after updating {{ inventory_hostname }}.
              Investigate before proceeding to the next node.
              pvecm status:
              {{ pvecm_post.stdout }}
            success_msg: "✅ {{ inventory_hostname }} updated — cluster quorum verified. Proceeding."