171 lines
6.8 KiB
YAML

---
# playbooks/docker/swarm_update.yml
# Rolling Docker Swarm node OS update with drain-before-reboot.
#
# ─────────────────────────────────────────────────────────────────────────────
# ⚠️ HUMAN-TRIGGERED ONLY — do not automate or schedule.
# serial: 1 ensures one node is updated at a time.
# Each node is drained before update and re-activated after reboot.
# ─────────────────────────────────────────────────────────────────────────────
#
# What this does per node:
# 1. Pre-checks that Docker Swarm is healthy on the node
# 2. Drains the node (tasks migrate to remaining nodes)
# 3. Runs apt dist-upgrade
# 4. Reboots if a newer kernel was installed
# 5. Waits for the node and Docker daemon to return online
# 6. Re-activates the node in the swarm
# 7. Asserts node is Ready + Active before proceeding to the next node
#
# NOTE: drain/restore commands are delegated to a healthy manager.
# When updating swarm-manager-1, delegation falls back to swarm-manager-2.
# Assumes inventory_hostname matches the Docker Swarm node name (VM hostname).
#
# Usage:
#   # All nodes (rolling — managers first, then workers):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml
#
#   # Single node:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --limit swarm-worker-1
#
#   # Dry-run (confirms serial order and reboot conditions without modifying):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --check
#
#   # Update packages but skip reboot even if kernel changed:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --skip-tags reboot
# Play: update every host in `swarm_hosts` one at a time.
# Per host: pre-flight check -> drain -> apt dist-upgrade -> reboot when a
# newer kernel is present -> wait for Docker -> re-activate -> verify.
- name: Rolling Swarm node update
  hosts: swarm_hosts
  become: true
  # One host per batch, so only a single node is drained at any moment.
  serial: 1
  vars:
    # Delegate swarm CLI commands to a healthy manager.
    # If we are updating swarm-manager-1 itself, fall back to swarm-manager-2.
    swarm_delegate: >-
      {{ 'swarm-manager-2' if inventory_hostname == 'swarm-manager-1' else 'swarm-manager-1' }}

  tasks:
    # ── 1. Pre-flight ────────────────────────────────────────────────────────
    - name: "Pre-flight: verify Swarm is healthy before touching this node"
      block:
        # No shell features are needed here, so the command module is used.
        # The {{ '{{' }} … {{ '}}' }} pairs emit literal Go-template braces
        # for docker after Jinja has rendered the line.
        - name: Check Docker Swarm state on this node
          ansible.builtin.command: >-
            docker info --format '{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}'
          register: swarm_pre
          changed_when: false
          # Read-only query: run it even under --check.
          check_mode: false

        - name: Fail if node is not an active swarm member
          ansible.builtin.assert:
            that:
              - swarm_pre.stdout | trim == 'active'
            fail_msg: >-
              ⛔ {{ inventory_hostname }} reports Swarm.LocalNodeState={{ swarm_pre.stdout | trim }}.
              Expected 'active'. Resolve swarm health before proceeding.
            success_msg: "✅ {{ inventory_hostname }} is an active swarm member — safe to drain"

    # ── 2. Drain ─────────────────────────────────────────────────────────────
    # Skipped under --check so a dry run never changes node availability.
    - name: "Drain: migrate tasks off {{ inventory_hostname }}"
      tags: [drain]
      when: not ansible_check_mode
      block:
        - name: Set node availability to drain
          ansible.builtin.command: >-
            docker node update --availability drain {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        # Polls up to 18 x 10 s. The command must succeed (rc == 0) AND list
        # no task IDs; a failing `docker node ps` therefore can no longer be
        # mistaken for "zero running tasks" (the previous
        # `2>/dev/null | wc -l` pipeline printed 0 on any docker error).
        - name: Wait for running tasks to evacuate
          ansible.builtin.command: >-
            docker node ps {{ inventory_hostname }} --filter desired-state=running -q
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: running_tasks
          until: "running_tasks.rc == 0 and (running_tasks.stdout_lines | length) == 0"
          retries: 18
          delay: 10
          changed_when: false

    # ── 3. Package update ────────────────────────────────────────────────────
    - name: "Update packages"
      tags: [update]
      block:
        - name: Update apt cache
          ansible.builtin.apt:
            update_cache: true
            cache_valid_time: 0

        - name: Run apt dist-upgrade
          ansible.builtin.apt:
            upgrade: dist
            update_cache: false
          register: dist_upgrade_result

    # ── 4. Reboot (only when a newer kernel is on disk) ──────────────────────
    # Compares the highest-versioned /boot/vmlinuz-* against `uname -r` and
    # prints "reboot_needed" when they differ. All tasks that read
    # reboot_check carry the `reboot` tag, so `--skip-tags reboot` skips them
    # together.
    - name: Check if a newer kernel is installed but not yet booted
      ansible.builtin.shell: |
        LATEST=$(ls /boot/vmlinuz-* | sort -V | tail -1 | sed 's|/boot/vmlinuz-||')
        RUNNING=$(uname -r)
        if [ "$LATEST" != "$RUNNING" ]; then echo "reboot_needed"; fi
      register: reboot_check
      changed_when: false
      # Read-only probe: run it even under --check.
      check_mode: false
      tags: [reboot]

    - name: Reboot if a newer kernel is installed
      ansible.builtin.reboot:
        msg: "Rebooting into updated kernel — initiated by swarm_update.yml"
        reboot_timeout: 600
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    # ── 5. Wait for the node and Docker to come back ─────────────────────────
    - name: Wait for node to return post-reboot
      ansible.builtin.wait_for_connection:
        delay: 10
        timeout: 600
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    # Polls `docker info` up to 18 x 10 s until it exits 0.
    - name: Wait for Docker daemon to be ready after reboot
      ansible.builtin.command: docker info
      register: docker_ready
      until: docker_ready.rc == 0
      retries: 18
      delay: 10
      changed_when: false
      check_mode: false
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    # ── 6./7. Restore and verify ─────────────────────────────────────────────
    # Skipped under --check, mirroring the drain block; the final assert lives
    # inside this block because it reads the result registered here.
    - name: "Restore: re-activate {{ inventory_hostname }} in the swarm"
      tags: [drain]
      when: not ansible_check_mode
      block:
        - name: Set node availability back to active
          ansible.builtin.command: >-
            docker node update --availability active {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        # Inspects exactly this node and prints "<state> <availability>",
        # e.g. "ready active". This replaces `docker node ls --filter name=…`,
        # whose name filter also matches partial hostnames, so "Ready" and
        # "Active" could have come from a different node's row.
        # The registered name `node_ls` is kept unchanged for compatibility.
        - name: Wait for node to be Ready and Active
          ansible.builtin.command: >-
            docker node inspect {{ inventory_hostname }}
            --format '{{ '{{' }}.Status.State{{ '}}' }} {{ '{{' }}.Spec.Availability{{ '}}' }}'
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: node_ls
          until: "node_ls.rc == 0 and (node_ls.stdout | trim) == 'ready active'"
          retries: 12
          delay: 10
          changed_when: false

        - name: Confirm node status after update
          ansible.builtin.assert:
            that:
              - node_ls.stdout | trim == 'ready active'
            fail_msg: >-
              ⛔ {{ inventory_hostname }} is not Ready+Active after update.
              Investigate before proceeding to the next node.
              docker node inspect reported (state availability):
              {{ node_ls.stdout | trim }}
            success_msg: "✅ {{ inventory_hostname }} updated — Ready + Active. Proceeding."