---
# playbooks/docker/swarm_update.yml
# Rolling Docker Swarm node OS update with drain-before-reboot.
#
# ─────────────────────────────────────────────────────────────────────────────
# ⚠️  HUMAN-TRIGGERED ONLY — do not automate or schedule.
#    serial: 1 ensures one node is updated at a time.
#    Each node is drained before update and re-activated after reboot.
# ─────────────────────────────────────────────────────────────────────────────
#
# What this does per node:
#   1. Pre-checks that Docker Swarm is healthy on the node
#   2. Drains the node (tasks migrate to remaining nodes)
#   3. Runs apt dist-upgrade
#   4. Reboots if a newer kernel was installed
#   5. Waits for the node and Docker daemon to return online
#   6. Re-activates the node in the swarm
#   7. Asserts node is Ready + Active before proceeding to the next node
#
# NOTE: drain/restore commands are delegated to a healthy manager.
#       When updating swarm-manager-1, delegation falls back to swarm-manager-2.
#       Assumes inventory_hostname matches the Docker Swarm node name (VM hostname).
#
# Usage:
#   # All nodes (rolling — managers first, then workers):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml
#
#   # Single node:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --limit swarm-worker-1
#
#   # Dry-run (confirms serial order and reboot conditions without modifying):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --check
#
#   # Update packages but skip reboot even if kernel changed:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --skip-tags reboot

- name: Rolling Swarm node update
  hosts: swarm_hosts
  become: true
  serial: 1

  vars:
    # Delegate swarm CLI commands to a healthy manager.
    # If we are updating swarm-manager-1 itself, fall back to swarm-manager-2.
    swarm_delegate: >-
      {{ 'swarm-manager-2' if inventory_hostname == 'swarm-manager-1' else 'swarm-manager-1' }}

  tasks:
    # ── 1. Pre-flight ────────────────────────────────────────────────────────
    # Runs on the node itself. check_mode: false keeps the read-only query
    # running under --check so the assert below always has data to evaluate.
    - name: "Pre-flight: verify Swarm is healthy before touching this node"
      block:
        - name: Check Docker Swarm state on this node
          # No shell features are needed, so use command rather than shell.
          # The {{ '{{' }} … {{ '}}' }} pairs emit literal braces for the
          # docker Go template instead of being evaluated by Jinja.
          ansible.builtin.command:
            cmd: >-
              docker info --format '{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}'
          register: swarm_pre
          changed_when: false
          check_mode: false

        - name: Fail if node is not an active swarm member
          ansible.builtin.assert:
            that:
              - swarm_pre.stdout | trim == 'active'
            fail_msg: >-
              ⛔ {{ inventory_hostname }} reports
              Swarm.LocalNodeState={{ swarm_pre.stdout | trim }}.
              Expected 'active'. Resolve swarm health before proceeding.
            success_msg: "✅ {{ inventory_hostname }} is an active swarm member — safe to drain"

    # ── 2. Drain ─────────────────────────────────────────────────────────────
    # Skipped entirely under --check so a dry-run never changes availability.
    # The docker CLI runs on the delegate manager with become: false —
    # NOTE(review): assumes the connecting user there can reach the Docker
    # socket without privilege escalation; confirm.
    # NOTE(review): the play has no rescue/always section, so a failure after
    # this point leaves the node drained; presumably intentional so a human
    # can investigate — confirm.
    - name: "Drain: migrate tasks off {{ inventory_hostname }}"
      tags: [drain]
      when: not ansible_check_mode
      block:
        - name: Set node availability to drain
          ansible.builtin.command:
            cmd: >-
              docker node update --availability drain {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        - name: Wait for running tasks to evacuate
          # -q prints one task ID per line, so an empty stdout_lines means no
          # task with desired-state=running is left on the node. Requiring
          # rc == 0 ensures a failing docker command is retried (and finally
          # reported) instead of being mistaken for "zero tasks".
          # Polls up to 18 times, 10 seconds apart.
          ansible.builtin.command:
            cmd: >-
              docker node ps {{ inventory_hostname }}
              --filter desired-state=running -q
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: running_tasks
          until: >-
            running_tasks.rc == 0
            and (running_tasks.stdout_lines | length == 0)
          retries: 18
          delay: 10
          changed_when: false

    # ── 3. Package update ────────────────────────────────────────────────────
    - name: "Update packages"
      tags: [update]
      block:
        - name: Update apt cache
          # cache_valid_time: 0 is set alongside update_cache: true.
          ansible.builtin.apt:
            update_cache: true
            cache_valid_time: 0

        - name: Run apt dist-upgrade
          # The cache was refreshed by the previous task.
          ansible.builtin.apt:
            upgrade: dist
            update_cache: false
          register: dist_upgrade_result

    # ── 4./5. Reboot when a newer kernel is present ──────────────────────────
    # Every task that reads reboot_check carries the same `reboot` tag, so
    # `--skip-tags reboot` skips the detection and all of its consumers
    # together.
    - name: Check if a newer kernel is installed but not yet booted
      # Compares the highest version-sorted /boot/vmlinuz-* image with the
      # running kernel and prints "reboot_needed" when they differ. When no
      # kernel image is found, LATEST is empty and nothing is printed, so the
      # node is not rebooted. Read-only, hence also executed under --check.
      ansible.builtin.shell: |
        LATEST=$(ls /boot/vmlinuz-* 2>/dev/null | sort -V | tail -1 | sed 's|/boot/vmlinuz-||')
        RUNNING=$(uname -r)
        if [ -n "$LATEST" ] && [ "$LATEST" != "$RUNNING" ]; then echo "reboot_needed"; fi
      register: reboot_check
      changed_when: false
      check_mode: false
      tags: [reboot]

    - name: Reboot if a newer kernel is installed
      ansible.builtin.reboot:
        msg: "Rebooting into updated kernel — initiated by swarm_update.yml"
        reboot_timeout: 600
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    - name: Wait for node to return post-reboot
      # Additional connectivity check after the reboot task.
      ansible.builtin.wait_for_connection:
        delay: 10
        timeout: 600
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    - name: Wait for Docker daemon to be ready after reboot
      # Polls `docker info` on the node up to 18 times, 10 seconds apart,
      # until it exits 0.
      ansible.builtin.command:
        cmd: docker info
      register: docker_ready
      until: docker_ready.rc == 0
      retries: 18
      delay: 10
      changed_when: false
      check_mode: false
      when: reboot_check.stdout | trim == 'reboot_needed'
      tags: [reboot]

    # ── 6./7. Restore ────────────────────────────────────────────────────────
    # Shares the `drain` tag and the check-mode guard with the drain block so
    # the two always run (or are skipped) as a pair.
    - name: "Restore: re-activate {{ inventory_hostname }} in the swarm"
      tags: [drain]
      when: not ansible_check_mode
      block:
        - name: Set node availability back to active
          ansible.builtin.command:
            cmd: >-
              docker node update --availability active {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        - name: Wait for node to be Ready and Active
          # Inspect exactly this node and print "<state> <availability>".
          # `docker node ls --filter name=…` matches partial hostnames, so a
          # substring test on its table could be satisfied by a different
          # node (e.g. swarm-worker-10 when checking swarm-worker-1).
          # Polls up to 12 times, 10 seconds apart.
          ansible.builtin.command:
            cmd: >-
              docker node inspect
              --format '{{ '{{' }}.Status.State{{ '}}' }} {{ '{{' }}.Spec.Availability{{ '}}' }}'
              {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: node_state
          until: >-
            node_state.rc == 0
            and (node_state.stdout | trim | lower == 'ready active')
          retries: 12
          delay: 10
          changed_when: false

        - name: Confirm node status after update
          ansible.builtin.assert:
            that:
              - node_state.stdout | trim | lower == 'ready active'
            fail_msg: >-
              ⛔ {{ inventory_hostname }} is not Ready+Active after update.
              Investigate before proceeding to the next node.
              Reported state/availability: {{ node_state.stdout | trim }}
            success_msg: "✅ {{ inventory_hostname }} updated — Ready + Active. Proceeding."