---
# playbooks/docker/swarm_update.yml
# Rolling Docker Swarm node OS update with drain-before-reboot.
#
# ─────────────────────────────────────────────────────────────────────────────
# ⚠️ HUMAN-TRIGGERED ONLY — do not automate or schedule.
# serial: 1 ensures one node is updated at a time.
# Each node is drained before update and re-activated after reboot.
# ─────────────────────────────────────────────────────────────────────────────
#
# What this does per node:
#   1. Pre-checks that Docker Swarm is healthy on the node
#   2. Drains the node (tasks migrate to remaining nodes)
#   3. Runs apt dist-upgrade
#   4. Reboots if a newer kernel was installed
#   5. Waits for the node and Docker daemon to return online
#   6. Re-activates the node in the swarm
#   7. Asserts node is Ready + Active before proceeding to the next node
#
# NOTE: drain/restore commands are delegated to a healthy manager.
# When updating swarm-manager-1, delegation falls back to swarm-manager-2.
# Assumes inventory_hostname matches the Docker Swarm node name (VM hostname).
#
# Usage:
#   # All nodes (rolling — managers first, then workers):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml
#
#   # Single node:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --limit swarm-worker-1
#
#   # Dry-run (confirms serial order and reboot conditions without modifying):
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --check
#
#   # Update packages but skip reboot even if kernel changed:
#   ansible-playbook -i inventory/hosts.ini playbooks/docker/swarm_update.yml --skip-tags reboot

# Play: update every host in `swarm_hosts`, one host per batch. For each host
# the sequence is: pre-flight check -> drain -> apt dist-upgrade -> conditional
# reboot -> re-activate -> verify Ready + Active.
- name: Rolling Swarm node update
  hosts: swarm_hosts
  become: true
  # One host per batch, so only a single node is drained at any time.
  serial: 1

  vars:
    # Delegate swarm CLI commands to a healthy manager.
    # If we are updating swarm-manager-1 itself, fall back to swarm-manager-2.
    swarm_delegate: >-
      {{ 'swarm-manager-2' if inventory_hostname == 'swarm-manager-1' else 'swarm-manager-1' }}

  tasks:
    - name: "Pre-flight: verify Swarm is healthy before touching this node"
      block:
        # Read-only query, so it is also executed in --check mode
        # (check_mode: false) to let the assert below run during a dry-run.
        # {{ '{{' }} / {{ '}}' }} emit literal braces so the Go template
        # reaches docker instead of being evaluated by Jinja.
        - name: Check Docker Swarm state on this node
          ansible.builtin.command: >-
            docker info --format '{{ '{{' }}.Swarm.LocalNodeState{{ '}}' }}'
          register: swarm_pre
          changed_when: false
          check_mode: false

        - name: Fail if node is not an active swarm member
          ansible.builtin.assert:
            that:
              - swarm_pre.stdout | trim == 'active'
            fail_msg: >-
              ⛔ {{ inventory_hostname }} reports Swarm.LocalNodeState={{ swarm_pre.stdout | trim }}.
              Expected 'active'. Resolve swarm health before proceeding.
            success_msg: "✅ {{ inventory_hostname }} is an active swarm member — safe to drain"

    # Skipped entirely in --check mode: draining is a real state change.
    - name: "Drain: migrate tasks off {{ inventory_hostname }}"
      tags: [drain]
      when: not ansible_check_mode
      block:
        # The docker CLI does not report whether availability actually
        # changed, so the task is always recorded as changed.
        - name: Set node availability to drain
          ansible.builtin.command: >-
            docker node update --availability drain {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        # `-q` prints one task ID per line, so an empty stdout means nothing
        # is scheduled to run on the node any more. The return code is part
        # of the condition on purpose: a failing `docker node ps` must not be
        # mistaken for "zero running tasks".
        - name: Wait for running tasks to evacuate
          ansible.builtin.command: >-
            docker node ps {{ inventory_hostname }} --filter desired-state=running -q
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: running_tasks
          until: "running_tasks.rc == 0 and (running_tasks.stdout_lines | length) == 0"
          retries: 18
          delay: 10
          changed_when: false

    - name: "Update packages"
      block:
        # cache_valid_time: 0 forces a refresh regardless of cache age.
        - name: Update apt cache
          ansible.builtin.apt:
            update_cache: true
            cache_valid_time: 0

        - name: Run apt dist-upgrade
          ansible.builtin.apt:
            upgrade: dist
            update_cache: false
          register: dist_upgrade_result
          tags: [update]

        # Compares the highest-versioned /boot/vmlinuz-* image with the
        # running kernel and prints "reboot_needed" when they differ.
        # Read-only, so it also runs in --check mode.
        - name: Check if a newer kernel is installed but not yet booted
          ansible.builtin.shell: |
            LATEST=$(ls /boot/vmlinuz-* | sort -V | tail -1 | sed 's|/boot/vmlinuz-||')
            RUNNING=$(uname -r)
            if [ "$LATEST" != "$RUNNING" ]; then echo "reboot_needed"; fi
          register: reboot_check
          changed_when: false
          check_mode: false
          tags: [reboot]

        # Every consumer of reboot_check carries the same `reboot` tag as the
        # check itself, so --skip-tags reboot skips producer and consumers
        # together.
        - name: Reboot if a newer kernel is installed
          ansible.builtin.reboot:
            msg: "Rebooting into updated kernel — initiated by swarm_update.yml"
            reboot_timeout: 600
          when: reboot_check.stdout | trim == 'reboot_needed'
          tags: [reboot]

        - name: Wait for node to return post-reboot
          ansible.builtin.wait_for_connection:
            delay: 10
            timeout: 600
          when: reboot_check.stdout | trim == 'reboot_needed'
          tags: [reboot]

        # Polls up to 18 times, 10 s apart, until `docker info` exits 0.
        - name: Wait for Docker daemon to be ready after reboot
          ansible.builtin.command: docker info
          register: docker_ready
          until: docker_ready.rc == 0
          retries: 18
          delay: 10
          changed_when: false
          check_mode: false
          when: reboot_check.stdout | trim == 'reboot_needed'
          tags: [reboot]

    # Skipped in --check mode, mirroring the drain block above.
    - name: "Restore: re-activate {{ inventory_hostname }} in the swarm"
      tags: [drain]
      when: not ansible_check_mode
      block:
        - name: Set node availability back to active
          ansible.builtin.command: >-
            docker node update --availability active {{ inventory_hostname }}
          delegate_to: "{{ swarm_delegate }}"
          become: false
          changed_when: true

        # Inspect the exact node rather than `docker node ls --filter name=`:
        # the name filter matches on all or part of a hostname (so
        # swarm-worker-1 would also select swarm-worker-10), and substring
        # tests on the table output cannot tell which row or column a match
        # came from. The Go template prints "<state> <availability>".
        - name: Wait for node to be Ready and Active
          ansible.builtin.command: >-
            docker node inspect {{ inventory_hostname }}
            --format '{{ '{{' }}.Status.State{{ '}}' }} {{ '{{' }}.Spec.Availability{{ '}}' }}'
          delegate_to: "{{ swarm_delegate }}"
          become: false
          register: node_state
          until: "node_state.rc == 0 and (node_state.stdout | trim | lower) == 'ready active'"
          retries: 12
          delay: 10
          changed_when: false

        # Kept inside this block so it is skipped together with the task that
        # registers node_state (e.g. in --check mode).
        - name: Confirm node status after update
          ansible.builtin.assert:
            that:
              - "(node_state.stdout | trim | lower) == 'ready active'"
            fail_msg: >-
              ⛔ {{ inventory_hostname }} is not Ready+Active after update.
              Investigate before proceeding to the next node.
              docker node inspect output:
              {{ node_state.stdout }}
            success_msg: "✅ {{ inventory_hostname }} updated — Ready + Active. Proceeding."