207 lines
8.4 KiB
YAML

---
# Fail fast when the replacement inputs do not line up with the inventory.
# Checks: a non-blank project name, the old logical host in proxmox_cluster,
# the new physical host in proxmox_cluster (only when phase 2 is enabled),
# and the swarm manager/worker names in their respective groups.
# The default() filters make a missing group or unset project name fail the
# assertion with fail_msg rather than abort with a templating error.
- name: Validate required replacement inputs
  ansible.builtin.assert:
    that:
      - replacement_project_name | default('') | trim | length > 0
      - replacement_old_logical_host in (groups['proxmox_cluster'] | default([]))
      # The new physical host only has to be in inventory when phase 2 runs.
      - >-
        not (replacement_phase2_rebuild_and_rejoin | bool)
        or replacement_new_physical_host in (groups['proxmox_cluster'] | default([]))
      - replacement_swarm_manager_name in (groups['swarm_managers'] | default([]))
      - replacement_swarm_worker_name in (groups['swarm_workers'] | default([]))
    fail_msg: >-
      Missing replacement inputs or inventory groups. Ensure project name is set and
      proxmox/swarm host groups contain the expected hosts.
    success_msg: "Replacement input validation passed."
# Take the controller-side timestamp exactly once. Running the `date` lookup
# separately for each fact could straddle a second boundary and leave the
# recorded timestamp out of sync with the output directory suffix.
- name: Build replacement context values
  ansible.builtin.set_fact:
    proxmox_node_replacement_timestamp: "{{ lookup('pipe', 'date +%Y%m%dT%H%M%S') }}"

# Derived in its own task: a fact assigned by a set_fact call is not visible
# to the other keys of that same call.
# The project name is sanitised to [a-zA-Z0-9_-] before use in the path.
- name: Build replacement output directory path
  ansible.builtin.set_fact:
    proxmox_node_replacement_output_dir: >-
      {{ replacement_output_root }}/{{
        replacement_project_name | regex_replace('[^a-zA-Z0-9_-]', '_')
      }}-{{ proxmox_node_replacement_timestamp }}
# Echo the key inputs of this run (identities, hardware, swarm names and the
# cutover/poweroff switches) so the operator can review them in the log.
- name: Print replacement plan summary
  ansible.builtin.debug:
    msg:
      - 'Project: {{ replacement_project_name }}'
      - 'Logical identity: {{ replacement_old_logical_host }} ({{ replacement_old_ip }})'
      - 'Replacement hardware: {{ replacement_new_physical_host }} ({{ replacement_new_physical_ip }})'
      - 'Swarm identities: {{ replacement_swarm_manager_name }}, {{ replacement_swarm_worker_name }}'
      - 'Execute cutover: {{ replacement_execute_cutover }}'
      - 'Power off old host: {{ replacement_poweroff_old_host }}'
# Probe TCP port 22 from the controller for every host taking part in the
# replacement.
#   * Skipped entirely when runtime checks are disabled.
#   * The old logical host is skipped when it is allowed to be offline.
#   * The new physical host is probed only when a baseline is captured or
#     phase 2 is enabled.
- name: Preflight network reachability from control node
  delegate_to: localhost
  when:
    - not replacement_skip_runtime_checks | bool
    - item != replacement_old_logical_host or not replacement_old_host_may_be_offline | bool
    - item != replacement_new_physical_host or replacement_capture_baseline | bool or replacement_phase2_rebuild_and_rejoin | bool
  loop:
    - '{{ replacement_old_logical_host }}'
    - '{{ replacement_new_physical_host }}'
    - '{{ replacement_swarm_manager_name }}'
    - '{{ replacement_swarm_worker_name }}'
  ansible.builtin.wait_for:
    # A new physical host without hostvars is addressed by its explicit IP;
    # any other host uses ansible_host, falling back to its inventory name.
    host: >-
      {{ replacement_new_physical_ip
         if item == replacement_new_physical_host and (hostvars[item] is not defined)
         else (hostvars[item].ansible_host | default(item)) }}
    port: 22
    state: started
    connect_timeout: 2
    timeout: 5
# Run `docker node ls` on the swarm manager and keep the result for the
# quorum assertion and the baseline report. Read-only, so never "changed".
- name: Capture swarm quorum state from manager host
  when: not replacement_skip_runtime_checks | bool
  delegate_to: '{{ replacement_swarm_manager_name }}'
  become: true
  ansible.builtin.command:
    cmd: docker node ls
  changed_when: false
  register: proxmox_node_replacement_swarm_node_ls
# Require that the node listing succeeded and that its output mentions at
# least one manager status of "Leader" or "Reachable".
- name: Assert swarm quorum output is available
  when: not replacement_skip_runtime_checks | bool
  ansible.builtin.assert:
    success_msg: 'Swarm quorum check passed.'
    fail_msg: 'Swarm control plane is not healthy enough for a node replacement cutover.'
    that:
      - proxmox_node_replacement_swarm_node_ls.rc == 0
      - proxmox_node_replacement_swarm_node_ls.stdout is search('Leader|Reachable')
# Make sure the controller-side artifact directory exists whenever a
# baseline capture or a cutover is going to write into it.
- name: Create output directory for baseline artifacts
  when: replacement_capture_baseline | bool or replacement_execute_cutover | bool
  delegate_to: localhost
  ansible.builtin.file:
    state: directory
    path: '{{ proxmox_node_replacement_output_dir }}'
    mode: "0755"
# Record the VM inventory (`qm list`) of the old logical host for the
# baseline report. Read-only, so it never reports "changed".
# NOTE(review): unlike the preflight probe, this task is not gated on
# replacement_old_host_may_be_offline - confirm the old host is expected to
# be reachable whenever a baseline is captured.
- name: Capture old logical host VM list
  when: replacement_capture_baseline | bool
  delegate_to: '{{ replacement_old_logical_host }}'
  become: true
  ansible.builtin.command:
    cmd: /usr/sbin/qm list
  changed_when: false
  register: proxmox_node_replacement_old_qm_list
# Record the VM inventory (`qm list`) of the replacement physical host for
# the baseline report. Read-only, so it never reports "changed".
- name: Capture replacement physical host VM list
  when: replacement_capture_baseline | bool
  delegate_to: '{{ replacement_new_physical_host }}'
  become: true
  ansible.builtin.command:
    cmd: /usr/sbin/qm list
  changed_when: false
  register: proxmox_node_replacement_new_qm_list
# Record `pvecm status` from the old logical host for the baseline report.
# Best effort: a non-zero exit code is tolerated (failed_when: false).
# NOTE(review): this task is not gated on replacement_old_host_may_be_offline
# - confirm how an unreachable old host should be handled here.
- name: Capture old logical host cluster state
  when: replacement_capture_baseline | bool
  delegate_to: '{{ replacement_old_logical_host }}'
  become: true
  ansible.builtin.command:
    cmd: pvecm status
  changed_when: false
  failed_when: false
  register: proxmox_node_replacement_old_cluster_status
# Record `pvecm status` from the replacement physical host for the baseline
# report. Best effort: a non-zero exit code is tolerated.
- name: Capture replacement physical host cluster state
  when: replacement_capture_baseline | bool
  delegate_to: '{{ replacement_new_physical_host }}'
  become: true
  ansible.builtin.command:
    cmd: pvecm status
  changed_when: false
  failed_when: false
  register: proxmox_node_replacement_new_cluster_status
# Assemble the captured command outputs into baseline-summary.txt inside the
# controller-side output directory. A section whose capture has no stdout
# falls back to an empty string (swarm) or "not-captured" (qm / pvecm).
- name: Write baseline artifact to controller
  when: replacement_capture_baseline | bool
  delegate_to: localhost
  ansible.builtin.copy:
    mode: "0644"
    dest: '{{ proxmox_node_replacement_output_dir }}/baseline-summary.txt'
    content: |
      Project: {{ replacement_project_name }}
      Timestamp: {{ proxmox_node_replacement_timestamp }}
      Logical identity host: {{ replacement_old_logical_host }}
      Logical identity IP: {{ replacement_old_ip }}
      Replacement physical host: {{ replacement_new_physical_host }}
      Replacement physical IP: {{ replacement_new_physical_ip }}
      === Swarm node ls (from {{ replacement_swarm_manager_name }}) ===
      {{ proxmox_node_replacement_swarm_node_ls.stdout | default('') }}
      === QM list ({{ replacement_old_logical_host }}) ===
      {{ proxmox_node_replacement_old_qm_list.stdout | default('not-captured') }}
      === QM list ({{ replacement_new_physical_host }}) ===
      {{ proxmox_node_replacement_new_qm_list.stdout | default('not-captured') }}
      === pvecm status ({{ replacement_old_logical_host }}) ===
      {{ proxmox_node_replacement_old_cluster_status.stdout | default('not-captured') }}
      === pvecm status ({{ replacement_new_physical_host }}) ===
      {{ proxmox_node_replacement_new_cluster_status.stdout | default('not-captured') }}
# When cutover is not requested, tell the operator which two variables
# unlock it.
- name: Explain cutover execution gate
  when: not replacement_execute_cutover | bool
  ansible.builtin.debug:
    msg: >-
      Cutover actions are disabled.
      Set replacement_execute_cutover=true and
      replacement_confirm_phrase=EXECUTE_NODE_REPLACEMENT
      to continue.
# Second safety gate for cutover: the operator must also supply the exact
# confirmation phrase. default('') makes an unset phrase fail this assertion
# with fail_msg rather than abort with an undefined-variable error.
- name: Enforce explicit confirmation phrase for cutover
  ansible.builtin.assert:
    that:
      - replacement_confirm_phrase | default('') == 'EXECUTE_NODE_REPLACEMENT'
    fail_msg: >-
      Cutover requested without explicit confirmation phrase.
      Set replacement_confirm_phrase=EXECUTE_NODE_REPLACEMENT.
  when: replacement_execute_cutover | bool
# Write cutover-todo.txt into the controller-side output directory: the
# state of the phase 2/3/4 switches plus the checklist of manual steps.
- name: Build cutover TODO artifact
  when: replacement_execute_cutover | bool
  delegate_to: localhost
  ansible.builtin.copy:
    mode: "0644"
    dest: '{{ proxmox_node_replacement_output_dir }}/cutover-todo.txt'
    content: |
      EXECUTION MODE ENABLED
      Phase 2 execution switch:
      - replacement_phase2_rebuild_and_rejoin={{ replacement_phase2_rebuild_and_rejoin }}
      Phase 3 execution switch:
      - replacement_phase3_identity_cutover={{ replacement_phase3_identity_cutover }}
      Phase 4 execution switch:
      - replacement_phase4_validate_cutover={{ replacement_phase4_validate_cutover }}
      Manual steps still required around identity cutover:
      1. If phase 2 enabled, rebuild and rejoin replacement swarm nodes on {{ replacement_new_physical_host }}.
      2. If phase 3 enabled, update inventory/group_vars source-of-truth with rollback snapshots.
      3. If phase 4 enabled, validate swarm quorum and optional service endpoints.
      4. Move network identity {{ replacement_old_ip }} to replacement physical host.
      5. If stable and approved, power off old host.
# Phase 2 runs only when cutover is enabled and the phase 2 switch is set.
- name: Execute phase 2 rebuild and swarm rejoin on replacement host
  when:
    - replacement_execute_cutover | bool
    - replacement_phase2_rebuild_and_rejoin | bool
  ansible.builtin.include_tasks:
    file: phase2_rebuild_and_rejoin.yml
# Phase 3 runs only when cutover is enabled and the phase 3 switch is set.
- name: Execute phase 3 identity cutover updates with rollback snapshots
  when:
    - replacement_execute_cutover | bool
    - replacement_phase3_identity_cutover | bool
  ansible.builtin.include_tasks:
    file: phase3_identity_cutover.yml
# Phase 4 runs only when cutover is enabled and the phase 4 switch is set.
- name: Execute phase 4 post-cutover validation gates
  when:
    - replacement_execute_cutover | bool
    - replacement_phase4_validate_cutover | bool
  ansible.builtin.include_tasks:
    file: phase4_validate_cutover.yml
# Final, opt-in step: power off the old logical host. Runs only when cutover
# is enabled and the poweroff switch is set.
- name: Power off old logical host after explicit approval
  ansible.builtin.command: systemctl poweroff
  # Powering off always alters the host; stated explicitly so the task does
  # not rely on the command module's implicit "always changed" result.
  changed_when: true
  become: true
  delegate_to: "{{ replacement_old_logical_host }}"
  when:
    - replacement_execute_cutover | bool
    - replacement_poweroff_old_host | bool