homelab/ansible/ansible-old/playbooks/proxmox/reconcile_cluster.yml

188 lines
6.5 KiB
YAML

---
# playbooks/proxmox/reconcile_cluster.yml
# Re-enable cluster services and reconcile Proxmox cluster membership.
#
# What this playbook does:
#   1. Distributes root SSH public keys across all cluster nodes
#   2. Ensures pve-cluster is running on all nodes
#   3. Creates a cluster on the primary node if missing
#   4. Joins remaining nodes if they are not yet members
#   5. Re-enables Corosync and HA services
#   6. Prints cluster membership as reported by the primary node
#
# Usage:
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/reconcile_cluster.yml
#
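# Optional overrides:
#   -e pve_cluster_name=homelab            (default: homelab)
#   -e pve_primary_node=pve01              (default: first host in proxmox_cluster)
#   -e cluster_mode=auto|primary|join      (default: auto)
#   -e pve_existing_cluster_ip=10.0.0.201  (join target; default: primary node address)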
# ========================================
# PLAY 1: Setup root SSH trust (parallel)
# ========================================
# Gives every node's root user key-based SSH access to every other node in
# the proxmox_cluster group. Runs on all hosts in parallel; no facts needed.
- name: Setup root SSH trust for cluster operations
  hosts: proxmox_cluster
  become: true
  gather_facts: false
  tasks:
    # Only inspects the key; nothing is created here. The result decides
    # whether a new ed25519 key is generated and which public key is read.
    - name: Check for an existing root RSA key
      ansible.builtin.stat:
        path: /root/.ssh/id_rsa
      register: root_ssh_key

    # `creates` keeps this idempotent: an existing ed25519 key is never
    # overwritten, even when no RSA key is present.
    - name: Generate root SSH key if missing
      ansible.builtin.command:
        cmd: ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N ""
        creates: /root/.ssh/id_ed25519
      when: not root_ssh_key.stat.exists

    # Reads the public half matching the decision above. slurp returns the
    # file base64-encoded in `.content`.
    # NOTE(review): the choice is based on the private key only; if id_rsa
    # exists without id_rsa.pub this task fails - confirm that is acceptable.
    - name: Fetch root's public SSH key
      ansible.builtin.slurp:
        src: "{{ '/root/.ssh/id_rsa.pub' if root_ssh_key.stat.exists else '/root/.ssh/id_ed25519.pub' }}"
      register: root_pubkey

    # Each host authorizes the public key of every group member (itself
    # included). Hosts that did not register a key in this run (for example
    # excluded by --limit) are skipped.
    # NOTE(review): on Proxmox, /root/.ssh/authorized_keys is usually a
    # symlink into /etc/pve/priv; this module replaces a symlink unless
    # `follow: true` is set - confirm the desired behaviour before changing.
    - name: Distribute root SSH keys across all cluster nodes
      ansible.posix.authorized_key:
        user: root
        key: "{{ hostvars[item].root_pubkey.content | b64decode }}"
        state: present
      loop: "{{ groups['proxmox_cluster'] }}"
      when: hostvars[item].root_pubkey is defined
# ========================================
# PLAY 2: Cluster reconciliation (serial)
# ========================================
# Creates the cluster on the primary node when needed, joins the remaining
# nodes, then enables Corosync and the HA services. Hosts are processed one
# at a time (serial: 1), in inventory order, so the primary is handled before
# any node tries to join it.
- name: Reconcile Proxmox cluster state
  hosts: proxmox_cluster
  become: true
  gather_facts: true
  serial: 1
  vars:
    pve_cluster_name: "homelab"
    pve_primary_node: "{{ groups['proxmox_cluster'][0] }}"
    pve_primary_ip: "{{ hostvars[pve_primary_node].ansible_host | default(pve_primary_node) }}"
    # auto: create if needed on primary and join others
    # primary: force primary-init behavior on target host(s)
    # join: force join behavior on target host(s)
    cluster_mode: "auto"
    pve_existing_cluster_ip: ""
  tasks:
    - name: Validate inventory has Proxmox nodes
      ansible.builtin.assert:
        that:
          - groups['proxmox_cluster'] | length >= 1
        fail_msg: "Inventory group 'proxmox_cluster' is empty or undefined."

    - name: Validate cluster_mode input
      ansible.builtin.assert:
        that:
          - cluster_mode in ['auto', 'primary', 'join']
        fail_msg: "cluster_mode must be one of: auto, primary, join"

    # An explicit pve_existing_cluster_ip wins; a blank or whitespace-only
    # value falls back to the primary node's address.
    - name: Resolve join target IP
      ansible.builtin.set_fact:
        pve_join_target_ip: "{{ (pve_existing_cluster_ip | default('') | trim) or pve_primary_ip }}"

    # run_once is evaluated per serial batch, so with serial: 1 it would
    # print the plan for every host. Limit it to the first host instead.
    - name: Show reconcile plan
      ansible.builtin.debug:
        msg:
          - "Primary node: {{ pve_primary_node }} ({{ pve_primary_ip }})"
          - "Cluster name: {{ pve_cluster_name }}"
          - "Cluster mode: {{ cluster_mode }}"
          - "Join target IP: {{ pve_join_target_ip }}"
          - "Target nodes: {{ groups['proxmox_cluster'] | join(', ') }}"
      when: inventory_hostname == (ansible_play_hosts_all | first)

    - name: Ensure pve-cluster service is enabled and running
      ansible.builtin.systemd:
        name: pve-cluster
        enabled: true
        state: started

    # Presence of /etc/pve/corosync.conf is used as the "already a cluster
    # member" marker for every create/join decision below.
    - name: Check whether this node is already clustered
      ansible.builtin.stat:
        path: /etc/pve/corosync.conf
      register: corosync_conf

    # argv avoids re-splitting the templated cluster name on whitespace.
    - name: Create cluster on primary node when missing
      ansible.builtin.command:
        argv:
          - pvecm
          - create
          - "{{ pve_cluster_name }}"
      register: pvecm_create
      changed_when: pvecm_create.rc == 0
      when:
        - cluster_mode in ['auto', 'primary']
        - inventory_hostname == pve_primary_node or cluster_mode == 'primary'
        - not corosync_conf.stat.exists

    # Only wait when a cluster was actually created on this host. Waiting
    # unconditionally on the primary would time out in cluster_mode=join,
    # where the file appears only after the join further down.
    - name: Wait for corosync config to appear after cluster creation
      ansible.builtin.wait_for:
        path: /etc/pve/corosync.conf
        timeout: 60
      when: pvecm_create is changed

    # The SSH pre-flight test, its warning and the join share one condition
    # so the test runs exactly when a join is about to be attempted.
    - name: Join this node to the cluster when it is not yet a member
      when:
        - cluster_mode in ['auto', 'join']
        - inventory_hostname != pve_primary_node or cluster_mode == 'join'
        - not corosync_conf.stat.exists
      block:
        # BatchMode makes ssh fail instead of prompting; ConnectTimeout
        # keeps an unreachable target from stalling the play. The result
        # is advisory only (failed_when: false).
        - name: Test root SSH connectivity to join target
          ansible.builtin.command:
            argv:
              - ssh
              - "-o"
              - BatchMode=yes
              - "-o"
              - ConnectTimeout=10
              - "root@{{ pve_join_target_ip }}"
              - hostname
          changed_when: false
          failed_when: false
          register: ssh_test

        # Defaults cover results without rc/stderr (e.g. check mode).
        - name: Warn if root SSH test failed
          ansible.builtin.debug:
            msg: "WARNING: Root SSH to {{ pve_join_target_ip }} failed. Cluster join may hang. Error: {{ ssh_test.stderr | default('') }}"
          when: ssh_test.rc | default(0) != 0

        - name: Join non-primary node to cluster when missing
          ansible.builtin.command:
            argv:
              - pvecm
              - add
              - "{{ pve_join_target_ip }}"
              - "--use_ssh"
              - "1"
          register: pvecm_add
          changed_when: pvecm_add.rc == 0

    # Re-stat so the service tasks see the result of a create/join that
    # happened during this run.
    - name: Re-check cluster membership config after create/join
      ansible.builtin.stat:
        path: /etc/pve/corosync.conf
      register: corosync_conf_after

    # Order is preserved by the loop: corosync first, then the HA services.
    - name: Ensure Corosync and HA services are enabled and running on clustered nodes
      ansible.builtin.systemd:
        name: "{{ item }}"
        enabled: true
        state: started
      loop:
        - corosync
        - pve-ha-lrm
        - pve-ha-crm
      when: corosync_conf_after.stat.exists

    # With serial: 1 the primary is processed first, so querying it during
    # its own batch would list membership before any other node has joined.
    # Run the query once, while the last host is processed, on the primary.
    - name: Show cluster membership from primary
      ansible.builtin.command: pvecm nodes
      delegate_to: "{{ pve_primary_node }}"
      changed_when: false
      register: pvecm_nodes
      when: inventory_hostname == (ansible_play_hosts_all | last)

    - name: Print cluster membership output
      ansible.builtin.debug:
        var: pvecm_nodes.stdout_lines
      when: inventory_hostname == (ansible_play_hosts_all | last)