---
# playbooks/proxmox/reconcile_cluster.yml
# Re-enable cluster services and reconcile Proxmox cluster membership.
#
# What this playbook does:
#   1. Establishes root SSH key trust between all cluster nodes
#   2. Ensures pve-cluster is running on all nodes
#   3. Creates a cluster on the primary node if missing
#   4. Joins remaining nodes if they are not yet members
#   5. Re-enables Corosync and HA services
#   6. Prints final cluster membership from the primary node
#      (after every node has been processed)
#
# Usage:
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/reconcile_cluster.yml
#
# Optional overrides:
#   -e pve_cluster_name=homelab
#   -e pve_primary_node=pve01
#   -e cluster_mode=auto|primary|join
#   -e pve_existing_cluster_ip=10.0.0.201
#
# NOTE: Play 2 runs with `serial: 1`, i.e. hosts are handled one at a time in
# inventory order. If pve_primary_node is overridden to a host that is not the
# first entry of the proxmox_cluster group, earlier hosts will attempt to join
# before the cluster has been created on the primary.

# ========================================
# PLAY 1: Setup root SSH trust (parallel)
# ========================================
- name: Setup root SSH trust for cluster operations
  hosts: proxmox_cluster
  become: true
  gather_facts: false

  tasks:
    # ssh-keygen -f does not create the parent directory, so make sure it is
    # there before a key is generated.
    - name: Ensure root .ssh directory exists
      ansible.builtin.file:
        path: /root/.ssh
        state: directory
        owner: root
        group: root
        mode: "0700"

    # An existing RSA key is reused as-is; only when there is none do we
    # generate a new ed25519 key below.
    - name: Ensure root SSH key exists
      ansible.builtin.stat:
        path: /root/.ssh/id_rsa
      register: root_ssh_key

    - name: Generate root SSH key if missing
      ansible.builtin.command:
        cmd: ssh-keygen -t ed25519 -f /root/.ssh/id_ed25519 -N ""
        creates: /root/.ssh/id_ed25519
      when: not root_ssh_key.stat.exists

    # Read the public half of whichever key the two tasks above settled on.
    - name: Fetch root's public SSH key
      ansible.builtin.slurp:
        src: >-
          {{ '/root/.ssh/id_rsa.pub'
             if root_ssh_key.stat.exists
             else '/root/.ssh/id_ed25519.pub' }}
      register: root_pubkey

    # Every node authorizes the key of every node in the group (itself
    # included). Hosts that did not register a key in this run (e.g. excluded
    # by --limit) are skipped.
    # NOTE(review): on Proxmox /root/.ssh/authorized_keys is usually a symlink
    # into /etc/pve; this module replaces a symlink unless `follow: true` is
    # set - confirm which behaviour is wanted here.
    - name: Distribute root SSH keys across all cluster nodes
      ansible.posix.authorized_key:
        user: root
        key: "{{ hostvars[item].root_pubkey.content | b64decode }}"
        state: present
      loop: "{{ groups['proxmox_cluster'] }}"
      when: hostvars[item].root_pubkey is defined

# ========================================
# PLAY 2: Cluster reconciliation (serial)
# ========================================
- name: Reconcile Proxmox cluster state
  hosts: proxmox_cluster
  become: true
  gather_facts: true
  serial: 1

  vars:
    pve_cluster_name: "homelab"
    pve_primary_node: "{{ groups['proxmox_cluster'][0] }}"
    # Falls back to the inventory name when the primary has no ansible_host.
    pve_primary_ip: "{{ hostvars[pve_primary_node].ansible_host | default(pve_primary_node) }}"
    # auto: create if needed on primary and join others
    # primary: force primary-init behavior on target host(s)
    # join: force join behavior on target host(s)
    cluster_mode: "auto"
    # When non-empty, nodes join via this address instead of pve_primary_ip.
    pve_existing_cluster_ip: ""

  tasks:
    - name: Validate inventory has Proxmox nodes
      ansible.builtin.assert:
        that:
          - groups['proxmox_cluster'] | length >= 1
        fail_msg: "Inventory group 'proxmox_cluster' is empty or undefined."

    - name: Validate cluster_mode input
      ansible.builtin.assert:
        that:
          - cluster_mode in ['auto', 'primary', 'join']
        fail_msg: "cluster_mode must be one of: auto, primary, join"

    # Prefer the explicitly supplied existing-cluster address; otherwise join
    # through the primary node.
    - name: Resolve join target IP
      ansible.builtin.set_fact:
        pve_join_target_ip: >-
          {{ pve_existing_cluster_ip | default('') | trim
             if (pve_existing_cluster_ip | default('') | trim | length > 0)
             else pve_primary_ip }}

    - name: Show reconcile plan
      ansible.builtin.debug:
        msg:
          - "Primary node: {{ pve_primary_node }} ({{ pve_primary_ip }})"
          - "Cluster name: {{ pve_cluster_name }}"
          - "Cluster mode: {{ cluster_mode }}"
          - "Join target IP: {{ pve_join_target_ip }}"
          - "Target nodes: {{ groups['proxmox_cluster'] | join(', ') }}"
      run_once: true

    - name: Ensure pve-cluster service is enabled and running
      ansible.builtin.systemd:
        name: pve-cluster
        enabled: true
        state: started

    # The presence of /etc/pve/corosync.conf is used as the "already
    # clustered" marker for the rest of this play.
    - name: Check whether this node is already clustered
      ansible.builtin.stat:
        path: /etc/pve/corosync.conf
      register: corosync_conf

    # Single source of truth for "this host is going to run pvecm add", so
    # the SSH pre-check and the join itself can never disagree.
    - name: Decide whether this node must join the cluster
      ansible.builtin.set_fact:
        pve_node_should_join: >-
          {{ cluster_mode in ['auto', 'join']
             and (inventory_hostname != pve_primary_node or cluster_mode == 'join')
             and not corosync_conf.stat.exists }}

    # argv avoids re-splitting the cluster name into several arguments.
    - name: Create cluster on primary node when missing
      ansible.builtin.command:
        argv:
          - pvecm
          - create
          - "{{ pve_cluster_name }}"
      register: pvecm_create
      changed_when: pvecm_create.rc == 0
      when:
        - cluster_mode in ['auto', 'primary']
        - inventory_hostname == pve_primary_node or cluster_mode == 'primary'
        - not corosync_conf.stat.exists

    # Only wait when a cluster was actually created just now. Waiting
    # unconditionally on the primary would time out in join mode, where the
    # config does not exist until the join further down has run.
    - name: Wait for corosync config to appear on primary
      ansible.builtin.wait_for:
        path: /etc/pve/corosync.conf
        timeout: 60
      when: pvecm_create is changed

    # Best-effort probe: never fails the play, only feeds the warning below.
    # ConnectTimeout keeps the probe itself from hanging on an unreachable
    # target.
    - name: Test root SSH connectivity to primary node
      ansible.builtin.command:
        argv:
          - ssh
          - "-o"
          - BatchMode=yes
          - "-o"
          - ConnectTimeout=10
          - "root@{{ pve_join_target_ip }}"
          - hostname
      changed_when: false
      failed_when: false
      register: ssh_test
      when: pve_node_should_join | bool

    # A skipped probe still registers ssh_test, but without an rc, hence the
    # `rc is defined` guard.
    - name: Warn if root SSH test failed
      ansible.builtin.debug:
        msg: >-
          WARNING: Root SSH to {{ pve_join_target_ip }} failed.
          Cluster join may hang.
          Error: {{ ssh_test.stderr | default('') }}
      when:
        - ssh_test is defined
        - ssh_test.rc is defined
        - ssh_test.rc != 0

    - name: Join non-primary node to cluster when missing
      ansible.builtin.command:
        argv:
          - pvecm
          - add
          - "{{ pve_join_target_ip }}"
          - "--use_ssh"
          - "1"
      register: pvecm_add
      changed_when: pvecm_add.rc == 0
      when: pve_node_should_join | bool

    - name: Re-check cluster membership config after create/join
      ansible.builtin.stat:
        path: /etc/pve/corosync.conf
      register: corosync_conf_after

    # Services are only touched on nodes that ended up with a corosync config.
    - name: Ensure Corosync and HA services are enabled and running on clustered nodes
      ansible.builtin.systemd:
        name: "{{ item }}"
        enabled: true
        state: started
      loop:
        - corosync
        - pve-ha-lrm
        - pve-ha-crm
      when: corosync_conf_after.stat.exists

# ========================================
# PLAY 3: Final membership report
# ========================================
# Kept as its own play so it runs after *all* hosts have passed through the
# serial reconciliation above; inside the serial play the primary would be
# queried before the remaining nodes had a chance to join.
- name: Report final cluster membership
  hosts: proxmox_cluster
  become: true
  gather_facts: false

  vars:
    pve_primary_node: "{{ groups['proxmox_cluster'][0] }}"

  tasks:
    - name: Show cluster membership from primary
      ansible.builtin.command: pvecm nodes
      changed_when: false
      register: pvecm_nodes
      delegate_to: "{{ pve_primary_node }}"
      run_once: true

    - name: Print cluster membership output
      ansible.builtin.debug:
        var: pvecm_nodes.stdout_lines
      run_once: true