---
# playbooks/proxmox/pve_audit.yml
# Read-only cross-node consistency audit for the Proxmox cluster.
# Safe to schedule. Makes no changes to any cluster node; the only write is
# the markdown report created on the control host.
#
# What this does:
#   Play 1 — Gathers key state from all proxmox_cluster nodes (kernel, repos,
#            swap, nag script, GRUB cmdline, HA services, cluster quorum,
#            effective sshd PermitRootLogin) and stores it per node in the
#            `pve_audit` fact.
#   Play 2 — Writes a markdown drift report to
#            outputs/pve_audit_<timestamp>.md, then asserts consistency
#            across all nodes. The report is written BEFORE the assertions so
#            it exists even when an assertion fails.
#
# Usage:
#   ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_audit.yml
#
# Output:
#   outputs/pve_audit_<timestamp>.md (repo root)

- name: "Play 1: Gather Proxmox cluster node state"
  hosts: proxmox_cluster
  become: true
  gather_facts: true

  tasks:
    - name: Check nag removal script presence
      ansible.builtin.stat:
        path: /usr/local/bin/pve-remove-nag.sh
      register: nag_script_stat

    # Every command probe below is read-only, so each one sets
    # `changed_when: false` and `check_mode: false` (run even under --check).
    # Each also sets `failed_when: false`: a probe that cannot run must show
    # up as a finding in the report, not remove the node from the play and
    # leave Play 2 without facts for it.
    - name: Read GRUB cmdline
      ansible.builtin.command: grep '^GRUB_CMDLINE_LINUX_DEFAULT=' /etc/default/grub
      register: grub_cmdline
      changed_when: false
      check_mode: false
      # grep exits 1 when the line is absent and 2 when the file is missing.
      failed_when: false

    - name: Check enterprise repo files absent
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - /etc/apt/sources.list.d/pve-enterprise.list
        - /etc/apt/sources.list.d/pve-enterprise.sources
        - /etc/apt/sources.list.d/ceph.list
        - /etc/apt/sources.list.d/ceph.sources
      register: enterprise_repo_stat

    - name: Get cluster quorum status
      ansible.builtin.command: pvecm status
      register: pvecm_status
      changed_when: false
      check_mode: false
      failed_when: false

    - name: Check HA and cluster service states
      ansible.builtin.command: "systemctl is-active {{ item }}"
      register: service_active_check
      changed_when: false
      check_mode: false
      # `systemctl is-active` exits non-zero for any state other than
      # "active"; the state itself is read from stdout below.
      failed_when: false
      loop:
        - corosync
        - pve-ha-lrm
        - pve-ha-crm

    - name: Check PermitRootLogin effective setting
      ansible.builtin.command: sshd -T
      register: sshd_config_dump
      changed_when: false
      check_mode: false
      failed_when: false

    # One dictionary per node; Play 2 reads it through hostvars[<node>].
    # Boolean-looking values may arrive in Play 2 as the strings
    # "True"/"False", which is why every consumer pipes them through `| bool`.
    - name: Stash per-node audit facts for cross-node comparison
      ansible.builtin.set_fact:
        pve_audit:
          kernel: "{{ ansible_kernel }}"
          distro_version: "{{ ansible_distribution_version }}"
          swap_mb: "{{ ansible_swaptotal_mb }}"
          nag_script_present: "{{ nag_script_stat.stat.exists }}"
          # default(..., true) also replaces an EMPTY stdout (grep found
          # nothing), not only an undefined one.
          grub_cmdline: >-
            {{ grub_cmdline.stdout
               | default('(GRUB_CMDLINE_LINUX_DEFAULT not found)', true) }}
          # True only when none of the stat'ed repo files exist.
          enterprise_repos_absent: >-
            {{ enterprise_repo_stat.results
               | selectattr('stat.exists', 'equalto', true)
               | list | length == 0 }}
          pvecm_output: >-
            {{ pvecm_status.stdout
               | default('(pvecm not available)', true) }}
          # True when the "Quorate:" line of `pvecm status` contains "Yes".
          # The leading `'Quorate:' in ...` test short-circuits so that
          # regex_search is never asked for a line that is not there.
          quorate: >-
            {{ 'Quorate:' in (pvecm_status.stdout | default(''))
               and 'Yes' in ((pvecm_status.stdout | default(''))
                             | regex_search('Quorate:.*') | default('')) }}
          corosync_active: >-
            {{ (service_active_check.results
                | selectattr('item', 'equalto', 'corosync')
                | first).stdout == 'active' }}
          ha_lrm_active: >-
            {{ (service_active_check.results
                | selectattr('item', 'equalto', 'pve-ha-lrm')
                | first).stdout == 'active' }}
          ha_crm_active: >-
            {{ (service_active_check.results
                | selectattr('item', 'equalto', 'pve-ha-crm')
                | first).stdout == 'active' }}
          permit_root_login: >-
            {{ 'permitrootlogin yes'
               in (sshd_config_dump.stdout | default('') | lower) }}

- name: "Play 2: Cross-node consistency assertions and drift report"
  hosts: localhost
  gather_facts: false

  vars:
    pve_nodes: "{{ groups['proxmox_cluster'] }}"
    audit_timestamp: "{{ lookup('pipe', 'date +%Y%m%dT%H%M%S') }}"
    # NOTE(review): three `..` from playbook_dir lands one level above the
    # directory that contains playbooks/ — presumably that is the repo root
    # mentioned in the file header; confirm against the repository layout.
    report_path: "{{ playbook_dir }}/../../../outputs/pve_audit_{{ audit_timestamp }}.md"

  tasks:
    # Play vars are templated lazily, so the `date` lookup above would be
    # re-run on every reference and the report filename could differ from
    # the "Generated:" line inside it. set_fact stores the evaluated value
    # once; it outranks play vars, so report_path uses the frozen value too.
    - name: Freeze audit timestamp for the remainder of the play
      ansible.builtin.set_fact:
        audit_timestamp: "{{ audit_timestamp }}"

    # A node that was unreachable or failed in Play 1 has no `pve_audit`
    # fact. Fail here with a clear message instead of an undefined-variable
    # error from the report template further down.
    - name: Assert every node produced audit facts in Play 1
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit'] is defined
        fail_msg: >-
          ❌ No audit facts for {{ item }} — node was unreachable or failed
          during Play 1, so the drift report cannot be built
        success_msg: "✅ {{ item }}: audit facts collected"
      loop: "{{ pve_nodes }}"

    - name: Ensure outputs directory exists
      ansible.builtin.file:
        path: "{{ playbook_dir }}/../../../outputs"
        state: directory
        mode: '0755'

    # The `{% ... %}` control lines sit at column 0 of the block scalar so
    # that they leave no stray indentation in the rendered markdown.
    - name: Write drift report
      ansible.builtin.copy:
        dest: "{{ report_path }}"
        mode: '0644'
        content: |
          # Proxmox Cluster Audit Report

          Generated: {{ audit_timestamp }}

          Nodes audited: {{ pve_nodes | join(', ') }}

          ## Node Summary

          | Node | Kernel | Distro | Swap | Nag Script | Enterprise Repos | Quorate | Corosync | HA-LRM | HA-CRM |
          |------|--------|--------|------|------------|------------------|---------|----------|--------|--------|
          {% for node in pve_nodes %}
          {% set a = hostvars[node]['pve_audit'] %}
          | {{ node }} | `{{ a.kernel }}` | {{ a.distro_version }} | {{ a.swap_mb }}MB | {{ '✅' if a.nag_script_present | bool else '❌' }} | {{ '✅ absent' if a.enterprise_repos_absent | bool else '❌ present' }} | {{ '✅' if a.quorate | bool else '❌' }} | {{ '✅' if a.corosync_active | bool else '❌' }} | {{ '✅' if a.ha_lrm_active | bool else '❌' }} | {{ '✅' if a.ha_crm_active | bool else '❌' }} |
          {% endfor %}

          ## GRUB Cmdline

          {% for node in pve_nodes %}
          - **{{ node }}**: `{{ hostvars[node]['pve_audit']['grub_cmdline'] }}`
          {% endfor %}

          ## Cluster Quorum Status

          {% for node in pve_nodes %}
          ### {{ node }}

          ```
          {{ hostvars[node]['pve_audit']['pvecm_output'] }}
          ```

          {% endfor %}

    # Drift checks compare every node against the first node in the group.
    - name: Assert kernel consistency across all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['kernel'] == hostvars[pve_nodes[0]]['pve_audit']['kernel']
        fail_msg: >-
          ❌ Kernel drift: {{ item }} has
          {{ hostvars[item]['pve_audit']['kernel'] }} but
          {{ pve_nodes[0] }} has
          {{ hostvars[pve_nodes[0]]['pve_audit']['kernel'] }}
        success_msg: "✅ {{ item }}: kernel {{ hostvars[item]['pve_audit']['kernel'] }}"
      loop: "{{ pve_nodes }}"

    - name: Assert distro version consistency across all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['distro_version'] == hostvars[pve_nodes[0]]['pve_audit']['distro_version']
        fail_msg: >-
          ❌ Distro version drift: {{ item }} has
          {{ hostvars[item]['pve_audit']['distro_version'] }} but
          {{ pve_nodes[0] }} has
          {{ hostvars[pve_nodes[0]]['pve_audit']['distro_version'] }}
        success_msg: "✅ {{ item }}: distro {{ hostvars[item]['pve_audit']['distro_version'] }}"
      loop: "{{ pve_nodes }}"

    - name: Assert swap is disabled on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['swap_mb'] | int == 0
        fail_msg: "❌ Swap is enabled on {{ item }}: {{ hostvars[item]['pve_audit']['swap_mb'] }}MB — run pve_baseline.yml --tags storage"
        success_msg: "✅ {{ item }}: swap disabled"
      loop: "{{ pve_nodes }}"

    - name: Assert nag removal script present on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['nag_script_present'] | bool
        fail_msg: "❌ Nag removal script missing on {{ item }} — run pve_baseline.yml --tags nag"
        success_msg: "✅ {{ item }}: nag script present"
      loop: "{{ pve_nodes }}"

    - name: Assert enterprise repos absent on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['enterprise_repos_absent'] | bool
        fail_msg: "❌ Enterprise repo still present on {{ item }} — run pve_baseline.yml --tags repos"
        success_msg: "✅ {{ item }}: enterprise repos absent"
      loop: "{{ pve_nodes }}"

    - name: Assert cluster is quorate
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['quorate'] | bool
        fail_msg: "❌ {{ item }} reports cluster NOT quorate — investigate immediately"
        success_msg: "✅ {{ item }}: cluster quorate"
      loop: "{{ pve_nodes }}"

    - name: Assert HA and Corosync services running on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['corosync_active'] | bool
          - hostvars[item]['pve_audit']['ha_lrm_active'] | bool
          - hostvars[item]['pve_audit']['ha_crm_active'] | bool
        fail_msg: >-
          ❌ HA/Corosync degraded on {{ item }}:
          corosync={{ hostvars[item]['pve_audit']['corosync_active'] }}
          pve-ha-lrm={{ hostvars[item]['pve_audit']['ha_lrm_active'] }}
          pve-ha-crm={{ hostvars[item]['pve_audit']['ha_crm_active'] }}
        success_msg: "✅ {{ item }}: corosync + HA services active"
      loop: "{{ pve_nodes }}"

    # This audit expects the effective sshd setting to be
    # `permitrootlogin yes`; any other value is reported as drift.
    - name: Assert PermitRootLogin is enabled on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['permit_root_login'] | bool
        fail_msg: "❌ PermitRootLogin is not 'yes' on {{ item }} — run pve_baseline.yml --tags ssh to fix"
        success_msg: "✅ {{ item }}: PermitRootLogin yes"
      loop: "{{ pve_nodes }}"