218 lines
9.1 KiB
YAML

---
# playbooks/proxmox/pve_audit.yml
# Read-only cross-node consistency audit for the Proxmox cluster.
# Safe to schedule. Makes no changes to any host.
#
# What this does:
# Play 1 — Gathers key state from all proxmox_cluster nodes (kernel, distro
# version, swap, nag script, GRUB cmdline, enterprise/ceph repo files,
# HA services, cluster quorum, effective sshd PermitRootLogin)
# Play 2 — Writes a markdown drift report to
# outputs/pve_audit_<timestamp>.md, then asserts consistency across
# every node in the proxmox_cluster group
#
# Usage:
# ansible-playbook -i inventory/hosts.ini playbooks/proxmox/pve_audit.yml
#
# Output:
# outputs/pve_audit_<timestamp>.md (repo root)
- name: "Play 1: Gather Proxmox cluster node state"
  # Collects read-only state from every host in the proxmox_cluster group and
  # stores it in the per-host `pve_audit` fact, which Play 2 reads through
  # hostvars. Every command task sets changed_when: false, and check_mode:
  # false so that the probes still run under --check.
  hosts: proxmox_cluster
  become: true
  gather_facts: true
  tasks:
    - name: Check nag removal script presence
      ansible.builtin.stat:
        path: /usr/local/bin/pve-remove-nag.sh
      register: nag_script_stat

    # grep exits 1 when no line matches. A missing GRUB_CMDLINE_LINUX_DEFAULT
    # is something to show in the report, not a reason to drop the host from
    # the audit, so only rc >= 2 (grep's own error status) fails the task.
    - name: Read GRUB cmdline
      ansible.builtin.command:
        argv:
          - grep
          - '^GRUB_CMDLINE_LINUX_DEFAULT='
          - /etc/default/grub
      register: grub_cmdline
      changed_when: false
      check_mode: false
      failed_when: grub_cmdline.rc not in [0, 1]

    # One stat result per path; the results are reduced to a single boolean
    # in the set_fact task below.
    - name: Check enterprise repo files absent
      ansible.builtin.stat:
        path: "{{ item }}"
      loop:
        - /etc/apt/sources.list.d/pve-enterprise.list
        - /etc/apt/sources.list.d/pve-enterprise.sources
        - /etc/apt/sources.list.d/ceph.list
        - /etc/apt/sources.list.d/ceph.sources
      register: enterprise_repo_stat

    # failed_when: false — a non-zero exit is recorded and evaluated later
    # instead of aborting the host.
    - name: Get cluster quorum status
      ansible.builtin.command:
        argv:
          - pvecm
          - status
      register: pvecm_status
      changed_when: false
      check_mode: false
      failed_when: false

    # The set_fact task below compares each result's stdout with 'active';
    # the exit code is deliberately ignored here.
    - name: Check HA and cluster service states
      ansible.builtin.command:
        argv:
          - systemctl
          - is-active
          - "{{ item }}"
      register: service_active_check
      changed_when: false
      check_mode: false
      failed_when: false
      loop:
        - corosync
        - pve-ha-lrm
        - pve-ha-crm

    # `sshd -T` dumps the effective server configuration; the set_fact task
    # below searches the lower-cased output for 'permitrootlogin yes'.
    - name: Check PermitRootLogin effective setting
      ansible.builtin.command:
        argv:
          - sshd
          - -T
      register: sshd_config_dump
      changed_when: false
      check_mode: false
      failed_when: false

    # Facts are read from ansible_facts[...] rather than the injected
    # ansible_* top-level variables, so this works whether or not fact
    # injection is enabled.
    - name: Stash per-node audit facts for cross-node comparison
      ansible.builtin.set_fact:
        pve_audit:
          kernel: "{{ ansible_facts['kernel'] }}"
          distro_version: "{{ ansible_facts['distribution_version'] }}"
          swap_mb: "{{ ansible_facts['swaptotal_mb'] }}"
          nag_script_present: "{{ nag_script_stat.stat.exists }}"
          # Empty stdout (grep rc 1) becomes an explicit placeholder.
          grub_cmdline: >-
            {{ grub_cmdline.stdout
               | default('(GRUB_CMDLINE_LINUX_DEFAULT not set)', true) }}
          # True only when none of the probed repo files exists.
          enterprise_repos_absent: >-
            {{ enterprise_repo_stat.results
               | selectattr('stat.exists', 'equalto', true)
               | list | length == 0 }}
          pvecm_output: "{{ pvecm_status.stdout | default('(pvecm not available)') }}"
          # True when the 'Quorate:' line of the pvecm output contains 'Yes'.
          # The first clause guards the regex_search result against no match.
          quorate: >-
            {{ 'Quorate:' in (pvecm_status.stdout | default('')) and
               'Yes' in ((pvecm_status.stdout | default(''))
                         | regex_search('Quorate:.*') | default('')) }}
          corosync_active: >-
            {{ ((service_active_check.results
                 | selectattr('item', 'equalto', 'corosync')
                 | first).stdout | default('')) == 'active' }}
          ha_lrm_active: >-
            {{ ((service_active_check.results
                 | selectattr('item', 'equalto', 'pve-ha-lrm')
                 | first).stdout | default('')) == 'active' }}
          ha_crm_active: >-
            {{ ((service_active_check.results
                 | selectattr('item', 'equalto', 'pve-ha-crm')
                 | first).stdout | default('')) == 'active' }}
          permit_root_login: >-
            {{ 'permitrootlogin yes' in (sshd_config_dump.stdout | default('') | lower) }}
- name: "Play 2: Cross-node consistency assertions and drift report"
  # Runs on the controller only. Reads the `pve_audit` fact of every node in
  # the proxmox_cluster group through hostvars, writes a markdown report, and
  # then asserts each check per node. The report is written before the
  # assertions so that it exists even when an assertion fails the play.
  hosts: localhost
  gather_facts: false
  vars:
    pve_nodes: "{{ groups['proxmox_cluster'] | default([]) }}"
    # NOTE(review): this resolves three directory levels above playbook_dir.
    # If the playbook lives at <repo>/playbooks/proxmox/, that is the parent
    # of the repo root, not the repo root — confirm the intended layout.
    report_dir: "{{ playbook_dir }}/../../../outputs"
    # audit_timestamp is set once by a set_fact task below. Play vars are
    # re-templated on every reference, so defining the `pipe` lookup here
    # could give the file name and the report header different values.
    report_path: "{{ report_dir }}/pve_audit_{{ audit_timestamp }}.md"
  tasks:
    # --- Pre-flight: fail with a readable message instead of an undefined-
    # --- variable error from inside the report template.
    - name: Verify the proxmox_cluster group is not empty
      ansible.builtin.assert:
        that:
          - pve_nodes | length > 0
        fail_msg: "❌ Inventory group 'proxmox_cluster' is missing or empty — nothing to audit"
        quiet: true

    - name: Verify every cluster node reported audit facts
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit'] is defined
        fail_msg: >-
          ❌ No audit facts for {{ item }} — the node was unreachable or failed
          during Play 1
        quiet: true
      loop: "{{ pve_nodes }}"

    - name: Capture a single timestamp for the report name and header
      ansible.builtin.set_fact:
        audit_timestamp: "{{ lookup('pipe', 'date +%Y%m%dT%H%M%S') }}"

    # --- Report
    - name: Ensure outputs directory exists
      ansible.builtin.file:
        path: "{{ report_dir }}"
        state: directory
        mode: '0755'

    # The {% ... %} lines rely on the tag's trailing newline being trimmed,
    # which keeps the table rows contiguous. Blank lines separate the
    # markdown sections so that each one renders as its own block.
    - name: Write drift report
      ansible.builtin.copy:
        dest: "{{ report_path }}"
        mode: '0644'
        content: |
          # Proxmox Cluster Audit Report

          Generated: {{ audit_timestamp }}

          Nodes audited: {{ pve_nodes | join(', ') }}

          ## Node Summary

          | Node | Kernel | Distro | Swap | Nag Script | Enterprise Repos | Quorate | Corosync | HA-LRM | HA-CRM | PermitRootLogin |
          |------|--------|--------|------|------------|------------------|---------|----------|--------|--------|-----------------|
          {% for node in pve_nodes %}
          {% set audit = hostvars[node]['pve_audit'] %}
          | {{ node }} | `{{ audit['kernel'] }}` | {{ audit['distro_version'] }} | {{ audit['swap_mb'] }}MB | {{ '✅' if audit['nag_script_present'] | bool else '❌' }} | {{ '✅ absent' if audit['enterprise_repos_absent'] | bool else '❌ present' }} | {{ '✅' if audit['quorate'] | bool else '❌' }} | {{ '✅' if audit['corosync_active'] | bool else '❌' }} | {{ '✅' if audit['ha_lrm_active'] | bool else '❌' }} | {{ '✅' if audit['ha_crm_active'] | bool else '❌' }} | {{ '✅ yes' if audit['permit_root_login'] | bool else '❌ not yes' }} |
          {% endfor %}

          ## GRUB Cmdline

          {% for node in pve_nodes %}
          - **{{ node }}**: `{{ hostvars[node]['pve_audit']['grub_cmdline'] }}`
          {% endfor %}

          ## Cluster Quorum Status
          {% for node in pve_nodes %}

          ### {{ node }}

          ```
          {{ hostvars[node]['pve_audit']['pvecm_output'] }}
          ```
          {% endfor %}

    # --- Assertions. Each task loops over every node, so all nodes are
    # --- evaluated for a given check before a failure stops the play.
    # --- Kernel and distro version are compared against the first node
    # --- in the group.
    - name: Assert kernel consistency across all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['kernel'] == hostvars[pve_nodes[0]]['pve_audit']['kernel']
        fail_msg: >-
          ❌ Kernel drift: {{ item }} has {{ hostvars[item]['pve_audit']['kernel'] }}
          but {{ pve_nodes[0] }} has {{ hostvars[pve_nodes[0]]['pve_audit']['kernel'] }}
        success_msg: "✅ {{ item }}: kernel {{ hostvars[item]['pve_audit']['kernel'] }}"
      loop: "{{ pve_nodes }}"

    - name: Assert distro version consistency across all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['distro_version'] == hostvars[pve_nodes[0]]['pve_audit']['distro_version']
        fail_msg: >-
          ❌ Distro version drift: {{ item }} has {{ hostvars[item]['pve_audit']['distro_version'] }}
          but {{ pve_nodes[0] }} has {{ hostvars[pve_nodes[0]]['pve_audit']['distro_version'] }}
        success_msg: "✅ {{ item }}: distro {{ hostvars[item]['pve_audit']['distro_version'] }}"
      loop: "{{ pve_nodes }}"

    - name: Assert swap is disabled on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['swap_mb'] | int == 0
        fail_msg: "❌ Swap is enabled on {{ item }}: {{ hostvars[item]['pve_audit']['swap_mb'] }}MB — run pve_baseline.yml --tags storage"
        success_msg: "✅ {{ item }}: swap disabled"
      loop: "{{ pve_nodes }}"

    - name: Assert nag removal script present on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['nag_script_present'] | bool
        fail_msg: "❌ Nag removal script missing on {{ item }} — run pve_baseline.yml --tags nag"
        success_msg: "✅ {{ item }}: nag script present"
      loop: "{{ pve_nodes }}"

    - name: Assert enterprise repos absent on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['enterprise_repos_absent'] | bool
        fail_msg: "❌ Enterprise repo still present on {{ item }} — run pve_baseline.yml --tags repos"
        success_msg: "✅ {{ item }}: enterprise repos absent"
      loop: "{{ pve_nodes }}"

    - name: Assert cluster is quorate
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['quorate'] | bool
        fail_msg: "❌ {{ item }} reports cluster NOT quorate — investigate immediately"
        success_msg: "✅ {{ item }}: cluster quorate"
      loop: "{{ pve_nodes }}"

    - name: Assert HA and Corosync services running on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['corosync_active'] | bool
          - hostvars[item]['pve_audit']['ha_lrm_active'] | bool
          - hostvars[item]['pve_audit']['ha_crm_active'] | bool
        fail_msg: >-
          ❌ HA/Corosync degraded on {{ item }}:
          corosync={{ hostvars[item]['pve_audit']['corosync_active'] }}
          pve-ha-lrm={{ hostvars[item]['pve_audit']['ha_lrm_active'] }}
          pve-ha-crm={{ hostvars[item]['pve_audit']['ha_crm_active'] }}
        success_msg: "✅ {{ item }}: corosync + HA services active"
      loop: "{{ pve_nodes }}"

    - name: Assert PermitRootLogin is enabled on all nodes
      ansible.builtin.assert:
        that:
          - hostvars[item]['pve_audit']['permit_root_login'] | bool
        fail_msg: "❌ PermitRootLogin is not 'yes' on {{ item }} — run pve_baseline.yml --tags ssh to fix"
        success_msg: "✅ {{ item }}: PermitRootLogin yes"
      loop: "{{ pve_nodes }}"