---
# roles/monitoring_stack/templates/alert-rules.yml.j2
# Prometheus alerting rules for homelab monitoring.
#
# Jinja2 escaping: everything between the raw/endraw tags below is emitted
# verbatim, so the Prometheus template syntax used in the annotations reaches
# Prometheus untouched instead of being evaluated by the template engine.
{% raw %}
groups:
  # ---------------------------------------------------------------------
  # Host-level health, based on node_exporter metrics.
  # ---------------------------------------------------------------------
  - name: node_health
    interval: 30s
    rules:
      # === ALERT: High CPU Usage ===
      # Busy percentage = 100 - average idle percentage across all cores
      # of an instance, using the 5-minute idle rate.
      - alert: HighCPUUsage
        expr: >-
          100 - (avg by(instance)
          (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is above 80% for 5 minutes (current: {{ $value }}%)"

      # === ALERT: High Memory Usage ===
      # Used percentage is derived from MemAvailable relative to MemTotal.
      - alert: HighMemoryUsage
        expr: >-
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes))
          * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is above 85% (current: {{ $value }}%)"

      # === ALERT: Low Disk Space ===
      # Only the root filesystem (mountpoint="/") is evaluated.
      - alert: LowDiskSpace
        expr: >-
          (node_filesystem_avail_bytes{mountpoint="/"}
          / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Root filesystem has less than 15% free space (current: {{ $value }}%)"

      # === ALERT: Node Down ===
      # Fires when a scrape target of any job named swarm-* or
      # watchtower-node has failed for 2 minutes.
      - alert: NodeDown
        expr: up{job=~"swarm-.*|watchtower-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "The node has been unreachable for 2 minutes"

  # ---------------------------------------------------------------------
  # Docker Swarm health: manager reachability and container memory.
  # ---------------------------------------------------------------------
  - name: swarm_health
    interval: 30s
    rules:
      # === ALERT: Swarm Manager Down ===
      # NOTE(review): the swarm-managers-node job also matches the NodeDown
      # selector above, so both alerts can fire for the same manager.
      - alert: SwarmManagerDown
        expr: up{job="swarm-managers-node"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Swarm manager {{ $labels.instance }} is down"
          description: "A swarm manager node has been unreachable for 1 minute. Check cluster quorum!"

      # === ALERT: High Container Memory Usage ===
      # Containers without a memory limit export a limit of 0, which makes
      # the ratio +Inf and would trigger the alert permanently; the "> 0"
      # filter drops those series. name!="" skips series that have no
      # container name, which would render an empty name in the summary.
      - alert: HighContainerMemory
        expr: >-
          (container_memory_usage_bytes{name!=""}
          / (container_spec_memory_limit_bytes{name!=""} > 0))
          * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory usage"
          description: "Container is using over 90% of its memory limit (current: {{ $value }}%)"

  # ---------------------------------------------------------------------
  # Proxmox VE health, based on pve_exporter metrics.
  # ---------------------------------------------------------------------
  - name: proxmox_health
    interval: 30s
    rules:
      # === ALERT: Proxmox node unreachable via pve_exporter ===
      - alert: ProxmoxNodeDown
        expr: up{job="proxmox"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Proxmox node {{ $labels.instance }} unreachable"
          description: "pve_exporter cannot reach {{ $labels.instance }} for 2 minutes. Verify API token and network path."

      # === ALERT: QEMU VM in non-running state ===
      # pve_up is labelled only with id (e.g. "qemu/100"); the guest name
      # and type are exposed on pve_guest_info. QEMU guests are selected by
      # id prefix and the name label is joined in for the summary.
      # NOTE(review): VM templates are always stopped; confirm whether they
      # need to be excluded in this environment.
      - alert: ProxmoxVMDown
        expr: >-
          (pve_up{id=~"qemu/.*"} == 0)
          * on(id) group_left(name) pve_guest_info
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "VM {{ $labels.name }} is stopped ({{ $labels.id }})"
          description: "A QEMU VM has been in a non-running state for 5 minutes on cluster pve."

      # === ALERT: Proxmox datastore filling up ===
      # The disk metrics are also exported for nodes and guests; restrict
      # the ratio to storage ids so the alert matches its wording.
      - alert: ProxmoxStorageFull
        expr: >-
          pve_disk_usage_bytes{id=~"storage/.*"}
          / pve_disk_size_bytes{id=~"storage/.*"} > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Proxmox storage {{ $labels.id }} is above 85% full"
          description: "Datastore {{ $labels.id }} usage is at {{ $value | humanizePercentage }}. Review and prune old backups/snapshots."
{% endraw %}

# === PRO-TIP: Alert Routing ===
# Connect these alerts to Alertmanager for notifications
# (Email, Slack, PagerDuty, etc.)
# See: https://prometheus.io/docs/alerting/latest/configuration/