112 lines
4.0 KiB
Django/Jinja

---
# roles/monitoring_stack/templates/alert-rules.yml.j2
# Prometheus alerting rules for homelab monitoring
# Jinja2 escaping: Prometheus template syntax is wrapped in raw blocks
{% raw %}
groups:
- name: node_health
  # Rules in this group are evaluated every 30 seconds.
  interval: 30s
  rules:
  # HighCPUUsage (warning): per-instance non-idle CPU share, derived from the
  # 5m rate of idle CPU seconds, has stayed above 80% for 5 minutes.
  - alert: HighCPUUsage
    expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High CPU usage on {{ $labels.instance }}"
      description: "CPU usage is above 80% for 5 minutes (current: {{ $value }}%)"

  # HighMemoryUsage (warning): used memory, computed as the complement of
  # MemAvailable / MemTotal, has stayed above 85% for 5 minutes.
  - alert: HighMemoryUsage
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "High memory usage on {{ $labels.instance }}"
      description: "Memory usage is above 85% (current: {{ $value }}%)"

  # LowDiskSpace (critical): available space on the "/" mountpoint has been
  # below 15% of the filesystem size for 5 minutes.
  - alert: LowDiskSpace
    expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Low disk space on {{ $labels.instance }}"
      description: "Root filesystem has less than 15% free space (current: {{ $value }}%)"

  # NodeDown (critical): a scrape target whose job name matches "swarm-.*" or
  # equals "watchtower-node" has reported up == 0 for 2 minutes.
  - alert: NodeDown
    expr: up{job=~"swarm-.*|watchtower-node"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Node {{ $labels.instance }} is down"
      description: "The node has been unreachable for 2 minutes"
- name: swarm_health
  # Rules in this group are evaluated every 30 seconds.
  interval: 30s
  rules:
  # SwarmManagerDown (critical): a target of the "swarm-managers-node" job has
  # reported up == 0 for 1 minute.
  - alert: SwarmManagerDown
    expr: up{job="swarm-managers-node"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Swarm manager {{ $labels.instance }} is down"
      description: "A swarm manager node has been unreachable for 1 minute. Check cluster quorum!"

  # HighContainerMemory (warning): a container has used more than 90% of its
  # configured memory limit for 5 minutes.
  #
  # The "> 0" filter on the denominator drops series whose limit is 0, which
  # is how containers without a memory limit are reported. Without it the
  # division yields +Inf for those series and the alert fires for every
  # unlimited container.
  - alert: HighContainerMemory
    expr: (container_memory_usage_bytes / (container_spec_memory_limit_bytes > 0)) * 100 > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high memory usage"
      description: "Container is using over 90% of its memory limit (current: {{ $value }}%)"
- name: proxmox_health
  # Rules in this group are evaluated every 30 seconds.
  interval: 30s
  rules:
  # ProxmoxNodeDown (critical): the scrape of the "proxmox" job has reported
  # up == 0 for 2 minutes.
  - alert: ProxmoxNodeDown
    expr: up{job="proxmox"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Proxmox node {{ $labels.instance }} unreachable"
      description: "pve_exporter cannot reach {{ $labels.instance }} for 2 minutes. Verify API token and network path."

  # ProxmoxVMDown (warning): a QEMU guest has reported pve_up == 0 for
  # 5 minutes.
  #
  # pve_up only carries the "id" label; the guest "type", "name" and "node"
  # labels are exported on pve_guest_info. The join below restricts the alert
  # to QEMU guests and copies "name" and "node" onto the result so the summary
  # annotation can render them. pve_guest_info is always 1, so the alert value
  # stays 0.
  # NOTE(review): stopped VM templates will also match; confirm whether they
  # should be excluded for this cluster.
  - alert: ProxmoxVMDown
    expr: (pve_up == 0) * on(id, instance) group_left(name, node) pve_guest_info{type="qemu"}
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "VM {{ $labels.name }} is stopped ({{ $labels.id }})"
      description: "A QEMU VM has been in a non-running state for 5 minutes on cluster pve."

  # ProxmoxStorageFull (warning): a storage has been more than 85% full for
  # 5 minutes.
  #
  # The disk metrics are also exported for nodes and guests, so both sides are
  # limited to ids starting with "storage/" to match what the annotations
  # describe.
  - alert: ProxmoxStorageFull
    expr: pve_disk_usage_bytes{id=~"storage/.+"} / pve_disk_size_bytes{id=~"storage/.+"} > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Proxmox storage {{ $labels.id }} is above 85% full"
      description: "Datastore {{ $labels.id }} usage is at {{ $value | humanizePercentage }}. Review and prune old backups/snapshots."
{% endraw %}
# === PRO-TIP: Alert Routing ===
# Connect these alerts to Alertmanager for notifications
# (Email, Slack, PagerDuty, etc.)
# See: https://prometheus.io/docs/alerting/latest/configuration/