112 lines
4.0 KiB
Django/Jinja
112 lines
4.0 KiB
Django/Jinja
---
|
|
# roles/monitoring_stack/templates/alert-rules.yml.j2
|
|
# Prometheus alerting rules for homelab monitoring
|
|
|
|
# Jinja2 escaping: Prometheus template syntax is wrapped in raw blocks
|
|
{% raw %}
groups:
# Host-level rules. Long queries are written as folded scalars (>-): each line
# break folds to a single space, so the query text Prometheus receives is the
# same as if it were written on one line.
- name: node_health
  interval: 30s
  rules:
  # HighCPUUsage: the per-instance average idle rate over 5m, converted to a
  # busy percentage, has stayed above 80 for 5 minutes.
  - alert: HighCPUUsage
    expr: >-
      100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
      > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High CPU usage on {{ $labels.instance }}'
      description: 'CPU usage is above 80% for 5 minutes (current: {{ $value }}%)'

  # HighMemoryUsage: the share of MemTotal that is not MemAvailable has stayed
  # above 85% for 5 minutes.
  - alert: HighMemoryUsage
    expr: >-
      (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
      > 85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 'High memory usage on {{ $labels.instance }}'
      description: 'Memory usage is above 85% (current: {{ $value }}%)'

  # LowDiskSpace: available bytes on the "/" mountpoint have stayed below 15%
  # of its size for 5 minutes.
  - alert: LowDiskSpace
    expr: >-
      (node_filesystem_avail_bytes{mountpoint="/"}
      / node_filesystem_size_bytes{mountpoint="/"}) * 100
      < 15
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: 'Low disk space on {{ $labels.instance }}'
      description: 'Root filesystem has less than 15% free space (current: {{ $value }}%)'

  # NodeDown: a scrape target in any job named swarm-<anything> or
  # watchtower-node has reported up == 0 for 2 minutes.
  - alert: NodeDown
    expr: 'up{job=~"swarm-.*|watchtower-node"} == 0'
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: 'Node {{ $labels.instance }} is down'
      description: 'The node has been unreachable for 2 minutes'

# Docker Swarm rules: manager reachability and per-container memory pressure.
- name: swarm_health
  interval: 30s
  rules:
  # === ALERT: Swarm Manager Down ===
  # Fires when a scrape target in the swarm-managers-node job has reported
  # up == 0 for 1 minute.
  - alert: SwarmManagerDown
    expr: up{job="swarm-managers-node"} == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      summary: "Swarm manager {{ $labels.instance }} is down"
      description: "A swarm manager node has been unreachable for 1 minute. Check cluster quorum!"

  # === ALERT: High Container Memory Usage ===
  # Fires when a container has used more than 90% of its memory limit for
  # 5 minutes.
  #
  # cAdvisor exports container_spec_memory_limit_bytes as 0 for containers that
  # have no memory limit. Dividing by that 0 yields +Inf, which is always > 90,
  # so without the "> 0" filter every unlimited container would alert forever.
  # The filter drops those series before the division; the parentheses are
  # required because comparison binds more loosely than "/" in PromQL.
  - alert: HighContainerMemory
    expr: >-
      (container_memory_usage_bytes
      / (container_spec_memory_limit_bytes > 0)) * 100
      > 90
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Container {{ $labels.name }} high memory usage"
      description: "Container is using over 90% of its memory limit (current: {{ $value }}%)"

# Proxmox VE rules, based on metrics scraped from pve_exporter
# (the "proxmox" scrape job).
- name: proxmox_health
  interval: 30s
  rules:
  # === ALERT: Proxmox node unreachable via pve_exporter ===
  # Fires when a target of the proxmox job has reported up == 0 for 2 minutes.
  - alert: ProxmoxNodeDown
    expr: up{job="proxmox"} == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      summary: "Proxmox node {{ $labels.instance }} unreachable"
      description: "pve_exporter cannot reach {{ $labels.instance }} for 2 minutes. Verify API token and network path."

  # === ALERT: QEMU VM in non-running state ===
  # Fires when a QEMU guest has reported pve_up == 0 for 5 minutes.
  #
  # pve_exporter labels pve_up only with "id" (e.g. qemu/100); the guest's
  # type, name and node are exposed on the separate pve_guest_info series.
  # Selecting pve_up{type="qemu"} therefore matches nothing, and the "name"
  # label used in the summary would be missing. The join below keeps only
  # QEMU guests and copies name/node onto the result. Matching on
  # (instance, id) keeps the join one-to-one per scrape target even when
  # several targets report the same cluster.
  - alert: ProxmoxVMDown
    expr: >-
      (pve_up{id=~"qemu/.+"} == 0)
      * on(instance, id) group_left(name, node)
      pve_guest_info{type="qemu"}
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "VM {{ $labels.name }} is stopped ({{ $labels.id }})"
      description: "A QEMU VM has been in a non-running state for 5 minutes on cluster pve."

  # === ALERT: Proxmox datastore filling up ===
  # Fires when a storage has been more than 85% full for 5 minutes.
  #
  # pve_disk_usage_bytes / pve_disk_size_bytes are exported for nodes and
  # guests as well as storages. The id matcher restricts the rule to storage
  # resources (id "storage/<node>/<name>") so that it agrees with the
  # datastore wording of the annotations.
  - alert: ProxmoxStorageFull
    expr: >-
      pve_disk_usage_bytes{id=~"storage/.+"}
      / pve_disk_size_bytes{id=~"storage/.+"}
      > 0.85
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: "Proxmox storage {{ $labels.id }} is above 85% full"
      description: >-
        Datastore {{ $labels.id }} usage is at {{ $value | humanizePercentage }}.
        Review and prune old backups/snapshots.
{% endraw %}
|
|
|
|
# === PRO-TIP: Alert Routing ===
|
|
# Connect these alerts to Alertmanager for notifications
|
|
# (Email, Slack, PagerDuty, etc.)
|
|
# See: https://prometheus.io/docs/alerting/latest/configuration/
|