# homelab/ansible/archive/roles/monitoring_stack/templates/alert-rules.yml.j2

---
# roles/monitoring_stack/templates/alert-rules.yml.j2
# Prometheus alerting rules for homelab monitoring

# Jinja2 escaping: Prometheus template syntax is wrapped in raw blocks
{% raw %}
groups:
  # Group: host-level health (CPU, memory, root filesystem, scrape reachability).
  # Rules in this group are evaluated every 30s; each alert must stay true for
  # its `for:` window before it fires.
  - name: node_health
    interval: 30s
    rules:
      # === ALERT: High CPU Usage ===
      # Busy % = 100 - (per-instance average of the 5m idle rate) * 100.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          # printf limits the value to one decimal instead of the raw float.
          description: 'CPU usage is above 80% for 5 minutes (current: {{ $value | printf "%.1f" }}%)'

      # === ALERT: High Memory Usage ===
      # Used % = (1 - MemAvailable / MemTotal) * 100.
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: 'Memory usage is above 85% (current: {{ $value | printf "%.1f" }}%)'

      # === ALERT: Low Disk Space ===
      # Free % of the filesystem mounted at "/" only; other mountpoints are
      # not covered by this rule.
      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: 'Root filesystem has less than 15% free space (current: {{ $value | printf "%.1f" }}%)'

      # === ALERT: Node Down ===
      # `up == 0` means the last scrape of the target failed.
      # NOTE: the job regex `swarm-.*` also matches `swarm-managers-node`, so an
      # unreachable manager can raise both this alert and SwarmManagerDown.
      - alert: NodeDown
        expr: up{job=~"swarm-.*|watchtower-node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "The node has been unreachable for 2 minutes"

  # Group: Docker Swarm manager reachability and container memory pressure.
  # Rules in this group are evaluated every 30s.
  - name: swarm_health
    interval: 30s
    rules:
      # === ALERT: Swarm Manager Down ===
      # Fires after the manager scrape job has failed for 1 minute.
      - alert: SwarmManagerDown
        expr: up{job="swarm-managers-node"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Swarm manager {{ $labels.instance }} is down"
          description: "A swarm manager node has been unreachable for 1 minute. Check cluster quorum!"

      # === ALERT: High Container Memory Usage ===
      # Containers without a memory limit report a limit of 0; dividing by it
      # yields +Inf, which would satisfy `> 90` permanently. The `> 0` filter
      # drops those series before the division.
      # `name!=""` skips series without a container name, because the summary
      # below depends on that label.
      - alert: HighContainerMemory
        expr: (container_memory_usage_bytes{name!=""} / (container_spec_memory_limit_bytes{name!=""} > 0)) * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Container {{ $labels.name }} high memory usage"
          # printf limits the value to one decimal instead of the raw float.
          description: 'Container is using over 90% of its memory limit (current: {{ $value | printf "%.1f" }}%)'

  # Group: Proxmox VE health, based on metrics scraped by the `proxmox` job.
  # Rules in this group are evaluated every 30s.
  - name: proxmox_health
    interval: 30s
    rules:
      # === ALERT: Proxmox node unreachable via pve_exporter ===
      - alert: ProxmoxNodeDown
        expr: up{job="proxmox"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Proxmox node {{ $labels.instance }} unreachable"
          description: "pve_exporter cannot reach {{ $labels.instance }} for 2 minutes. Verify API token and network path."

      # === ALERT: QEMU VM in non-running state ===
      # pve_exporter's `pve_up` carries only an `id` label (e.g. "qemu/100");
      # the guest `name`, `node` and `type` labels live on `pve_guest_info`.
      # QEMU guests are therefore selected by id prefix, and the info metric is
      # joined in so `$labels.name` is populated for the summary.
      # NOTE(review): label set taken from the exporter's README — confirm
      # against the deployed exporter version.
      - alert: ProxmoxVMDown
        expr: pve_up{id=~"qemu/.*"} * on(instance, id) group_left(name, node) pve_guest_info == 0
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "VM {{ $labels.name }} is stopped ({{ $labels.id }})"
          description: "A QEMU VM has been in a non-running state for 5 minutes on cluster pve."

      # === ALERT: Proxmox datastore filling up ===
      # Restricted to `storage/...` ids: the same disk metrics are also exported
      # for nodes and guests, which the datastore wording below would
      # misdescribe.
      # The ratio is a 0-1 fraction, which is the input humanizePercentage
      # expects.
      - alert: ProxmoxStorageFull
        expr: pve_disk_usage_bytes{id=~"storage/.*"} / pve_disk_size_bytes{id=~"storage/.*"} > 0.85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Proxmox storage {{ $labels.id }} is above 85% full"
          description: "Datastore {{ $labels.id }} usage is at {{ $value | humanizePercentage }}. Review and prune old backups/snapshots."
{% endraw %}

# === PRO-TIP: Alert Routing ===
# Connect these alerts to Alertmanager for notifications
# (Email, Slack, PagerDuty, etc.)
# See: https://prometheus.io/docs/alerting/latest/configuration/