157 lines
4.6 KiB
Django/Jinja

---
# roles/monitoring_stack/templates/prometheus.yml.j2
# Prometheus configuration with dynamic swarm cluster discovery
# Defaults applied to every scrape job unless the job overrides them.
global:
  # Both intervals are driven by the same variable, so rules are evaluated
  # as often as targets are scraped. Quoted so the rendered value is always
  # read as a duration string.
  scrape_interval: '{{ prometheus_scrape_interval }}'
  evaluation_interval: '{{ prometheus_scrape_interval }}'
  # Labels attached to series and alerts when Prometheus communicates with
  # external systems (Alertmanager, remote write, federation).
  external_labels:
    cluster: 'homelab'
    environment: 'production'
# === BEST PRACTICE: Alerting Rules ===
# Alert rules are kept out of this file: every *.yml file matched by the
# glob below is loaded as a rule file, so rules can be changed on their own.
rule_files:
- '/etc/prometheus/alerts/*.yml'
# === CONCEPT: Scrape Configs ===
# Each job defines a set of targets to monitor.
# By default Prometheus scrapes /metrics from each target; a job can
# override the path with metrics_path (the blackbox and proxmox jobs do).
scrape_configs:
# Meta-monitoring: Prometheus scrapes its own endpoint on localhost at the
# port given by prometheus_port.
- job_name: 'prometheus'
  static_configs:
  - targets: ['localhost:{{ prometheus_port }}']
    labels:
      role: 'monitoring'
      host: 'watchtower'
# === WATCHTOWER NODE METRICS ===
# Scrapes the single target node-exporter:9100 and labels it as the
# watchtower controller.
# NOTE(review): "node-exporter" is presumably a container/service name
# resolvable from the Prometheus container - confirm against the stack file.
- job_name: 'watchtower-node'
  static_configs:
  - targets: ['node-exporter:9100']
    labels:
      role: 'controller'
      host: 'watchtower'
# === WATCHTOWER LOCAL CONTAINER METRICS ===
# Scrapes the single target watchtower-cadvisor:8080. The metric_source
# label marks these series as coming from cAdvisor.
- job_name: 'watchtower-containers'
  static_configs:
  - targets: ['watchtower-cadvisor:8080']
    labels:
      role: 'controller'
      host: 'watchtower'
      metric_source: 'cadvisor'
# === SWARM MANAGER NODE METRICS ===
# Generated dynamically: one target per host in the [swarm_managers]
# inventory group, addressed by its ansible_host on port 9100.
# The loop tags stay in column 0 so they contribute no stray indentation
# to the rendered list items.
- job_name: 'swarm-managers-node'
  static_configs:
  - targets:
{% for host in groups['swarm_managers'] %}
    - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
    labels:
      role: 'manager'
      cluster: 'swarm'
# === SWARM WORKER NODE METRICS ===
# One target per host in the [swarm_workers] inventory group, addressed by
# its ansible_host on port 9100.
# The loop tags stay in column 0 so they contribute no stray indentation
# to the rendered list items.
- job_name: 'swarm-workers-node'
  static_configs:
  - targets:
{% for host in groups['swarm_workers'] %}
    - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
    labels:
      role: 'worker'
      cluster: 'swarm'
# === CONTAINER METRICS (cAdvisor) ===
# Swarm managers: one target per host in the [swarm_managers] inventory
# group, addressed by its ansible_host on port 8080.
# The loop tags stay in column 0 so they contribute no stray indentation
# to the rendered list items.
- job_name: 'swarm-managers-containers'
  static_configs:
  - targets:
{% for host in groups['swarm_managers'] %}
    - '{{ hostvars[host].ansible_host }}:8080'
{% endfor %}
    labels:
      role: 'manager'
      cluster: 'swarm'
# Container metrics for swarm workers: one target per host in the
# [swarm_workers] inventory group, addressed by its ansible_host on
# port 8080.
# The loop tags stay in column 0 so they contribute no stray indentation
# to the rendered list items.
- job_name: 'swarm-workers-containers'
  static_configs:
  - targets:
{% for host in groups['swarm_workers'] %}
    - '{{ hostvars[host].ansible_host }}:8080'
{% endfor %}
    labels:
      role: 'worker'
      cluster: 'swarm'
# === PRO-TIP: Docker Hosts ===
# Monitor standalone Docker hosts (heimdall, waldorf)
# The whole job is optional: it is rendered only when the [docker_hosts]
# inventory group exists AND contains at least one host, so an empty group
# does not produce a job with no targets.
# The template tags stay in column 0 so they contribute no stray
# indentation to the rendered output.
{% if groups['docker_hosts'] | default([]) | length > 0 %}
- job_name: 'docker-hosts-node'
  static_configs:
  - targets:
{% for host in groups['docker_hosts'] %}
    - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
    labels:
      role: 'standalone'
{% endif %}
# === BLACKBOX PROBES (NETWORK / ENDPOINT HEALTH) ===
# Each entry of monitoring_probe_targets becomes its own target group so
# that it can carry per-probe labels (probe_name, module). A probe entry
# without a module attribute falls back to http_2xx, the same module the
# job declares under params.
# The scrape itself is sent to the blackbox exporter; the endpoint to probe
# travels as the ?target= URL parameter (see relabel_configs).
- job_name: 'blackbox-probes'
  metrics_path: /probe
  params:
    # Job-wide default; replaced per target through the module label.
    module: [http_2xx]
  static_configs:
{% for probe in monitoring_probe_targets %}
  - targets: ['{{ probe.target }}']
    labels:
      probe_name: '{{ probe.name }}'
      module: '{{ probe.module | default("http_2xx") }}'
{% endfor %}
  relabel_configs:
  # Pass the configured target to the exporter as ?target=...
  - source_labels: [__address__]
    target_label: __param_target
  # Select the exporter module from the per-target module label.
  - source_labels: [module]
    target_label: __param_module
  # Keep the probed endpoint, not the exporter, as the instance label.
  - source_labels: [__param_target]
    target_label: instance
  # Point the actual scrape at the blackbox exporter.
  - target_label: __address__
    replacement: 'blackbox-exporter:{{ blackbox_port }}'
# === PROXMOX CLUSTER METRICS (via pve_exporter) ===
# pve_exporter authenticates to the Proxmox API using a read-only PVEAuditor token.
# Each host in the [proxmox_cluster] inventory group is handed to the
# exporter as ?target=, and the scrape itself is sent to the exporter at
# pve-exporter:9221.
# The loop tags stay in column 0 so they contribute no stray indentation
# to the rendered list items.
- job_name: 'proxmox'
  metrics_path: /pve
  params:
    module: [default]
  static_configs:
  - targets:
{% for host in groups['proxmox_cluster'] %}
    - '{{ hostvars[host].ansible_host }}'
{% endfor %}
    labels:
      cluster: 'pve'
  relabel_configs:
  # Pass the PVE node address to the exporter as ?target=...
  - source_labels: [__address__]
    target_label: __param_target
  # Keep the PVE node, not the exporter, as the instance label.
  - source_labels: [__param_target]
    target_label: instance
  # Point the actual scrape at the pve_exporter service.
  - target_label: __address__
    replacement: 'pve-exporter:9221'
# === FUTURE: Swarm Service Discovery ===
# Uncomment to enable automatic discovery of swarm services.
# Requires access to a swarm manager's Docker API. The example below uses
# the local Docker socket, so it presumably needs Prometheus to run on a
# manager node with /var/run/docker.sock mounted - confirm before enabling.
# - job_name: 'swarm-services'
#   dockerswarm_sd_configs:
#   - host: unix:///var/run/docker.sock
#     role: tasks
#   relabel_configs:
#   - source_labels: [__meta_dockerswarm_service_name]
#     target_label: service