---
# roles/monitoring_stack/templates/prometheus.yml.j2
# Prometheus configuration with dynamic swarm cluster discovery
#
# Template layout note: Jinja control tags are written at column 0, one per
# line. With the trim_blocks behaviour that Ansible's template module enables
# by default, those lines vanish from the rendered file, so the indentation
# seen below is exactly the indentation Prometheus will parse.
# NOTE(review): assumes the role does not override trim_blocks - confirm.

global:
  # Rule evaluation deliberately reuses the scrape interval variable.
  scrape_interval: '{{ prometheus_scrape_interval }}'
  evaluation_interval: '{{ prometheus_scrape_interval }}'
  external_labels:
    cluster: 'homelab'
    environment: 'production'

# === BEST PRACTICE: Alerting Rules ===
# Separate alert rules into external files for maintainability
rule_files:
  - '/etc/prometheus/alerts/*.yml'

# === CONCEPT: Scrape Configs ===
# Each job defines a set of targets to monitor.
# Prometheus scrapes /metrics from each endpoint unless the job overrides
# metrics_path (as the blackbox and proxmox jobs below do).
scrape_configs:
  # Monitor Prometheus itself (meta-monitoring)
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:{{ prometheus_port }}']
        labels:
          role: 'monitoring'
          host: 'watchtower'

  # === WATCHTOWER NODE METRICS ===
  - job_name: 'watchtower-node'
    static_configs:
      - targets: ['node-exporter:9100']
        labels:
          role: 'controller'
          host: 'watchtower'

  # === WATCHTOWER LOCAL CONTAINER METRICS ===
  - job_name: 'watchtower-containers'
    static_configs:
      - targets: ['watchtower-cadvisor:8080']
        labels:
          role: 'controller'
          host: 'watchtower'
          metric_source: 'cadvisor'

  # === SWARM MANAGER NODE METRICS ===
  # Generated dynamically from [swarm_managers] inventory group
  - job_name: 'swarm-managers-node'
    static_configs:
      - targets:
{% for host in groups['swarm_managers'] %}
          - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
        labels:
          role: 'manager'
          cluster: 'swarm'

  # === SWARM WORKER NODE METRICS ===
  # Generated dynamically from [swarm_workers] inventory group
  - job_name: 'swarm-workers-node'
    static_configs:
      - targets:
{% for host in groups['swarm_workers'] %}
          - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
        labels:
          role: 'worker'
          cluster: 'swarm'

  # === CONTAINER METRICS (cAdvisor) ===
  # Same inventory groups as the node jobs above, scraped on port 8080.
  - job_name: 'swarm-managers-containers'
    static_configs:
      - targets:
{% for host in groups['swarm_managers'] %}
          - '{{ hostvars[host].ansible_host }}:8080'
{% endfor %}
        labels:
          role: 'manager'
          cluster: 'swarm'

  - job_name: 'swarm-workers-containers'
    static_configs:
      - targets:
{% for host in groups['swarm_workers'] %}
          - '{{ hostvars[host].ansible_host }}:8080'
{% endfor %}
        labels:
          role: 'worker'
          cluster: 'swarm'

  # === PRO-TIP: Docker Hosts ===
  # Monitor standalone Docker hosts (heimdall, waldorf)
  # The job is emitted only when the docker_hosts group exists AND has at
  # least one member, so an empty group never renders a job with no targets.
{% if groups['docker_hosts'] | default([]) | length > 0 %}
  - job_name: 'docker-hosts-node'
    static_configs:
      - targets:
{% for host in groups['docker_hosts'] %}
          - '{{ hostvars[host].ansible_host }}:9100'
{% endfor %}
        labels:
          role: 'standalone'
{% endif %}

  # === BLACKBOX PROBES (NETWORK / ENDPOINT HEALTH) ===
  # One static_config entry is rendered per item of monitoring_probe_targets.
  # Each item supplies target and name; module is optional and falls back to
  # http_2xx, the same default declared under params.
  - job_name: 'blackbox-probes'
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
{% for probe in monitoring_probe_targets %}
      - targets: ['{{ probe.target }}']
        labels:
          probe_name: '{{ probe.name }}'
          module: '{{ probe.module | default("http_2xx") }}'
{% endfor %}
    relabel_configs:
      # Pass the probed endpoint to the exporter as ?target=
      - source_labels: [__address__]
        target_label: __param_target
      # Per-probe module label overrides the job-level module parameter
      - source_labels: [module]
        target_label: __param_module
      # Keep the probed endpoint as the instance label
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to the blackbox exporter
      - target_label: __address__
        replacement: 'blackbox-exporter:{{ blackbox_port }}'

  # === PROXMOX CLUSTER METRICS (via pve_exporter) ===
  # pve_exporter authenticates to the Proxmox API using a read-only
  # PVEAuditor token.
  # Each PVE node is passed as ?target= and the request is routed through
  # the exporter.
  - job_name: 'proxmox'
    metrics_path: /pve
    params:
      module: [default]
    static_configs:
      - targets:
{% for host in groups['proxmox_cluster'] %}
          - '{{ hostvars[host].ansible_host }}'
{% endfor %}
        labels:
          cluster: 'pve'
    relabel_configs:
      # Pass the PVE node address to the exporter as ?target=
      - source_labels: [__address__]
        target_label: __param_target
      # Keep the PVE node address as the instance label
      - source_labels: [__param_target]
        target_label: instance
      # Send the actual scrape request to pve_exporter
      - target_label: __address__
        replacement: 'pve-exporter:9221'

  # === FUTURE: Swarm Service Discovery ===
  # Uncomment to enable automatic discovery of swarm services
  # Requires Docker API to be exposed on managers
  # - job_name: 'swarm-services'
  #   dockerswarm_sd_configs:
  #     - host: unix:///var/run/docker.sock
  #       role: tasks
  #   relabel_configs:
  #     - source_labels: [__meta_dockerswarm_service_name]
  #       target_label: service