# roles/monitoring_stack/templates/docker-compose.yml.j2
# Complete Watchtower monitoring stack with swarm observability

# === CONCEPT: Docker Compose for Orchestration ===
# This file declares the DESIRED STATE of our monitoring infrastructure.
# `docker compose up` reconciles the running containers to match it, and each
# service's restart policy brings its container back after a crash or reboot.

services:

# === TRAFFIC ROUTER: traefik-kop ===
  # Syncs Traefik configuration with the Heimdall Redis KV store.
  # The agent is given the local Docker socket read-only, the Redis address
  # (REDIS_ADDR) and this host's address (BIND_IP).
  traefik-kop:
    container_name: traefik-kop-agent
    image: ghcr.io/jittering/traefik-kop:latest
    restart: unless-stopped
    networks:
      - monitoring
    environment:
      - "REDIS_ADDR={{ heimdall_redis }}"
      - "BIND_IP={{ watchtower_ip }}"
    volumes:
      - "/run/user/1000/docker.sock:/var/run/docker.sock:ro"

# === METRICS STORAGE: Prometheus ===
  # Metrics database. Configuration and TSDB data are bind-mounted from
  # directories next to this compose file; retention is taken from
  # prometheus_retention. The container process runs as 0:0.
  prometheus:
    container_name: prometheus
    image: prom/prometheus:latest
    restart: unless-stopped
    user: "0:0"
    networks:
      - monitoring
    ports:
      - "{{ prometheus_host_port }}:9090"
    volumes:
      - "./prometheus-config:/etc/prometheus"
      - "./prometheus-data:/prometheus"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.retention.time={{ prometheus_retention }}"
      - "--storage.tsdb.path=/prometheus"
      # Turns on the HTTP reload/quit endpoints of Prometheus.
      - "--web.enable-lifecycle"
    labels:
      # Explicitly excluded from Traefik routing.
      - "traefik.enable=false"

# === VISUALIZATION: Grafana ===
  # Dashboard UI, published on grafana_port and labelled for Traefik routing at
  # grafana_domain over the websecure entrypoint.
  # The ${...} references are resolved by Docker Compose (shell environment or
  # .env file) when the stack is started, not by the Ansible template engine.
  grafana:
    image: grafana/grafana-oss:latest
    container_name: grafana
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ grafana_port }}:3000"
    environment:
      - "GF_SERVER_ROOT_URL=https://{{ grafana_domain }}"
      - "GF_SECURITY_ADMIN_USER=${GF_ADMIN_USER:-admin}"
      # No fallback on purpose: this service is routed publicly, so a default
      # admin password must never be used. Compose aborts with the message
      # below when GF_ADMIN_PASSWORD is unset or empty.
      - "GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD:?GF_ADMIN_PASSWORD must be set}"
    volumes:
      - "./grafana-data:/var/lib/grafana"
      - "./grafana-provisioning:/etc/grafana/provisioning:ro"
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.grafana.rule=Host(`{{ grafana_domain }}`)"
      - "traefik.http.routers.grafana.entrypoints=websecure"
      - "traefik.http.routers.grafana.tls.certresolver=myresolver"
      # NOTE(review): this is the published host port, not container port 3000;
      # presumably required because traefik-kop advertises host bindings - confirm.
      - "traefik.http.services.grafana.loadbalancer.server.port={{ grafana_port }}"

# === UPTIME MONITORING: Uptime Kuma ===
  # Uptime/status UI, published on uptime_kuma_port and labelled for Traefik
  # routing at uptime_domain. Application state lives in ./uptime-kuma-data.
  uptime-kuma:
    container_name: uptime-kuma
    image: louislam/uptime-kuma:1
    restart: unless-stopped
    user: "0:0"
    networks:
      - monitoring
    ports:
      - "{{ uptime_kuma_port }}:3001"
    volumes:
      - "./uptime-kuma-data:/app/data"
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.uptime.rule=Host(`{{ uptime_domain }}`)"
      - "traefik.http.routers.uptime.entrypoints=websecure"
      - "traefik.http.routers.uptime.tls.certresolver=myresolver"
      # Published host port, matching the left-hand side of the port mapping.
      - "traefik.http.services.uptime.loadbalancer.server.port={{ uptime_kuma_port }}"

# === HOST METRICS: Node Exporter ===
  # Collects metrics from Watchtower itself.
  # The host's /proc, /sys and / are mounted read-only and the exporter is
  # pointed at them via the --path.* flags. The container itself runs with a
  # read-only root filesystem and no-new-privileges.
  node-exporter:
    container_name: node-exporter
    image: prom/node-exporter:latest
    restart: unless-stopped
    read_only: true
    security_opt:
      - "no-new-privileges:true"
    networks:
      - monitoring
    ports:
      - "9100:9100"
    volumes:
      - "/proc:/host/proc:ro"
      - "/sys:/host/sys:ro"
      - "/:/rootfs:ro"
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--path.rootfs=/rootfs"
      # "$$" is Compose's escape for a literal "$" in the regular expression.
      - "--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)"

# === WATCHTOWER CONTAINER METRICS: cAdvisor ===
  # Captures local container resource usage on Watchtower itself.
  # Published on host port 18080 and excluded from Traefik routing.
  # NOTE(review): other services in this file reach Docker through
  # /run/user/1000/docker.sock, while this one mounts /var/run and
  # /var/lib/docker - confirm these are the right paths for this host's daemon.
  watchtower-cadvisor:
    container_name: watchtower-cadvisor
    image: gcr.io/cadvisor/cadvisor:latest
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - "18080:8080"
    volumes:
      - "/:/rootfs:ro"
      - "/var/run:/var/run:ro"
      - "/sys:/sys:ro"
      - "/var/lib/docker:/var/lib/docker:ro"
      - "/dev/disk:/dev/disk:ro"
    command:
      - "--housekeeping_interval=30s"
      - "--docker_only=true"
      - "--store_container_labels=false"
      - "--disable_metrics=advtcp,udp,process,sched,referenced_memory,resctrl"
    labels:
      - "traefik.enable=false"

# === NETWORK/ENDPOINT PROBING: Blackbox Exporter ===
  # Probe exporter, published on blackbox_port. Its configuration directory is
  # mounted read-only and selected with --config.file. Not routed by Traefik.
  blackbox-exporter:
    # Quoted so that an empty or odd-looking expansion of the variable still
    # renders as a YAML string instead of null or another type.
    image: "{{ blackbox_exporter_image }}"
    container_name: blackbox-exporter
    restart: unless-stopped
    command:
      - "--config.file=/etc/blackbox_exporter/blackbox.yml"
    ports:
      - "{{ blackbox_port }}:9115"
    volumes:
      - "./blackbox-config:/etc/blackbox_exporter:ro"
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

# === PROXMOX API METRICS: pve_exporter ===
  # Authenticates to the Proxmox API with a read-only token and exposes
  # VM/node/storage metrics. Credentials are stored in
  # pve-exporter-config/pve.yml (mode 0600, vault-sourced); that directory is
  # mounted read-only at /etc/prometheus inside the container.
  pve-exporter:
    container_name: pve-exporter
    image: prompve/prometheus-pve-exporter:latest
    restart: unless-stopped
    user: "0:0"
    networks:
      - monitoring
    ports:
      - "{{ pve_exporter_port }}:9221"
    volumes:
      - "./pve-exporter-config:/etc/prometheus:ro"
    labels:
      - "traefik.enable=false"

# === LOG VIEWER: Dozzle ===
{% if monitoring_enable_dozzle | bool %}
  # Rendered only when monitoring_enable_dozzle is true.
  # Hardening applied here: read-only root filesystem, /tmp on tmpfs, every
  # capability dropped and no-new-privileges. Container logs are capped at
  # 3 files of 10m each. Template block tags are kept at column 0 so they add
  # no leading whitespace to the rendered lines.
  dozzle:
    container_name: dozzle
    image: amir20/dozzle:v9.0.1
    restart: unless-stopped
    user: "0:0"
    read_only: true
    cap_drop:
      - ALL
    security_opt:
      - "no-new-privileges:true"
    tmpfs:
      - /tmp
    networks:
      - monitoring
    ports:
      - "{{ dozzle_port }}:8080"
    volumes:
      - "/run/user/1000/docker.sock:/var/run/docker.sock:ro"
      - "./dozzle-data:/data"
    environment:
      - "TZ=America/New_York"
      # Comma-separated "address:port" list, one entry per swarm_hosts member.
      - "DOZZLE_REMOTE_AGENT={% for node in groups['swarm_hosts'] %}{{ hostvars[node].ansible_host }}:{{ dozzle_agent_port }}{{ '' if loop.last else ',' }}{% endfor %}"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    labels:
      - "traefik.enable={{ 'true' if dozzle_expose_via_traefik | bool else 'false' }}"
{% if dozzle_expose_via_traefik | bool %}
      # Router/service labels are emitted only when Traefik exposure is on.
      - "traefik.http.routers.dozzle.rule=Host(`{{ dozzle_domain }}`)"
      - "traefik.http.routers.dozzle.entrypoints=websecure"
      - "traefik.http.routers.dozzle.tls.certresolver=myresolver"
      - "traefik.http.services.dozzle.loadbalancer.server.port={{ dozzle_port }}"
{% if monitoring_enable_authentik_outpost | bool %}
      # Attach the forward-auth middleware declared on the outpost service below.
      - "traefik.http.routers.dozzle.middlewares=authentik-outpost-dozzle@redis"
{% endif %}
{% endif %}

# === AUTHENTICATION: Authentik Outpost ===
{% if monitoring_enable_authentik_outpost | bool %}
  # Rendered only when both Dozzle and the Authentik outpost are enabled.
  # AUTHENTIK_OUTPOST_DOZZLE_TOKEN is resolved by Docker Compose at start-up.
  authentik-outpost-dozzle:
    container_name: authentik-outpost-dozzle
    image: ghcr.io/goauthentik/proxy:2025.10.3
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - "{{ authentik_outpost_port }}:9000"
    environment:
      - "AUTHENTIK_HOST={{ authentik_host }}"
      - "AUTHENTIK_INSECURE=false"
      - "AUTHENTIK_TOKEN=${AUTHENTIK_OUTPOST_DOZZLE_TOKEN}"
      - "AUTHENTIK_HOST_BROWSER={{ authentik_host }}"
    labels:
      - "traefik.enable=true"
      # The outpost answers its own path prefix on the Dozzle hostname.
      - "traefik.http.routers.authentik-outpost-dozzle.rule=Host(`{{ dozzle_domain }}`) && PathPrefix(`/outpost.goauthentik.io/`)"
      - "traefik.http.routers.authentik-outpost-dozzle.entrypoints=websecure"
      - "traefik.http.routers.authentik-outpost-dozzle.tls.certresolver=myresolver"
      # Forward-auth middleware pointing at this host's published outpost port.
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.address=http://{{ watchtower_ip }}:{{ authentik_outpost_port }}/outpost.goauthentik.io/auth/traefik"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.trustforwardheader=true"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.authresponseheaders=X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid"
      - "traefik.http.services.authentik-outpost-dozzle.loadbalancer.server.port={{ authentik_outpost_port }}"
{% endif %}
{% endif %}

# === CONTAINER MANAGEMENT: Portainer ===
  # Container management UI. Three ports are published (HTTP, HTTPS and edge),
  # so the Traefik service label names the HTTP one explicitly. The Docker
  # socket is mounted with the :ro flag; state is kept in ./portainer-data.
  portainer:
    container_name: portainer
    image: portainer/portainer-ce:latest
    restart: unless-stopped
    networks:
      - monitoring
    ports:
      - "{{ portainer_http_port }}:9000"
      - "{{ portainer_https_port }}:9443"
      - "{{ portainer_edge_port }}:8000"
    volumes:
      - "/run/user/1000/docker.sock:/var/run/docker.sock:ro"
      - "./portainer-data:/data"
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.portainer.rule=Host(`{{ portainer_domain }}`)"
      - "traefik.http.routers.portainer.entrypoints=websecure"
      - "traefik.http.routers.portainer.tls.certresolver=myresolver"
      - "traefik.http.services.portainer.loadbalancer.server.port={{ portainer_http_port }}"

# === LOG AGGREGATION: Loki ===
  # "Prometheus for logs": indexes labels rather than log content.
  # Published on loki_port; configuration and data directories are
  # bind-mounted from next to this compose file. Not routed by Traefik.
  loki:
    container_name: loki
    image: grafana/loki:latest
    restart: unless-stopped
    user: "0:0"
    networks:
      - monitoring
    ports:
      - "{{ loki_port }}:3100"
    volumes:
      - "./loki-config:/etc/loki"
      - "./loki-data:/loki"
    command: "-config.file=/etc/loki/loki-config.yml"
    labels:
      - "traefik.enable=false"

# === LOG SHIPPER: Promtail ===
  # Reads Docker logs and ships them to Loki; started after the loki service.
  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    restart: unless-stopped
    volumes:
      - "./promtail-config:/etc/promtail"
      # Same host socket that traefik-kop, Dozzle and Portainer mount, so
      # Promtail talks to the daemon that actually runs this stack. The
      # container-side path is unchanged.
      - "/run/user/1000/docker.sock:/var/run/docker.sock:ro"
    command: "-config.file=/etc/promtail/promtail-config.yml"
    networks:
      - monitoring
    depends_on:
      - loki

# === BEST PRACTICE: Dedicated Network ===
# Isolates monitoring traffic from production workloads.
# Every service above attaches to this one bridge network; the explicit name
# stops Compose from prefixing it with the project name.
networks:
  monitoring:
    name: monitoring
    driver: bridge