# (file-viewer metadata: 295 lines, 9.6 KiB, Django/Jinja)

# roles/monitoring_stack/templates/docker-compose.yml.j2
# Complete Watchtower monitoring stack with swarm observability
# === CONCEPT: Docker Compose for Orchestration ===
# This file declares the DESIRED STATE of our monitoring infrastructure.
# `docker compose up -d` applies it; afterwards each service's restart policy
# keeps its container running.
# Template conventions used below:
# - Jinja block tags ({% raw %}{% ... %}{% endraw %}) sit at column 0. Ansible renders templates
#   with trim_blocks on and lstrip_blocks off, so whitespace in front of a tag
#   would be prepended to the next YAML line and break the nesting.
# - Templated scalars are quoted so an unexpected rendered value (empty,
#   containing ": " or " #") cannot change the YAML structure.
# - traefik-kop publishes routes that point at BIND_IP, so the
#   `loadbalancer.server.port` labels carry the HOST-published port, not the
#   container port.
# - Services that must not be routed carry an explicit "traefik.enable=false".
services:
  # === TRAFFIC ROUTER: traefik-kop ===
  # Reads Traefik labels from the local Docker socket and publishes the
  # resulting configuration to the Redis KV store at REDIS_ADDR (Heimdall).
  traefik-kop:
    image: ghcr.io/jittering/traefik-kop:latest
    container_name: traefik-kop-agent
    restart: unless-stopped
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
    environment:
      - "REDIS_ADDR={{ heimdall_redis }}"
      - "BIND_IP={{ watchtower_ip }}"
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === METRICS STORAGE: Prometheus ===
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ prometheus_host_port }}:9090"
    volumes:
      - ./prometheus-config:/etc/prometheus
      - ./prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time={{ prometheus_retention }}'
      - '--storage.tsdb.path=/prometheus'
      # Allows config reloads via HTTP POST to /-/reload.
      - '--web.enable-lifecycle'
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === VISUALIZATION: Grafana ===
  grafana:
    image: grafana/grafana-oss:latest
    container_name: grafana
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ grafana_port }}:3000"
    environment:
      - "GF_SERVER_ROOT_URL=https://{{ grafana_domain }}"
      # ${VAR:-default} is resolved by Docker Compose at deploy time, not by Jinja.
      # NOTE(review): the password falls back to "admin" when GF_ADMIN_PASSWORD is
      # unset -- make sure the deployment always provides it.
      - "GF_SECURITY_ADMIN_USER=${GF_ADMIN_USER:-admin}"
      - "GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD:-admin}"
    volumes:
      - ./grafana-data:/var/lib/grafana
      - ./grafana-provisioning:/etc/grafana/provisioning:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.grafana.rule=Host(`{{ grafana_domain }}`)"
      - "traefik.http.routers.grafana.entrypoints=websecure"
      - "traefik.http.routers.grafana.tls.certresolver=myresolver"
      - "traefik.http.services.grafana.loadbalancer.server.port={{ grafana_port }}"

  # === UPTIME MONITORING: Uptime Kuma ===
  uptime-kuma:
    image: louislam/uptime-kuma:1
    container_name: uptime-kuma
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ uptime_kuma_port }}:3001"
    volumes:
      - ./uptime-kuma-data:/app/data
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.uptime.rule=Host(`{{ uptime_domain }}`)"
      - "traefik.http.routers.uptime.entrypoints=websecure"
      - "traefik.http.routers.uptime.tls.certresolver=myresolver"
      - "traefik.http.services.uptime.loadbalancer.server.port={{ uptime_kuma_port }}"

  # === HOST METRICS: Node Exporter ===
  # Collects metrics from Watchtower itself
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # "$$" is Compose's escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    security_opt:
      - no-new-privileges:true
    read_only: true
    labels:
      # Port 9100 is published; keep it out of Traefik like the other exporters.
      - "traefik.enable=false"

  # === WATCHTOWER CONTAINER METRICS: cAdvisor ===
  # Captures local container resource usage on Watchtower itself.
  watchtower-cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: watchtower-cadvisor
    restart: unless-stopped
    command:
      - '--housekeeping_interval=30s'
      - '--docker_only=true'
      - '--store_container_labels=false'
      - '--disable_metrics=advtcp,udp,process,sched,referenced_memory,resctrl'
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    ports:
      - "18080:8080"
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === NETWORK/ENDPOINT PROBING: Blackbox Exporter ===
  blackbox-exporter:
    image: "{{ blackbox_exporter_image }}"
    container_name: blackbox-exporter
    restart: unless-stopped
    command:
      - '--config.file=/etc/blackbox_exporter/blackbox.yml'
    ports:
      - "{{ blackbox_port }}:9115"
    volumes:
      - ./blackbox-config:/etc/blackbox_exporter:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === PROXMOX API METRICS: pve_exporter ===
  # Authenticates to Proxmox API via read-only token and exposes VM/node/storage metrics.
  # Credentials are stored in pve-exporter-config/pve.yml (mode 0600, vault-sourced).
  pve-exporter:
    image: prompve/prometheus-pve-exporter:latest
    container_name: pve-exporter
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ pve_exporter_port }}:9221"
    volumes:
      - ./pve-exporter-config:/etc/prometheus:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === LOG VIEWER: Dozzle ===
  # Everything from here to the matching endif (Dozzle AND its Authentik
  # outpost) is rendered only when monitoring_enable_dozzle is true.
{% if monitoring_enable_dozzle | bool %}
  dozzle:
    image: amir20/dozzle:v9.0.1
    container_name: dozzle
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ dozzle_port }}:8080"
    read_only: true
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
      - ./dozzle-data:/data
    environment:
      - TZ=America/New_York
      # Comma-separated "<address>:<port>" list, one entry per host in the
      # swarm_hosts inventory group. Falls back to the inventory hostname when
      # a host does not define ansible_host.
      - "DOZZLE_REMOTE_AGENT={% for host in groups['swarm_hosts'] %}{{ hostvars[host].ansible_host | default(host) }}:{{ dozzle_agent_port }}{% if not loop.last %},{% endif %}{% endfor %}"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    # Writable scratch space, needed because the root filesystem is read-only.
    tmpfs:
      - /tmp
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    networks:
      - monitoring
    labels:
      - "traefik.enable={{ 'true' if dozzle_expose_via_traefik | bool else 'false' }}"
{% if dozzle_expose_via_traefik | bool %}
      - "traefik.http.routers.dozzle.rule=Host(`{{ dozzle_domain }}`)"
      - "traefik.http.routers.dozzle.entrypoints=websecure"
      - "traefik.http.routers.dozzle.tls.certresolver=myresolver"
      - "traefik.http.services.dozzle.loadbalancer.server.port={{ dozzle_port }}"
{% if monitoring_enable_authentik_outpost | bool %}
      # Forward-auth middleware defined by the authentik-outpost-dozzle labels.
      - "traefik.http.routers.dozzle.middlewares=authentik-outpost-dozzle@redis"
{% endif %}
{% endif %}

  # === AUTHENTICATION: Authentik Outpost ===
{% if monitoring_enable_authentik_outpost | bool %}
  authentik-outpost-dozzle:
    image: ghcr.io/goauthentik/proxy:2025.10.3
    container_name: authentik-outpost-dozzle
    restart: unless-stopped
    ports:
      - "{{ authentik_outpost_port }}:9000"
    environment:
      - "AUTHENTIK_HOST={{ authentik_host }}"
      - "AUTHENTIK_INSECURE=false"
      # Resolved by Docker Compose from its environment / .env at deploy time.
      - "AUTHENTIK_TOKEN=${AUTHENTIK_OUTPOST_DOZZLE_TOKEN}"
      - "AUTHENTIK_HOST_BROWSER={{ authentik_host }}"
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.authentik-outpost-dozzle.rule=Host(`{{ dozzle_domain }}`) && PathPrefix(`/outpost.goauthentik.io/`)"
      - "traefik.http.routers.authentik-outpost-dozzle.entrypoints=websecure"
      - "traefik.http.routers.authentik-outpost-dozzle.tls.certresolver=myresolver"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.address=http://{{ watchtower_ip }}:{{ authentik_outpost_port }}/outpost.goauthentik.io/auth/traefik"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.trustforwardheader=true"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.authresponseheaders=X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid"
      - "traefik.http.services.authentik-outpost-dozzle.loadbalancer.server.port={{ authentik_outpost_port }}"
{% endif %}
{% endif %}

  # === CONTAINER MANAGEMENT: Portainer ===
  portainer:
    image: portainer/portainer-ce:latest
    container_name: portainer
    restart: unless-stopped
    ports:
      - "{{ portainer_http_port }}:9000"
      - "{{ portainer_https_port }}:9443"
      - "{{ portainer_edge_port }}:8000"
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
      - ./portainer-data:/data
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.portainer.rule=Host(`{{ portainer_domain }}`)"
      - "traefik.http.routers.portainer.entrypoints=websecure"
      - "traefik.http.routers.portainer.tls.certresolver=myresolver"
      - "traefik.http.services.portainer.loadbalancer.server.port={{ portainer_http_port }}"

  # === LOG AGGREGATION: Loki ===
  # "Prometheus for logs" - indexes labels, not content
  loki:
    image: grafana/loki:latest
    container_name: loki
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ loki_port }}:3100"
    volumes:
      - ./loki-config:/etc/loki
      - ./loki-data:/loki
    command: '-config.file=/etc/loki/loki-config.yml'
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === LOG SHIPPER: Promtail ===
  # Reads Docker logs and ships to Loki
  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    restart: unless-stopped
    volumes:
      - ./promtail-config:/etc/promtail
      # NOTE(review): the other services mount /run/user/1000/docker.sock;
      # confirm that /var/run/docker.sock is the intended daemon socket here.
      - /var/run/docker.sock:/var/run/docker.sock:ro
    command: '-config.file=/etc/promtail/promtail-config.yml'
    networks:
      - monitoring
    depends_on:
      - loki
    labels:
      - "traefik.enable=false"
# === BEST PRACTICE: Dedicated Network ===
# Isolates monitoring traffic from production workloads.
# Every service above attaches to this single user-defined bridge network.
networks:
  monitoring:
    driver: bridge
    # Fixed name, so the network is not prefixed with the Compose project name.
    name: monitoring