# roles/monitoring_stack/templates/docker-compose.yml.j2
# Complete Watchtower monitoring stack with swarm observability.
#
# === CONCEPT: Docker Compose for Orchestration ===
# This file declares the DESIRED STATE of the monitoring infrastructure.
# Re-running "docker compose up -d" converges the host onto this state, and
# the per-service restart policies keep the containers running in between.
#
# Template notes:
# - This file is rendered by Jinja2 before Compose parses it. Jinja block
#   tags are kept at column 0 so that rendering never shifts YAML indentation.
# - Dollar-brace references are left for Docker Compose to interpolate from
#   the environment / .env file at deploy time; Jinja does not touch them.
# - traefik-kop, Dozzle, Portainer and Promtail all talk to the rootless
#   Docker socket at /run/user/1000/docker.sock.
# - NOTE(review): the loadbalancer.server.port labels below reference the
#   published HOST port variables, which appears to follow the host-port
#   convention described in the traefik-kop README - confirm before changing.

services:
  # === TRAFFIC ROUTER: traefik-kop ===
  # Syncs Traefik configuration from container labels into the Heimdall
  # Redis KV store.
  traefik-kop:
    image: ghcr.io/jittering/traefik-kop:latest
    container_name: traefik-kop-agent
    restart: unless-stopped
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
    environment:
      - "REDIS_ADDR={{ heimdall_redis }}"
      - "BIND_IP={{ watchtower_ip }}"
    networks:
      - monitoring
    labels:
      # The agent itself is never routed; opt out like the other internal
      # services in this file.
      - "traefik.enable=false"

  # === METRICS STORAGE: Prometheus ===
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    # NOTE(review): runs as UID/GID 0, presumably so the bind-mounted data
    # directory is writable - confirm this is still required.
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ prometheus_host_port }}:9090"
    volumes:
      - ./prometheus-config:/etc/prometheus
      - ./prometheus-data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time={{ prometheus_retention }}'
      - '--storage.tsdb.path=/prometheus'
      - '--web.enable-lifecycle'
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === VISUALIZATION: Grafana ===
  grafana:
    image: grafana/grafana-oss:latest
    container_name: grafana
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ grafana_port }}:3000"
    environment:
      - "GF_SERVER_ROOT_URL=https://{{ grafana_domain }}"
      # NOTE(review): both credentials fall back to "admin" when the
      # GF_ADMIN_USER / GF_ADMIN_PASSWORD variables are unset. Grafana is
      # routed through Traefik below, so make sure the deployment always
      # supplies a real password.
      - "GF_SECURITY_ADMIN_USER=${GF_ADMIN_USER:-admin}"
      - "GF_SECURITY_ADMIN_PASSWORD=${GF_ADMIN_PASSWORD:-admin}"
    volumes:
      - ./grafana-data:/var/lib/grafana
      - ./grafana-provisioning:/etc/grafana/provisioning:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.grafana.rule=Host(`{{ grafana_domain }}`)"
      - "traefik.http.routers.grafana.entrypoints=websecure"
      - "traefik.http.routers.grafana.tls.certresolver=myresolver"
      - "traefik.http.services.grafana.loadbalancer.server.port={{ grafana_port }}"

  # === UPTIME MONITORING: Uptime Kuma ===
  uptime-kuma:
    image: louislam/uptime-kuma:1
    container_name: uptime-kuma
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ uptime_kuma_port }}:3001"
    volumes:
      - ./uptime-kuma-data:/app/data
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.uptime.rule=Host(`{{ uptime_domain }}`)"
      - "traefik.http.routers.uptime.entrypoints=websecure"
      - "traefik.http.routers.uptime.tls.certresolver=myresolver"
      - "traefik.http.services.uptime.loadbalancer.server.port={{ uptime_kuma_port }}"

  # === HOST METRICS: Node Exporter ===
  # Collects metrics from Watchtower itself via read-only mounts of the
  # host's /proc, /sys and root filesystem.
  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      - '--path.rootfs=/rootfs'
      # "$$" is the Compose escape for a literal "$" in the regex.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    ports:
      - "9100:9100"
    networks:
      - monitoring
    security_opt:
      - no-new-privileges:true
    read_only: true
    labels:
      # Scraped directly on its published port; never routed through Traefik.
      - "traefik.enable=false"

  # === WATCHTOWER CONTAINER METRICS: cAdvisor ===
  # Captures local container resource usage on Watchtower itself.
  # NOTE(review): these mounts use the rootful Docker paths (/var/run,
  # /var/lib/docker) while other services use the rootless socket under
  # /run/user/1000 - verify cAdvisor can actually see this stack's containers.
  watchtower-cadvisor:
    image: gcr.io/cadvisor/cadvisor:latest
    container_name: watchtower-cadvisor
    restart: unless-stopped
    command:
      - '--housekeeping_interval=30s'
      - '--docker_only=true'
      - '--store_container_labels=false'
      - '--disable_metrics=advtcp,udp,process,sched,referenced_memory,resctrl'
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:ro
      - /sys:/sys:ro
      - /var/lib/docker:/var/lib/docker:ro
      - /dev/disk:/dev/disk:ro
    ports:
      - "18080:8080"
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === NETWORK/ENDPOINT PROBING: Blackbox Exporter ===
  blackbox-exporter:
    # Quoted so an empty or special-character value cannot change the YAML type.
    image: "{{ blackbox_exporter_image }}"
    container_name: blackbox-exporter
    restart: unless-stopped
    command:
      - '--config.file=/etc/blackbox_exporter/blackbox.yml'
    ports:
      - "{{ blackbox_port }}:9115"
    volumes:
      - ./blackbox-config:/etc/blackbox_exporter:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === PROXMOX API METRICS: pve_exporter ===
  # Authenticates to the Proxmox API via a read-only token and exposes
  # VM/node/storage metrics.
  # Credentials are stored in pve-exporter-config/pve.yml (mode 0600,
  # vault-sourced).
  pve-exporter:
    image: prompve/prometheus-pve-exporter:latest
    container_name: pve-exporter
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ pve_exporter_port }}:9221"
    volumes:
      - ./pve-exporter-config:/etc/prometheus:ro
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === LOG VIEWER: Dozzle ===
  # Rendered only when monitoring_enable_dozzle is true. The Authentik
  # outpost further down lives inside the same conditional, so it can only
  # be deployed together with Dozzle.
{% if monitoring_enable_dozzle | bool %}
  dozzle:
    image: amir20/dozzle:v9.0.1
    container_name: dozzle
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ dozzle_port }}:8080"
    read_only: true
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
      - ./dozzle-data:/data
    environment:
      - TZ=America/New_York
      # Comma-separated "address:port" list, one entry per host in the
      # swarm_hosts inventory group. Hosts without an explicit ansible_host
      # fall back to their inventory hostname instead of failing the render.
      - "DOZZLE_REMOTE_AGENT={% for host in groups['swarm_hosts'] %}{{ hostvars[host].ansible_host | default(host) }}:{{ dozzle_agent_port }}{% if not loop.last %},{% endif %}{% endfor %}"
    logging:
      driver: "json-file"
      options:
        max-size: "10m"
        max-file: "3"
    # The root filesystem is read-only, so /tmp is provided as a tmpfs.
    tmpfs:
      - /tmp
    security_opt:
      - no-new-privileges:true
    cap_drop:
      - ALL
    networks:
      - monitoring
    labels:
      - "traefik.enable={{ 'true' if dozzle_expose_via_traefik | bool else 'false' }}"
{% if dozzle_expose_via_traefik | bool %}
      - "traefik.http.routers.dozzle.rule=Host(`{{ dozzle_domain }}`)"
      - "traefik.http.routers.dozzle.entrypoints=websecure"
      - "traefik.http.routers.dozzle.tls.certresolver=myresolver"
      - "traefik.http.services.dozzle.loadbalancer.server.port={{ dozzle_port }}"
{% if monitoring_enable_authentik_outpost | bool %}
      - "traefik.http.routers.dozzle.middlewares=authentik-outpost-dozzle@redis"
{% endif %}
{% endif %}

  # === AUTHENTICATION: Authentik Outpost ===
  # Forward-auth proxy in front of Dozzle; defines the middleware that the
  # Dozzle router references above.
{% if monitoring_enable_authentik_outpost | bool %}
  authentik-outpost-dozzle:
    image: ghcr.io/goauthentik/proxy:2025.10.3
    container_name: authentik-outpost-dozzle
    restart: unless-stopped
    ports:
      - "{{ authentik_outpost_port }}:9000"
    environment:
      - "AUTHENTIK_HOST={{ authentik_host }}"
      - AUTHENTIK_INSECURE=false
      # Fail fast at deploy time instead of starting the outpost with an
      # empty token when the variable is missing.
      - "AUTHENTIK_TOKEN=${AUTHENTIK_OUTPOST_DOZZLE_TOKEN:?AUTHENTIK_OUTPOST_DOZZLE_TOKEN must be set}"
      - "AUTHENTIK_HOST_BROWSER={{ authentik_host }}"
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.authentik-outpost-dozzle.rule=Host(`{{ dozzle_domain }}`) && PathPrefix(`/outpost.goauthentik.io/`)"
      - "traefik.http.routers.authentik-outpost-dozzle.entrypoints=websecure"
      - "traefik.http.routers.authentik-outpost-dozzle.tls.certresolver=myresolver"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.address=http://{{ watchtower_ip }}:{{ authentik_outpost_port }}/outpost.goauthentik.io/auth/traefik"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.trustforwardheader=true"
      - "traefik.http.middlewares.authentik-outpost-dozzle.forwardauth.authresponseheaders=X-authentik-username,X-authentik-groups,X-authentik-email,X-authentik-name,X-authentik-uid"
      - "traefik.http.services.authentik-outpost-dozzle.loadbalancer.server.port={{ authentik_outpost_port }}"
{% endif %}
{% endif %}

  # === CONTAINER MANAGEMENT: Portainer ===
  portainer:
    image: portainer/portainer-ce:latest
    container_name: portainer
    restart: unless-stopped
    ports:
      - "{{ portainer_http_port }}:9000"
      - "{{ portainer_https_port }}:9443"
      - "{{ portainer_edge_port }}:8000"
    volumes:
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
      - ./portainer-data:/data
    networks:
      - monitoring
    labels:
      - "traefik.enable=true"
      - "traefik.http.routers.portainer.rule=Host(`{{ portainer_domain }}`)"
      - "traefik.http.routers.portainer.entrypoints=websecure"
      - "traefik.http.routers.portainer.tls.certresolver=myresolver"
      - "traefik.http.services.portainer.loadbalancer.server.port={{ portainer_http_port }}"

  # === LOG AGGREGATION: Loki ===
  # "Prometheus for logs" - indexes labels, not content.
  loki:
    image: grafana/loki:latest
    container_name: loki
    user: "0:0"
    restart: unless-stopped
    ports:
      - "{{ loki_port }}:3100"
    volumes:
      - ./loki-config:/etc/loki
      - ./loki-data:/loki
    # List form, matching the other services; still a single argument.
    command:
      - '-config.file=/etc/loki/loki-config.yml'
    networks:
      - monitoring
    labels:
      - "traefik.enable=false"

  # === LOG SHIPPER: Promtail ===
  # Reads Docker logs and ships them to Loki.
  promtail:
    image: grafana/promtail:latest
    container_name: promtail
    restart: unless-stopped
    volumes:
      - ./promtail-config:/etc/promtail
      # Same rootless Docker socket that traefik-kop, Dozzle and Portainer
      # mount, so Promtail talks to the daemon that runs this stack.
      - /run/user/1000/docker.sock:/var/run/docker.sock:ro
    command:
      - '-config.file=/etc/promtail/promtail-config.yml'
    networks:
      - monitoring
    depends_on:
      - loki
    labels:
      - "traefik.enable=false"

# === BEST PRACTICE: Dedicated Network ===
# Isolates monitoring traffic from production workloads.
networks:
  monitoring:
    driver: bridge
    name: monitoring