---
# playbooks/monitoring/deploy_swarm_monitoring.yml
# Complete observability stack deployment for Docker Swarm cluster + standalone hosts
#
# === ARCHITECTURE OVERVIEW ===
# This playbook deploys a three-tier monitoring solution:
#
# TIER 1: Data Collection (Swarm Nodes + Standalone Docker Hosts)
#   - node-exporter: Host metrics (CPU, RAM, disk, network) on swarm nodes and standalone hosts
#   - cAdvisor: Container metrics (per-container resource usage) on swarm nodes only
#
# TIER 2: Aggregation & Storage (Watchtower)
#   - Prometheus: Metrics time-series database
#   - Loki: Log aggregation and indexing
#
# TIER 3: Visualization & Alerting (Watchtower)
#   - Grafana: Dashboards and data exploration
#   - Uptime Kuma: HTTP health checks
#   - Dozzle: Real-time log viewer
#
# === PREREQUISITES ===
#   - Docker Swarm cluster is initialized and running
#   - All nodes are accessible via SSH
#   - Docker is installed on all nodes (swarm + standalone hosts)
#   - Authentik token is set in group_vars (for Dozzle auth)
#
# === USAGE ===
# Deploy full stack (swarm nodes, standalone hosts, and watchtower):
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
#
# Deploy only to swarm nodes:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags swarm
#
# Deploy only to standalone docker hosts:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags docker-hosts
#
# Deploy only watchtower stack:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags watchtower
#
# Regenerate only the quick-reference guide:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags docs

# ---------------------------------------------------------------------------
# PLAY 1: node-exporter + cAdvisor on every swarm node (managers and workers)
# ---------------------------------------------------------------------------
- name: Deploy monitoring exporters on swarm nodes
  hosts: swarm_hosts
  become: false
  gather_facts: true
  tags: ['swarm', 'exporters']

  pre_tasks:
    - name: Verify Docker is installed
      ansible.builtin.command: docker --version
      register: docker_check
      changed_when: false
      failed_when: docker_check.rc != 0

    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying monitoring exporters to: {{ inventory_hostname }}"
          - " Role: {{ 'Manager' if inventory_hostname in groups['swarm_managers'] else 'Worker' }}"
          - " IP: {{ ansible_host }}"

  roles:
    - role: swarm_node_exporter
      tags: ['node-exporter']
    - role: swarm_cadvisor
      tags: ['cadvisor']

  # Validation lives in this play so that it runs on every host the exporters
  # were deployed to (all of swarm_hosts), not only on the managers.
  post_tasks:
    - name: Validate exporter endpoints
      ansible.builtin.uri:
        url: "{{ item.url }}"
        method: GET
        status_code: 200
      loop:
        - name: "node-exporter"
          url: "http://localhost:9100/metrics"
        - name: "cAdvisor"
          url: "http://localhost:8080/metrics"
      loop_control:
        label: "{{ item.name }}"
      register: endpoint_check
      # Explicit `until` so the retries take effect on every ansible-core version.
      until: endpoint_check is succeeded
      retries: 3
      delay: 5

    - name: Display exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: All exporters are healthy"

# ---------------------------------------------------------------------------
# PLAY 2: Dozzle agent service, driven from the first manager in the group
# ---------------------------------------------------------------------------
- name: Deploy Dozzle swarm agents
  hosts: swarm_managers
  become: false
  gather_facts: false
  tags: ['swarm', 'dozzle-agent']

  tasks:
    # Only the first host listed in swarm_managers includes the role;
    # the other managers skip this task.
    - name: Deploy and validate dozzle-agent service from primary manager
      ansible.builtin.include_role:
        name: swarm_dozzle_agent
      when: inventory_hostname == groups['swarm_managers'][0]

# ---------------------------------------------------------------------------
# PLAY 3: node-exporter on standalone (non-swarm) docker hosts
# ---------------------------------------------------------------------------
- name: Deploy node-exporter on standalone docker hosts
  hosts: docker_hosts
  become: false
  gather_facts: true
  tags: ['docker-hosts', 'exporters', 'node-exporter']

  pre_tasks:
    - name: Verify Docker is installed
      ansible.builtin.command: docker --version
      register: docker_check
      changed_when: false
      failed_when: docker_check.rc != 0

    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying node-exporter to standalone docker host: {{ inventory_hostname }}"
          - " IP: {{ ansible_host }}"
          - " Purpose: Hardware and software metrics collection"

  tasks:
    # The play runs unprivileged; `apply` elevates only the role's own tasks.
    # The tag is set on the include and on the applied tasks so tag filtering
    # selects both (the play already carries the same tag).
    - name: Deploy node-exporter role with elevated privileges
      ansible.builtin.include_role:
        name: swarm_node_exporter
        apply:
          become: true
          tags: ['node-exporter']
      tags: ['node-exporter']

  post_tasks:
    - name: Validate node-exporter endpoint
      ansible.builtin.uri:
        url: "http://localhost:9100/metrics"
        method: GET
        status_code: 200
      register: exporter_check
      # Explicit `until` so the retries take effect on every ansible-core version.
      until: exporter_check is succeeded
      retries: 3
      delay: 5

    - name: Display node-exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: node-exporter deployed and healthy on port 9100"

# ---------------------------------------------------------------------------
# PLAY 4: Prometheus / Loki / Grafana / Uptime Kuma / Dozzle on Watchtower,
#         followed by edge-route reconciliation in Redis
# ---------------------------------------------------------------------------
- name: Deploy monitoring stack on Watchtower
  hosts: watchtower
  connection: local
  become: false
  gather_facts: true
  tags: ['watchtower', 'stack']

  vars:
    # Canonical encrypted vars location (ADR-008)
    vault_encrypted_vars_file: "{{ playbook_dir }}/../../group_vars/vault/all.yml"
    # SSH target used for the Redis edge-routing tasks below. Falls back to
    # 10.0.0.151 when edge_routing.edge_host.ip is not defined.
    watchtower_edge_host_ip: "{{ (edge_routing | default({})).get('edge_host', {}).get('ip', '10.0.0.151') }}"

  pre_tasks:
    - name: Check vault encrypted vars file state
      ansible.builtin.stat:
        path: "{{ vault_encrypted_vars_file }}"
      register: vault_vars_file_state

    - name: Load encrypted vars when present
      ansible.builtin.include_vars:
        file: "{{ vault_encrypted_vars_file }}"
        name: vault_vars
      when: vault_vars_file_state.stat.exists
      no_log: true

    # Resolution order per secret: vault file -> existing variable / `secrets`
    # mapping -> environment variable -> empty string.
    - name: Resolve monitoring secrets from vault or environment fallback
      ansible.builtin.set_fact:
        grafana_admin_password: >-
          {{
            (
              vault_vars.vault_grafana_admin_password
              if (vault_vars is defined and 'vault_grafana_admin_password' in vault_vars)
              else (grafana_admin_password | default(''))
            ) | default('', true)
          }}
        authentik_outpost_dozzle_token: >-
          {{
            (
              vault_vars.vault_authentik_outpost_dozzle_token
              if (vault_vars is defined and 'vault_authentik_outpost_dozzle_token' in vault_vars)
              else (
                secrets.AUTHENTIK_OUTPOST_DOZZLE_TOKEN
                if (secrets is defined and 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN' in secrets)
                else lookup('env', 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN')
              )
            ) | default('', true)
          }}
        pve_exporter_token: >-
          {{
            (
              vault_vars.vault_pve_exporter_token
              if (vault_vars is defined and 'vault_pve_exporter_token' in vault_vars)
              else lookup('env', 'PVE_EXPORTER_TOKEN')
            ) | default('', true)
          }}
      no_log: true

    - name: Verify Docker Compose V2 is available
      ansible.builtin.command: docker compose version
      register: compose_check
      changed_when: false
      failed_when: compose_check.rc != 0

    - name: Display Watchtower deployment info
      ansible.builtin.debug:
        msg:
          - "🏗️ Deploying monitoring stack to Watchtower"
          - " Swarm targets: {{ groups['swarm_managers'] | length }} managers + {{ groups['swarm_workers'] | length }} workers"
          - " Standalone hosts: {{ groups['docker_hosts'] | length }} (node-exporter)"
          - " Total monitored nodes: {{ groups['swarm_hosts'] | length + groups['docker_hosts'] | length + 1 }} (including Watchtower)"

  roles:
    - role: monitoring_stack

  post_tasks:
    # The Prometheus checks and the full summary are skipped in focus mode
    # unless the focused service is prometheus.
    - name: Wait for Prometheus to be ready
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/-/ready"
        method: GET
        status_code: 200
      register: prometheus_ready
      # Explicit `until` so this task really polls on every ansible-core version.
      until: prometheus_ready is succeeded
      retries: 10
      delay: 5
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    # NOTE(review): the response is registered but not inspected by any later
    # task in this playbook; this only proves the targets API answers.
    - name: Verify Prometheus can scrape all targets
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/api/v1/targets"
        method: GET
        return_content: true
      register: prometheus_targets
      until: prometheus_targets is succeeded
      retries: 3
      delay: 10
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    # grafana, uptime and portainer are always listed; dozzle and the
    # authentik outpost are added only when their feature flags are true.
    - name: Build watchtower edge route backend reconciliation list
      ansible.builtin.set_fact:
        watchtower_edge_route_backends: >-
          {{
            [
              {'name': 'grafana', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (grafana_port | string)},
              {'name': 'uptime', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (uptime_kuma_port | string)}
            ]
            + (
              [
                {'name': 'dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (dozzle_port | string)}
              ]
              if (monitoring_enable_dozzle | default(false) | bool)
                and (dozzle_expose_via_traefik | default(false) | bool)
              else []
            )
            + (
              [
                {'name': 'authentik-outpost-dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (authentik_outpost_port | string)}
              ]
              if monitoring_enable_authentik_outpost | default(false) | bool
              else []
            )
            + [
              {'name': 'portainer', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (portainer_http_port | string)}
            ]
          }}

    # Writes each backend URL unconditionally, hence always reported as changed.
    - name: Reconcile watchtower service backends in Redis edge routing
      ansible.builtin.command: >-
        ssh {{ watchtower_edge_host_ip }}
        sudo docker exec redis redis-cli SET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
        {{ item.url }}
      changed_when: true
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }} -> {{ item.url }}"

    - name: Verify reconciled watchtower service backends in Redis
      ansible.builtin.command: >-
        ssh {{ watchtower_edge_host_ip }}
        sudo docker exec redis redis-cli GET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
      register: watchtower_route_backend_reads
      changed_when: false
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }}"

    # Each read-back result carries its originating loop item in `item.item`.
    - name: Assert watchtower service backends are reconciled to host IP routes
      ansible.builtin.assert:
        that:
          - item.stdout == item.item.url
        fail_msg: >-
          Edge route drift persisted for {{ item.item.name }}.
          Expected {{ item.item.url }}, got {{ item.stdout | default('') }}.
        success_msg: >-
          Edge route {{ item.item.name }} correctly reconciled to {{ item.item.url }}.
      loop: "{{ watchtower_route_backend_reads.results }}"
      loop_control:
        label: "{{ item.item.name }}"

    - name: Display monitoring stack summary
      ansible.builtin.debug:
        msg:
          - "╔════════════════════════════════════════════════════════╗"
          - "║ 🎉 SWARM MONITORING STACK DEPLOYED SUCCESSFULLY! ║"
          - "╚════════════════════════════════════════════════════════╝"
          - ""
          - "📊 METRICS & DASHBOARDS:"
          - " Prometheus: http://{{ watchtower_ip }}:{{ prometheus_host_port }}"
          - " Grafana: https://{{ grafana_domain }}"
          - ""
          - "📋 LOGS:"
          - " Dozzle: https://{{ dozzle_domain }}"
          - " Loki API: http://{{ watchtower_ip }}:{{ loki_port }}"
          - ""
          - "✅ UPTIME:"
          - " Uptime Kuma: https://{{ uptime_domain }}"
          - ""
          - "🔍 NEXT STEPS:"
          - " 1. Open Grafana: https://{{ grafana_domain }}"
          - " 2. Verify provisioned data sources: {{ grafana_prometheus_datasource_name }} + {{ grafana_loki_datasource_name }}"
          - " 3. Review the provisioned dashboard folder: {{ grafana_dashboards_folder }}"
          - " 4. Optionally import extra dashboards: 1860, 893, 13639, 10347"
          - " 5. Configure Uptime Kuma health checks for swarm services"
          - ""
          - "📚 CONCEPTS YOU LEARNED:"
          - " ✓ Multi-tier monitoring architecture"
          - " ✓ Prometheus service discovery & scraping"
          - " ✓ Loki label-based log indexing"
          - " ✓ Ansible roles for modular infrastructure"
          - " ✓ Idempotent deployment (run this playbook anytime!)"
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    - name: Display focused deployment summary
      ansible.builtin.debug:
        msg:
          - "Focused deployment completed"
          - "Service: {{ monitoring_focus_service | default('not-set') }}"
          - "Mode: additive (existing running services preserved)"
      when: monitoring_focus_mode | default(false) | bool and (monitoring_focus_service | default('') != 'prometheus')

# ---------------------------------------------------------------------------
# PLAY 5: render a markdown quick-reference guide on the control node
# ---------------------------------------------------------------------------
- name: Generate monitoring documentation
  hosts: localhost
  connection: local
  gather_facts: false
  tags: ['docs']
  run_once: true

  tasks:
    # Facts are not gathered in this play, so the timestamp comes from the
    # now() template function instead of the ansible_date_time fact.
    # The Prometheus URL prefers prometheus_host_port (the port the Watchtower
    # play probes) and falls back to prometheus_port.
    - name: Create monitoring quick-reference guide
      ansible.builtin.copy:
        dest: "{{ playbook_dir }}/../../documentation/swarm-monitoring-guide.md"
        mode: '0644'
        content: |
          # Docker Swarm Monitoring Guide

          **Deployed:** {{ now(utc=true).strftime('%Y-%m-%dT%H:%M:%SZ') }}
          **Cluster:** {{ groups['swarm_hosts'] | length }} nodes ({{ groups['swarm_managers'] | length }} managers, {{ groups['swarm_workers'] | length }} workers)

          ## Quick Access

          | Service | URL | Purpose |
          |---------|-----|---------|
          | Prometheus | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].prometheus_host_port | default(hostvars['localhost'].prometheus_port) }} | Metrics storage & query |
          | Grafana | https://{{ hostvars['localhost'].grafana_domain }} | Dashboards & visualization |
          | Loki | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].loki_port }} | Log aggregation |
          | Dozzle | https://{{ hostvars['localhost'].dozzle_domain }} | Real-time log viewer |
          | Uptime Kuma | https://{{ hostvars['localhost'].uptime_domain }} | Service uptime tracking |

          ## Monitored Nodes

          ### Managers
          {% for host in groups['swarm_managers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
            - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
            - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}

          ### Workers
          {% for host in groups['swarm_workers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
            - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
            - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}

          ## Useful Prometheus Queries

          ```promql
          # Total cluster CPU usage
          100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

          # Memory usage per node
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

          # Container count per node
          count(container_last_seen) by (instance)

          # Network traffic by node
          rate(node_network_receive_bytes_total[5m])
          ```

          ## Troubleshooting

          ### Exporter not reachable
          ```bash
          # Check if container is running
          ansible swarm_hosts -i inventory/hosts.ini -m ansible.builtin.shell -a "docker ps | grep exporter"

          # Check firewall
          ansible swarm_hosts -i inventory/hosts.ini -m ansible.builtin.shell -a "ss -tlnp | grep -E '9100|8080'"
          ```

          ### Prometheus shows target down
          ```bash
          # Test from Watchtower
          curl http://<node-ip>:9100/metrics
          curl http://<node-ip>:8080/metrics
          ```

          ## Maintenance

          ### Update all monitoring components
          ```bash
          cd /home/chester/homelab/ansible
          ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
          ```

          ### View Prometheus configuration
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/prometheus.yml
          ```

          ### Check alert rules
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/alerts/homelab.yml
          ```
      register: docs_created

    - name: Display documentation location
      ansible.builtin.debug:
        msg: "📚 Monitoring guide created at: {{ docs_created.dest }}"
      when: docs_created.changed