---
# playbooks/monitoring/deploy_swarm_monitoring.yml
# Complete observability stack deployment for Docker Swarm cluster + standalone hosts
#
# === ARCHITECTURE OVERVIEW ===
# This playbook deploys a three-tier monitoring solution:
#
# TIER 1: Data Collection (Swarm Nodes + Standalone Docker Hosts)
#   - node-exporter: Host metrics (CPU, RAM, disk, network) on swarm nodes and standalone hosts
#   - cAdvisor: Container metrics (per-container resource usage) on swarm nodes only
#
# TIER 2: Aggregation & Storage (Watchtower)
#   - Prometheus: Metrics time-series database
#   - Loki: Log aggregation and indexing
#
# TIER 3: Visualization & Alerting (Watchtower)
#   - Grafana: Dashboards and data exploration
#   - Uptime Kuma: HTTP health checks
#   - Dozzle: Real-time log viewer
#
# === PREREQUISITES ===
#   - Docker Swarm cluster is initialized and running
#   - All nodes are accessible via SSH
#   - Docker is installed on all nodes (swarm + standalone hosts)
#   - Authentik token is set in group_vars (for Dozzle auth)
#
# === USAGE ===
# Deploy full stack (swarm nodes, standalone hosts, and watchtower):
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
#
# Deploy only to swarm nodes:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags swarm
#
# Deploy only to standalone docker hosts:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags docker-hosts
#
# Deploy only watchtower stack:
#   ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags watchtower

# Play: apply the swarm_node_exporter and swarm_cadvisor roles to every host
# in the swarm_hosts group. Pre-tasks confirm the docker CLI runs and print
# which node is being targeted.
- name: Deploy monitoring exporters on swarm nodes
  hosts: swarm_hosts
  gather_facts: true
  become: false
  tags:
    - swarm
    - exporters

  pre_tasks:
    # Read-only probe: never reports "changed", fails on a non-zero exit code.
    - name: Verify Docker is installed
      ansible.builtin.command:
        cmd: docker --version
      register: docker_check
      failed_when: docker_check.rc != 0
      changed_when: false

    # Role label is derived from membership in the swarm_managers group.
    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying monitoring exporters to: {{ inventory_hostname }}"
          - " Role: {{ 'Manager' if inventory_hostname in groups['swarm_managers'] else 'Worker' }}"
          - " IP: {{ ansible_host }}"

  roles:
    - role: swarm_node_exporter
      tags:
        - node-exporter

    - role: swarm_cadvisor
      tags:
        - cadvisor

# Play: include the swarm_dozzle_agent role, then smoke-test the exporter
# endpoints. The play targets every host in swarm_managers, but the role is
# included only on the first manager listed in that group.
- name: Deploy Dozzle swarm agents
  hosts: swarm_managers
  become: false
  gather_facts: false
  tags: ['swarm', 'dozzle-agent']

  tasks:
    - name: Deploy and validate dozzle-agent service from primary manager
      ansible.builtin.include_role:
        name: swarm_dozzle_agent
      when: inventory_hostname == groups['swarm_managers'][0]

  post_tasks:
    # NOTE(review): this validation lives in the Dozzle play, so it only runs
    # on swarm_managers and only when the 'swarm' or 'dozzle-agent' tag is
    # selected. Workers are never probed here - confirm whether it should
    # live in the exporter play instead.
    #
    # `until` is stated explicitly so the retries/delay below take effect on
    # every ansible-core version; without it, older releases run the request
    # exactly once. With a loop, the registered variable inside `until` is the
    # result of the current item.
    - name: Validate exporter endpoints
      ansible.builtin.uri:
        url: "{{ item.url }}"
        method: GET
        status_code: 200
      loop:
        - { name: "node-exporter", url: "http://localhost:9100/metrics" }
        - { name: "cAdvisor", url: "http://localhost:8080/metrics" }
      loop_control:
        label: "{{ item.name }}"
      register: endpoint_check
      until: endpoint_check is succeeded
      retries: 3
      delay: 5

    # Only reached when every endpoint above answered with HTTP 200.
    - name: Display exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: All exporters are healthy"

# Play: apply the swarm_node_exporter role to every host in the docker_hosts
# group, then confirm the exporter answers on port 9100. The play itself runs
# unprivileged; only the tasks inside the included role are escalated.
- name: Deploy node-exporter on standalone docker hosts
  hosts: docker_hosts
  become: false
  gather_facts: true
  tags: ['docker-hosts', 'exporters', 'node-exporter']

  pre_tasks:
    # Read-only probe: never reports "changed", fails on a non-zero exit code.
    - name: Verify Docker is installed
      ansible.builtin.command: docker --version
      register: docker_check
      changed_when: false
      failed_when: docker_check.rc != 0

    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying node-exporter to standalone docker host: {{ inventory_hostname }}"
          - " IP: {{ ansible_host }}"
          - " Purpose: Hardware and software metrics collection"

  tasks:
    # `apply` pushes become: true onto the tasks inside the role.
    - name: Deploy node-exporter role with elevated privileges
      ansible.builtin.include_role:
        name: swarm_node_exporter
        apply:
          become: true
      tags: ['node-exporter']

  post_tasks:
    # `until` is stated explicitly so the retries/delay below take effect on
    # every ansible-core version; without it, older releases run the request
    # exactly once and fail if the exporter is still starting.
    - name: Validate node-exporter endpoint
      ansible.builtin.uri:
        url: "http://localhost:9100/metrics"
        method: GET
        status_code: 200
      register: exporter_check
      until: exporter_check is succeeded
      retries: 3
      delay: 5

    # Only reached when the endpoint above answered with HTTP 200.
    - name: Display node-exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: node-exporter deployed and healthy on port 9100"

# Play: apply the monitoring_stack role to the watchtower host over a local
# connection. Pre-tasks resolve secrets and check for Docker Compose V2;
# post-tasks wait for Prometheus, write the service backend URLs into Redis on
# the edge host, read them back to assert they match, and print a summary.
- name: Deploy monitoring stack on Watchtower
  hosts: watchtower
  connection: local
  become: false
  gather_facts: true
  tags: ['watchtower', 'stack']

  vars:
    # Canonical encrypted vars location (ADR-008)
    vault_encrypted_vars_file: "{{ playbook_dir }}/../../group_vars/vault/all.yml"

  pre_tasks:
    - name: Check vault encrypted vars file state
      ansible.builtin.stat:
        path: "{{ vault_encrypted_vars_file }}"
      register: vault_vars_file_state

    # Loaded into the vault_vars namespace; skipped when the file is absent,
    # in which case vault_vars stays undefined and the fallbacks below apply.
    - name: Load encrypted vars when present
      ansible.builtin.include_vars:
        file: "{{ vault_encrypted_vars_file }}"
        name: vault_vars
      when: vault_vars_file_state.stat.exists
      no_log: true

    # Resolution order per secret:
    #   grafana_admin_password:         vault_vars -> existing variable -> ''
    #   authentik_outpost_dozzle_token: vault_vars -> secrets dict -> env -> ''
    #   pve_exporter_token:             vault_vars -> env -> ''
    # default('', true) also replaces empty/falsy values with ''.
    - name: Resolve monitoring secrets from vault or environment fallback
      ansible.builtin.set_fact:
        grafana_admin_password: >-
          {{
            (
              vault_vars.vault_grafana_admin_password
              if (vault_vars is defined and 'vault_grafana_admin_password' in vault_vars)
              else (grafana_admin_password | default(''))
            ) | default('', true)
          }}
        authentik_outpost_dozzle_token: >-
          {{
            (
              vault_vars.vault_authentik_outpost_dozzle_token
              if (vault_vars is defined and 'vault_authentik_outpost_dozzle_token' in vault_vars)
              else (
                secrets.AUTHENTIK_OUTPOST_DOZZLE_TOKEN
                if (secrets is defined and 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN' in secrets)
                else lookup('env', 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN')
              )
            ) | default('', true)
          }}
        pve_exporter_token: >-
          {{
            (
              vault_vars.vault_pve_exporter_token
              if (vault_vars is defined and 'vault_pve_exporter_token' in vault_vars)
              else lookup('env', 'PVE_EXPORTER_TOKEN')
            ) | default('', true)
          }}
      no_log: true

    # Read-only probe: never reports "changed", fails on a non-zero exit code.
    - name: Verify Docker Compose V2 is available
      ansible.builtin.command: docker compose version
      register: compose_check
      changed_when: false
      failed_when: compose_check.rc != 0

    - name: Display Watchtower deployment info
      ansible.builtin.debug:
        msg:
          - "🏗️ Deploying monitoring stack to Watchtower"
          - " Swarm targets: {{ groups['swarm_managers'] | length }} managers + {{ groups['swarm_workers'] | length }} workers"
          - " Standalone hosts: {{ groups['docker_hosts'] | length }} (node-exporter)"
          - " Total monitored nodes: {{ groups['swarm_hosts'] | length + groups['docker_hosts'] | length + 1 }} (including Watchtower)"

  roles:
    - role: monitoring_stack

  post_tasks:
    # The Prometheus checks run on a full deployment, or in focus mode only
    # when the focused service is prometheus.
    #
    # `until` is stated explicitly so the retries/delay take effect on every
    # ansible-core version; without it, older releases issue a single request
    # and this "wait" would not wait at all.
    - name: Wait for Prometheus to be ready
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/-/ready"
        method: GET
        status_code: 200
      register: prometheus_ready
      until: prometheus_ready is succeeded
      retries: 10
      delay: 5
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    # NOTE(review): this only confirms the targets API responds; the returned
    # content is registered but not inspected by any later task in this play.
    - name: Verify Prometheus can scrape all targets
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/api/v1/targets"
        method: GET
        return_content: true
      register: prometheus_targets
      until: prometheus_targets is succeeded
      retries: 3
      delay: 10
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    # grafana, uptime and portainer are always listed; dozzle and the
    # authentik outpost are appended only when their feature flags are true.
    - name: Build watchtower edge route backend reconciliation list
      ansible.builtin.set_fact:
        watchtower_edge_route_backends: >-
          {{
            [
              {'name': 'grafana', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (grafana_port | string)},
              {'name': 'uptime', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (uptime_kuma_port | string)}
            ]
            +
            (
              [
                {'name': 'dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (dozzle_port | string)}
              ]
              if (monitoring_enable_dozzle | default(false) | bool) and (dozzle_expose_via_traefik | default(false) | bool)
              else []
            )
            +
            (
              [
                {'name': 'authentik-outpost-dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (authentik_outpost_port | string)}
              ]
              if monitoring_enable_authentik_outpost | default(false) | bool
              else []
            )
            +
            [
              {'name': 'portainer', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (portainer_http_port | string)}
            ]
          }}

    # Runs redis-cli over ssh on the edge host (edge_routing.edge_host.ip,
    # falling back to 10.0.0.151). The SET is issued unconditionally, so the
    # task always reports "changed".
    - name: Reconcile watchtower service backends in Redis edge routing
      ansible.builtin.command: >-
        ssh {{ (edge_routing | default({})).get('edge_host', {}).get('ip', '10.0.0.151') }}
        sudo docker exec redis redis-cli SET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
        {{ item.url }}
      changed_when: true
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }} -> {{ item.url }}"

    # Read each key back so the assertion below can compare it to the
    # expected URL.
    - name: Verify reconciled watchtower service backends in Redis
      ansible.builtin.command: >-
        ssh {{ (edge_routing | default({})).get('edge_host', {}).get('ip', '10.0.0.151') }}
        sudo docker exec redis redis-cli GET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
      register: watchtower_route_backend_reads
      changed_when: false
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }}"

    # item is one GET result; item.item is the backend entry it was read for.
    - name: Assert watchtower service backends are reconciled to host IP routes
      ansible.builtin.assert:
        that:
          - item.stdout == item.item.url
        fail_msg: >-
          Edge route drift persisted for {{ item.item.name }}.
          Expected {{ item.item.url }}, got {{ item.stdout | default('') }}.
        success_msg: >-
          Edge route {{ item.item.name }} correctly reconciled to {{ item.item.url }}.
      loop: "{{ watchtower_route_backend_reads.results }}"
      loop_control:
        label: "{{ item.item.name }}"

    - name: Display monitoring stack summary
      ansible.builtin.debug:
        msg:
          - "╔════════════════════════════════════════════════════════╗"
          - "║ 🎉 SWARM MONITORING STACK DEPLOYED SUCCESSFULLY! ║"
          - "╚════════════════════════════════════════════════════════╝"
          - ""
          - "📊 METRICS & DASHBOARDS:"
          - " Prometheus: http://{{ watchtower_ip }}:{{ prometheus_host_port }}"
          - " Grafana: https://{{ grafana_domain }}"
          - ""
          - "📋 LOGS:"
          - " Dozzle: https://{{ dozzle_domain }}"
          - " Loki API: http://{{ watchtower_ip }}:{{ loki_port }}"
          - ""
          - "✅ UPTIME:"
          - " Uptime Kuma: https://{{ uptime_domain }}"
          - ""
          - "🔍 NEXT STEPS:"
          - " 1. Open Grafana: https://{{ grafana_domain }}"
          - " 2. Verify provisioned data sources: {{ grafana_prometheus_datasource_name }} + {{ grafana_loki_datasource_name }}"
          - " 3. Review the provisioned dashboard folder: {{ grafana_dashboards_folder }}"
          - " 4. Optionally import extra dashboards: 1860, 893, 13639, 10347"
          - " 5. Configure Uptime Kuma health checks for swarm services"
          - ""
          - "📚 CONCEPTS YOU LEARNED:"
          - " ✓ Multi-tier monitoring architecture"
          - " ✓ Prometheus service discovery & scraping"
          - " ✓ Loki label-based log indexing"
          - " ✓ Ansible roles for modular infrastructure"
          - " ✓ Idempotent deployment (run this playbook anytime!)"
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')

    # Shown instead of the full summary when focus mode targets a service
    # other than prometheus.
    - name: Display focused deployment summary
      ansible.builtin.debug:
        msg:
          - "Focused deployment completed"
          - "Service: {{ monitoring_focus_service | default('not-set') }}"
          - "Mode: additive (existing running services preserved)"
      when: monitoring_focus_mode | default(false) | bool and (monitoring_focus_service | default('') != 'prometheus')

# Play: render a Markdown quick-reference guide into the repository's
# documentation directory on the control node. Facts are not gathered, so the
# template must not rely on gathered facts such as ansible_date_time.
- name: Generate monitoring documentation
  hosts: localhost
  connection: local
  gather_facts: false
  tags: ['docs']
  run_once: true

  tasks:
    # The timestamp uses now() because gather_facts is false here and no
    # earlier play targets localhost, so ansible_date_time would be undefined.
    # The format string matches ansible_date_time.iso8601 (UTC, 'Z' suffix).
    # Because the timestamp differs on every run, this task reports "changed"
    # each time.
    #
    # The Prometheus link prefers prometheus_host_port (the variable the
    # deploy play uses for host-reachable URLs) and falls back to
    # prometheus_port when the former is not defined for localhost.
    #
    # The ad-hoc commands in the guide use the shell module because the
    # default command module does not interpret pipes.
    - name: Create monitoring quick-reference guide
      ansible.builtin.copy:
        dest: "{{ playbook_dir }}/../../documentation/swarm-monitoring-guide.md"
        mode: '0644'
        content: |
          # Docker Swarm Monitoring Guide

          **Deployed:** {{ now(utc=true, fmt='%Y-%m-%dT%H:%M:%SZ') }}
          **Cluster:** {{ groups['swarm_hosts'] | length }} nodes ({{ groups['swarm_managers'] | length }} managers, {{ groups['swarm_workers'] | length }} workers)

          ## Quick Access

          | Service | URL | Purpose |
          |---------|-----|---------|
          | Prometheus | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].prometheus_host_port | default(hostvars['localhost'].prometheus_port) }} | Metrics storage & query |
          | Grafana | https://{{ hostvars['localhost'].grafana_domain }} | Dashboards & visualization |
          | Loki | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].loki_port }} | Log aggregation |
          | Dozzle | https://{{ hostvars['localhost'].dozzle_domain }} | Real-time log viewer |
          | Uptime Kuma | https://{{ hostvars['localhost'].uptime_domain }} | Service uptime tracking |

          ## Monitored Nodes

          ### Managers
          {% for host in groups['swarm_managers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
            - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
            - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}

          ### Workers
          {% for host in groups['swarm_workers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
            - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
            - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}

          ## Useful Prometheus Queries

          ```promql
          # Total cluster CPU usage
          100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

          # Memory usage per node
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

          # Container count per node
          count(container_last_seen) by (instance)

          # Network traffic by node
          rate(node_network_receive_bytes_total[5m])
          ```

          ## Troubleshooting

          ### Exporter not reachable
          ```bash
          # Check if container is running
          ansible swarm_hosts -i inventory/hosts.ini -m ansible.builtin.shell -a "docker ps | grep exporter"

          # Check firewall
          ansible swarm_hosts -i inventory/hosts.ini -m ansible.builtin.shell -a "ss -tlnp | grep -E '9100|8080'"
          ```

          ### Prometheus shows target down
          ```bash
          # Test from Watchtower
          curl http://<node-ip>:9100/metrics
          curl http://<node-ip>:8080/metrics
          ```

          ## Maintenance

          ### Update all monitoring components
          ```bash
          cd /home/chester/homelab/ansible
          ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
          ```

          ### View Prometheus configuration
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/prometheus.yml
          ```

          ### Check alert rules
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/alerts/homelab.yml
          ```
      register: docs_created

    # Only announces the path when the file content was actually rewritten.
    - name: Display documentation location
      ansible.builtin.debug:
        msg: "📚 Monitoring guide created at: {{ docs_created.dest }}"
      when: docs_created is changed