# Source: homelab/ansible/ansible-old/playbooks/monitoring/deploy_swarm_monitoring.yml (442 lines, 17 KiB, YAML)

---
# playbooks/monitoring/deploy_swarm_monitoring.yml
# Complete observability stack deployment for Docker Swarm cluster + standalone hosts
#
# === ARCHITECTURE OVERVIEW ===
# This playbook deploys a three-tier monitoring solution:
#
# TIER 1: Data Collection (Swarm Nodes + Standalone Docker Hosts)
# - node-exporter: Host metrics (CPU, RAM, disk, network) on swarm nodes and standalone hosts
# - cAdvisor: Container metrics (per-container resource usage) on swarm nodes only
#
# TIER 2: Aggregation & Storage (Watchtower)
# - Prometheus: Metrics time-series database
# - Loki: Log aggregation and indexing
#
# TIER 3: Visualization & Alerting (Watchtower)
# - Grafana: Dashboards and data exploration
# - Uptime Kuma: HTTP health checks
# - Dozzle: Real-time log viewer
#
# === PREREQUISITES ===
# - Docker Swarm cluster is initialized and running
# - All nodes are accessible via SSH
# - Docker is installed on all nodes (swarm + standalone hosts)
# - Authentik token is set in group_vars (for Dozzle auth)
#
# === USAGE ===
# Deploy full stack (swarm nodes, standalone hosts, and watchtower):
# ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
#
# Deploy only to swarm nodes:
# ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags swarm
#
# Deploy only to standalone docker hosts:
# ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags docker-hosts
#
# Deploy only watchtower stack:
# ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml --tags watchtower
# Play 1: run the exporter roles on every host in the `swarm_hosts` group.
- name: Deploy monitoring exporters on swarm nodes
  hosts: swarm_hosts
  gather_facts: true
  become: false
  tags:
    - swarm
    - exporters

  pre_tasks:
    # Read-only probe: never reports "changed", fails the host when the
    # docker CLI exits non-zero.
    - name: Verify Docker is installed
      ansible.builtin.command:
        cmd: docker --version
      register: docker_check
      failed_when: docker_check.rc != 0
      changed_when: false

    # Purely informational banner; the role label is derived from membership
    # in the `swarm_managers` inventory group.
    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying monitoring exporters to: {{ inventory_hostname }}"
          - " Role: {{ 'Manager' if inventory_hostname in groups['swarm_managers'] else 'Worker' }}"
          - " IP: {{ ansible_host }}"

  roles:
    # Each role carries its own tag so it can be selected individually.
    - role: swarm_node_exporter
      tags:
        - node-exporter
    - role: swarm_cadvisor
      tags:
        - cadvisor
# Play 2: deploy the Dozzle agent service, then probe the local exporter
# endpoints on each targeted host.
- name: Deploy Dozzle swarm agents
  hosts: swarm_managers
  become: false
  gather_facts: false
  tags: ['swarm', 'dozzle-agent']
  tasks:
    # The play targets every manager, but the role is only included on the
    # first host of the `swarm_managers` group.
    - name: Deploy and validate dozzle-agent service from primary manager
      ansible.builtin.include_role:
        name: swarm_dozzle_agent
      when: inventory_hostname == groups['swarm_managers'][0]
  post_tasks:
    # NOTE(review): this validation lives in the managers-only Dozzle play, so
    # exporters on worker nodes are never probed and `--tags exporters` skips
    # it. Confirm whether it was meant to be the post_tasks of the exporter
    # play on `swarm_hosts`.
    - name: Validate exporter endpoints
      ansible.builtin.uri:
        url: "{{ item.url }}"
        method: GET
        status_code: 200
      loop:
        - { name: "node-exporter", url: "http://localhost:9100/metrics" }
        - { name: "cAdvisor", url: "http://localhost:8080/metrics" }
      loop_control:
        label: "{{ item.name }}"
      register: endpoint_check
      # `retries`/`delay` are only guaranteed to take effect together with
      # `until`; without it some ansible-core releases run the probe once.
      # Inside a loop, `endpoint_check` refers to the current item's result.
      until: endpoint_check is succeeded
      retries: 3
      delay: 5
    - name: Display exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: All exporters are healthy"
# Play 3: install node-exporter on hosts in the `docker_hosts` group and
# confirm its metrics endpoint answers locally.
- name: Deploy node-exporter on standalone docker hosts
  hosts: docker_hosts
  become: false
  gather_facts: true
  tags: ['docker-hosts', 'exporters', 'node-exporter']
  pre_tasks:
    # Read-only probe: never reports "changed", fails on non-zero exit.
    - name: Verify Docker is installed
      ansible.builtin.command: docker --version
      register: docker_check
      changed_when: false
      failed_when: docker_check.rc != 0
    - name: Display deployment target
      ansible.builtin.debug:
        msg:
          - "🎯 Deploying node-exporter to standalone docker host: {{ inventory_hostname }}"
          - " IP: {{ ansible_host }}"
          - " Purpose: Hardware and software metrics collection"
  tasks:
    # The play itself runs unprivileged; `apply` raises privileges only for
    # the tasks pulled in from the role.
    # NOTE(review): `tags` is placed under `apply` here; the flattened source
    # does not show whether it belonged to `apply` or to the task - confirm.
    - name: Deploy node-exporter role with elevated privileges
      ansible.builtin.include_role:
        name: swarm_node_exporter
        apply:
          become: true
          tags: ['node-exporter']
  post_tasks:
    - name: Validate node-exporter endpoint
      ansible.builtin.uri:
        url: "http://localhost:9100/metrics"
        method: GET
        status_code: 200
      register: exporter_check
      # `retries`/`delay` are only guaranteed to take effect together with
      # `until`; without it some ansible-core releases probe exactly once,
      # which can fail while the exporter is still starting.
      until: exporter_check is succeeded
      retries: 3
      delay: 5
    - name: Display node-exporter status
      ansible.builtin.debug:
        msg: "✅ {{ inventory_hostname }}: node-exporter deployed and healthy on port 9100"
# Play 4: resolve secrets, run the `monitoring_stack` role against the
# `watchtower` host over a local connection, wait for Prometheus, reconcile
# the Traefik backend URLs stored in Redis on the edge host, and print a
# summary.
- name: Deploy monitoring stack on Watchtower
  hosts: watchtower
  connection: local
  become: false
  gather_facts: true
  tags: ['watchtower', 'stack']
  vars:
    # Canonical encrypted vars location (ADR-008)
    vault_encrypted_vars_file: "{{ playbook_dir }}/../../group_vars/vault/all.yml"
    # SSH target used for the `docker exec redis redis-cli` calls below.
    # Taken from `edge_routing.edge_host.ip` when defined, otherwise the
    # literal fallback address. Defined once so the write and the read-back
    # always address the same host.
    watchtower_edge_host_ip: "{{ (edge_routing | default({})).get('edge_host', {}).get('ip', '10.0.0.151') }}"
  pre_tasks:
    - name: Check vault encrypted vars file state
      ansible.builtin.stat:
        path: "{{ vault_encrypted_vars_file }}"
      register: vault_vars_file_state
    # Loaded under the `vault_vars` namespace so the lookups below can test
    # for individual keys; skipped entirely when the file is absent.
    - name: Load encrypted vars when present
      ansible.builtin.include_vars:
        file: "{{ vault_encrypted_vars_file }}"
        name: vault_vars
      when: vault_vars_file_state.stat.exists
      no_log: true
    # Resolution order per secret: vault file first, then the fallback shown
    # in each expression (existing variable, `secrets` mapping, or environment
    # variable). Every secret finally collapses to '' when empty or undefined.
    - name: Resolve monitoring secrets from vault or environment fallback
      ansible.builtin.set_fact:
        grafana_admin_password: >-
          {{
            (
              vault_vars.vault_grafana_admin_password
              if (vault_vars is defined and 'vault_grafana_admin_password' in vault_vars)
              else (grafana_admin_password | default(''))
            ) | default('', true)
          }}
        authentik_outpost_dozzle_token: >-
          {{
            (
              vault_vars.vault_authentik_outpost_dozzle_token
              if (vault_vars is defined and 'vault_authentik_outpost_dozzle_token' in vault_vars)
              else (
                secrets.AUTHENTIK_OUTPOST_DOZZLE_TOKEN
                if (secrets is defined and 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN' in secrets)
                else lookup('env', 'AUTHENTIK_OUTPOST_DOZZLE_TOKEN')
              )
            ) | default('', true)
          }}
        pve_exporter_token: >-
          {{
            (
              vault_vars.vault_pve_exporter_token
              if (vault_vars is defined and 'vault_pve_exporter_token' in vault_vars)
              else lookup('env', 'PVE_EXPORTER_TOKEN')
            ) | default('', true)
          }}
      no_log: true
    - name: Verify Docker Compose V2 is available
      ansible.builtin.command: docker compose version
      register: compose_check
      changed_when: false
      failed_when: compose_check.rc != 0
    - name: Display Watchtower deployment info
      ansible.builtin.debug:
        msg:
          - "🏗️ Deploying monitoring stack to Watchtower"
          - " Swarm targets: {{ groups['swarm_managers'] | length }} managers + {{ groups['swarm_workers'] | length }} workers"
          - " Standalone hosts: {{ groups['docker_hosts'] | length }} (node-exporter)"
          - " Total monitored nodes: {{ groups['swarm_hosts'] | length + groups['docker_hosts'] | length + 1 }} (including Watchtower)"
  roles:
    - role: monitoring_stack
  post_tasks:
    # The Prometheus checks run on a full deployment, or on a focused
    # deployment whose focus service is 'prometheus'.
    - name: Wait for Prometheus to be ready
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/-/ready"
        method: GET
        status_code: 200
      register: prometheus_ready
      # `retries`/`delay` are only guaranteed to take effect together with
      # `until`; without it some ansible-core releases probe exactly once, so
      # this task would not actually wait.
      until: prometheus_ready is succeeded
      retries: 10
      delay: 5
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')
    # NOTE(review): despite the name, this only fetches the targets API and
    # registers the response; nothing inspects target health afterwards.
    - name: Verify Prometheus can scrape all targets
      ansible.builtin.uri:
        url: "http://{{ watchtower_ip }}:{{ prometheus_host_port }}/api/v1/targets"
        method: GET
        return_content: true
      register: prometheus_targets
      until: prometheus_targets is succeeded
      retries: 3
      delay: 10
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')
    # grafana, uptime and portainer are always listed; dozzle and the
    # authentik outpost are appended only when their feature flags are on.
    - name: Build watchtower edge route backend reconciliation list
      ansible.builtin.set_fact:
        watchtower_edge_route_backends: >-
          {{
            [
              {'name': 'grafana', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (grafana_port | string)},
              {'name': 'uptime', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (uptime_kuma_port | string)}
            ]
            +
            (
              [
                {'name': 'dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (dozzle_port | string)}
              ]
              if (monitoring_enable_dozzle | default(false) | bool) and (dozzle_expose_via_traefik | default(false) | bool)
              else []
            )
            +
            (
              [
                {'name': 'authentik-outpost-dozzle', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (authentik_outpost_port | string)}
              ]
              if monitoring_enable_authentik_outpost | default(false) | bool
              else []
            )
            +
            [
              {'name': 'portainer', 'url': 'http://' ~ watchtower_ip ~ ':' ~ (portainer_http_port | string)}
            ]
          }}
    # Unconditional overwrite of each backend URL key; always reported as
    # changed because the previous value is not read first.
    - name: Reconcile watchtower service backends in Redis edge routing
      ansible.builtin.command: >-
        ssh {{ watchtower_edge_host_ip }}
        sudo docker exec redis redis-cli SET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
        {{ item.url }}
      changed_when: true
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }} -> {{ item.url }}"
    - name: Verify reconciled watchtower service backends in Redis
      ansible.builtin.command: >-
        ssh {{ watchtower_edge_host_ip }}
        sudo docker exec redis redis-cli GET
        traefik/http/services/{{ item.name }}/loadBalancer/servers/0/url
      register: watchtower_route_backend_reads
      changed_when: false
      loop: "{{ watchtower_edge_route_backends }}"
      loop_control:
        label: "{{ item.name }}"
    # Each result's `item.item` is the backend entry that was read back, so
    # the stored value is compared with the URL that was just written.
    - name: Assert watchtower service backends are reconciled to host IP routes
      ansible.builtin.assert:
        that:
          - item.stdout == item.item.url
        fail_msg: >-
          Edge route drift persisted for {{ item.item.name }}.
          Expected {{ item.item.url }}, got {{ item.stdout | default('') }}.
        success_msg: >-
          Edge route {{ item.item.name }} correctly reconciled to {{ item.item.url }}.
      loop: "{{ watchtower_route_backend_reads.results }}"
      loop_control:
        label: "{{ item.item.name }}"
    - name: Display monitoring stack summary
      ansible.builtin.debug:
        msg:
          - "╔════════════════════════════════════════════════════════╗"
          - "║ 🎉 SWARM MONITORING STACK DEPLOYED SUCCESSFULLY! ║"
          - "╚════════════════════════════════════════════════════════╝"
          - ""
          - "📊 METRICS & DASHBOARDS:"
          - " Prometheus: http://{{ watchtower_ip }}:{{ prometheus_host_port }}"
          - " Grafana: https://{{ grafana_domain }}"
          - ""
          - "📋 LOGS:"
          - " Dozzle: https://{{ dozzle_domain }}"
          - " Loki API: http://{{ watchtower_ip }}:{{ loki_port }}"
          - ""
          - "✅ UPTIME:"
          - " Uptime Kuma: https://{{ uptime_domain }}"
          - ""
          - "🔍 NEXT STEPS:"
          - " 1. Open Grafana: https://{{ grafana_domain }}"
          - " 2. Verify provisioned data sources: {{ grafana_prometheus_datasource_name }} + {{ grafana_loki_datasource_name }}"
          - " 3. Review the provisioned dashboard folder: {{ grafana_dashboards_folder }}"
          - " 4. Optionally import extra dashboards: 1860, 893, 13639, 10347"
          - " 5. Configure Uptime Kuma health checks for swarm services"
          - ""
          - "📚 CONCEPTS YOU LEARNED:"
          - " ✓ Multi-tier monitoring architecture"
          - " ✓ Prometheus service discovery & scraping"
          - " ✓ Loki label-based log indexing"
          - " ✓ Ansible roles for modular infrastructure"
          - " ✓ Idempotent deployment (run this playbook anytime!)"
      when: not (monitoring_focus_mode | default(false) | bool) or (monitoring_focus_service | default('') == 'prometheus')
    # Shown instead of the full summary when focus mode targets any service
    # other than 'prometheus'.
    - name: Display focused deployment summary
      ansible.builtin.debug:
        msg:
          - "Focused deployment completed"
          - "Service: {{ monitoring_focus_service | default('not-set') }}"
          - "Mode: additive (existing running services preserved)"
      when: monitoring_focus_mode | default(false) | bool and (monitoring_focus_service | default('') != 'prometheus')
# Play 5: render a Markdown quick-reference guide on the control node from
# inventory data and write it under the repository's documentation directory.
- name: Generate monitoring documentation
  hosts: localhost
  connection: local
  # Facts must be gathered: the template below reads
  # `ansible_date_time.iso8601`, which only exists after fact gathering on
  # localhost (no earlier play targets localhost, and `--tags docs` runs this
  # play alone).
  gather_facts: true
  tags: ['docs']
  run_once: true
  tasks:
    # The embedded timestamp changes on every run, so this task reports
    # "changed" each time the playbook is executed.
    # NOTE(review): the Prometheus row uses `prometheus_port`, while the
    # Watchtower play builds its URLs from `prometheus_host_port` - confirm
    # which port is published on the host.
    - name: Create monitoring quick-reference guide
      ansible.builtin.copy:
        dest: "{{ playbook_dir }}/../../documentation/swarm-monitoring-guide.md"
        mode: '0644'
        content: |
          # Docker Swarm Monitoring Guide
          **Deployed:** {{ ansible_date_time.iso8601 }}
          **Cluster:** {{ groups['swarm_hosts'] | length }} nodes ({{ groups['swarm_managers'] | length }} managers, {{ groups['swarm_workers'] | length }} workers)
          ## Quick Access
          | Service | URL | Purpose |
          |---------|-----|---------|
          | Prometheus | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].prometheus_port }} | Metrics storage & query |
          | Grafana | https://{{ hostvars['localhost'].grafana_domain }} | Dashboards & visualization |
          | Loki | http://{{ hostvars['localhost'].watchtower_ip }}:{{ hostvars['localhost'].loki_port }} | Log aggregation |
          | Dozzle | https://{{ hostvars['localhost'].dozzle_domain }} | Real-time log viewer |
          | Uptime Kuma | https://{{ hostvars['localhost'].uptime_domain }} | Service uptime tracking |
          ## Monitored Nodes
          ### Managers
          {% for host in groups['swarm_managers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
          - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
          - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}
          ### Workers
          {% for host in groups['swarm_workers'] %}
          - **{{ host }}** ({{ hostvars[host].ansible_host }})
          - node-exporter: http://{{ hostvars[host].ansible_host }}:9100/metrics
          - cAdvisor: http://{{ hostvars[host].ansible_host }}:8080/metrics
          {% endfor %}
          ## Useful Prometheus Queries
          ```promql
          # Total cluster CPU usage
          100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
          # Memory usage per node
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
          # Container count per node
          count(container_last_seen) by (instance)
          # Network traffic by node
          rate(node_network_receive_bytes_total[5m])
          ```
          ## Troubleshooting
          ### Exporter not reachable
          ```bash
          # Check if container is running
          ansible swarm_hosts -i inventory/hosts.ini -a "docker ps | grep exporter"
          # Check firewall
          ansible swarm_hosts -i inventory/hosts.ini -a "ss -tlnp | grep -E '9100|8080'"
          ```
          ### Prometheus shows target down
          ```bash
          # Test from Watchtower
          curl http://<node-ip>:9100/metrics
          curl http://<node-ip>:8080/metrics
          ```
          ## Maintenance
          ### Update all monitoring components
          ```bash
          cd /home/chester/homelab/ansible
          ansible-playbook -i inventory/hosts.ini playbooks/monitoring/deploy_swarm_monitoring.yml
          ```
          ### View Prometheus configuration
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/prometheus.yml
          ```
          ### Check alert rules
          ```bash
          cat /opt/stacks/watchtower/prometheus-config/alerts/homelab.yml
          ```
      register: docs_created
    - name: Display documentation location
      ansible.builtin.debug:
        msg: "📚 Monitoring guide created at: {{ docs_created.dest }}"
      when: docs_created.changed