# Path: homelab/ansible/archive/templates/stacks/traefik-kop.stack.yml

# traefik-kop Swarm stack
# Managed by Ansible — manual edits will be overwritten on next deploy.
# Source vars: group_vars/all.yml (edge_routing.swarm.*)
# Deploy via: ansible-playbook playbooks/docker/deploy_traefik_kop.yml
#
# WHAT THIS DOES:
# Runs as a Swarm service on a manager node. Reads Docker service labels
# (traefik.enable=true etc.) from Swarm services and publishes routing
# rules into the Redis instance on Heimdall ({{ edge_routing.integration.redis_addr }}).
# Traefik then picks up these routes from Redis automatically.
#
# NETWORK NOTE:
# proxy-net here is a Swarm overlay network — distinct from the bridge
# network of the same name on Heimdall. The overlay allows future Swarm
# services to declare `networks: [proxy-net]` and be discoverable by kop.
version: "3.9"

services:
  # Single agent service: reads Swarm service labels through the Docker
  # socket and publishes the resulting routes to Redis (REDIS_ADDR).
  traefik-kop:
    image: "{{ edge_routing.integration.agent_image }}"
    volumes:
      # WHY :ro — kop only reads Swarm service state, never modifies Docker.
      # NOTE(review): :ro only stops the container from replacing or removing
      # the socket file; requests sent to the Docker API over the socket are
      # not restricted by it. Treat this as a minimal safeguard and verify
      # against Docker's daemon-socket guidance if stricter isolation is
      # needed (e.g. a filtering socket proxy).
      - /var/run/docker.sock:/var/run/docker.sock:ro
    environment:
      # Templated values are quoted so an empty or special-character
      # expansion cannot change how YAML parses the entry.
      - "REDIS_ADDR={{ edge_routing.integration.redis_addr }}"
      # WHY BIND_IP is a Swarm node IP (not Heimdall):
      # kop writes "route traffic for <service> to BIND_IP:<published-port>".
      # The Swarm routing mesh makes published ports available on ALL nodes,
      # so Traefik sends the request here and the mesh handles the rest.
      - "BIND_IP={{ edge_routing.swarm.bind_ip }}"
    networks:
      - proxy-net
    deploy:
      replicas: 1
      placement:
        constraints:
          # WHY manager only: only manager nodes hold full Swarm Raft state.
          # A worker node has an incomplete view of all services and their
          # labels.
          - node.role == manager
      # WHY on-failure (not always): avoids rapid reconnect storms
      # against Redis during a network partition.
      restart_policy:
        condition: on-failure
        delay: 5s
        max_attempts: 3
        window: 30s
      # WHY start-first: new task starts before old one stops, giving
      # zero downtime. Rollback triggers if monitoring detects failure.
      update_config:
        parallelism: 1
        order: start-first
        failure_action: rollback
        delay: 10s
        monitor: 30s
      # Rollbacks replace one task at a time, stopping the failed task
      # before starting the previous version.
      rollback_config:
        parallelism: 1
        order: stop-first
networks:
  # Stack-local alias "proxy-net" maps to the real overlay network whose
  # name comes from edge_routing.swarm.proxy_network.
  proxy-net:
    # WHY external: this overlay network is pre-created in the deploy playbook
    # so future Swarm service stacks can also join it without stack coupling.
    external: true
    name: "{{ edge_routing.swarm.proxy_network }}"