feat: add onboarding and setup for AI nodes with NVIDIA runtime configuration

This commit is contained in:
Nathan 2026-04-21 12:01:58 -04:00
parent 689d5a3710
commit e61bf3d5c7
19 changed files with 395 additions and 0 deletions

View File

@ -33,6 +33,9 @@ heimdall ansible_host=10.0.0.151 ansible_user=chester
waldorf ansible_host=10.0.0.251 ansible_user=chester waldorf ansible_host=10.0.0.251 ansible_user=chester
ai-p410 ansible_host=10.0.0.202 ansible_user=chester ai-p410 ansible_host=10.0.0.202 ansible_user=chester
[ai_nodes]
ai-p410 ansible_host=10.0.0.202 ansible_user=chester
[raspberry_pi] [raspberry_pi]
watchtower ansible_host=10.0.0.200 ansible_user=chester watchtower ansible_host=10.0.0.200 ansible_user=chester

View File

@ -0,0 +1,11 @@
---
# Playbook: baseline onboarding for every host in the `ai_nodes` group.
# Applies the ai_node_onboarding role with privilege escalation; facts are
# gathered because the role asserts on ansible_os_family / ansible_memtotal_mb.
#
# Usage: ansible-playbook playbooks/onboard-ai-node.yml -K --limit ai-p410
- name: Onboard and baseline AI nodes
  hosts: ai_nodes
  become: true
  gather_facts: true
  roles:
    - role: ai_node_onboarding

View File

@ -0,0 +1,11 @@
---
# Playbook: NVIDIA driver/runtime configuration for hosts in `ai_nodes`.
# Applies the nvidia_runtime_setup role with privilege escalation; facts are
# gathered because the role branches on ansible_distribution.
#
# Usage: ansible-playbook playbooks/setup-ai-gpu-runtime.yml -K --limit ai-p410
- name: Configure NVIDIA runtime on AI nodes
  hosts: ai_nodes
  become: true
  gather_facts: true
  roles:
    - role: nvidia_runtime_setup

View File

@ -0,0 +1,36 @@
---
# Defaults for the ai_node_onboarding role.

# When true, the GPU checks hard-fail if nvidia-smi cannot be run.
# Leave false to only warn (the usual case before drivers are installed).
ai_node_require_nvidia_tooling: false

# Baseline apt packages for observability and build workloads on AI nodes.
ai_node_base_packages:
  - ca-certificates
  - curl
  - git
  - htop
  - nvtop
  - pciutils
  - python3
  - python3-pip
  - python3-venv
  - tmux

# Conservative kernel tuning for mixed service + AI workloads.
# Values are quoted so they reach the sysctl module as strings.
ai_node_sysctl:
  vm.swappiness: '10'
  vm.max_map_count: '262144'

# Directories created for AI workloads; keep models/data on persistent storage.
# Modes are quoted strings so YAML never interprets them as numbers.
ai_node_directories:
  - path: /srv/ai
    owner: root
    group: root
    mode: '0755'
  - path: /srv/ai/models
    owner: root
    group: root
    mode: '0755'
  - path: /srv/ai/workspaces
    owner: root
    group: root
    mode: '0775'

View File

@ -0,0 +1,27 @@
---
# GPU readiness checks for the ai_node_onboarding role.
#
# Registers:  ai_node_nvidia_smi  (raw command result)
# Sets fact:  ai_node_gpu_lines   (only when nvidia-smi succeeded)
# Reads:      ai_node_require_nvidia_tooling

# Read-only query. `check_mode: false` makes it run under --check as well;
# otherwise the task is skipped there and the registered result has no `rc`,
# which breaks every condition below. `failed_when: false` keeps a missing or
# failing nvidia-smi from aborting the play at this point.
- name: Check nvidia-smi availability
  ansible.builtin.command:
    argv:
      - nvidia-smi
      - --query-gpu=name,memory.total,driver_version
      - --format=csv,noheader
  register: ai_node_nvidia_smi
  changed_when: false
  failed_when: false
  check_mode: false

# `rc | default(1)` treats a result without a return code as "not ready".
- name: Optionally fail when NVIDIA tooling is required but unavailable
  ansible.builtin.fail:
    msg: >-
      NVIDIA GPU tooling is unavailable. Install a compatible NVIDIA driver and
      nvidia-utils package, then re-run onboarding.
  when:
    - ai_node_require_nvidia_tooling | bool
    - (ai_node_nvidia_smi.rc | default(1)) != 0

- name: Warn when nvidia-smi is unavailable
  ansible.builtin.debug:
    msg: >-
      nvidia-smi is not available yet. This is common on fresh hosts before driver install.
      Continue onboarding now, then install validated drivers separately.
  when: (ai_node_nvidia_smi.rc | default(1)) != 0

# One CSV line per GPU: name, total memory, driver version.
- name: Capture GPU info lines
  ansible.builtin.set_fact:
    ai_node_gpu_lines: "{{ ai_node_nvidia_smi.stdout_lines | default([]) }}"
  when: (ai_node_nvidia_smi.rc | default(1)) == 0

View File

@ -0,0 +1,15 @@
---
# Entry point for the ai_node_onboarding role.
# Task files are imported statically and run in the order listed below.

- name: Validate AI node prerequisites
  ansible.builtin.import_tasks:
    file: validate.yml

- name: Install baseline packages
  ansible.builtin.import_tasks:
    file: prereqs.yml

- name: Apply kernel tuning
  ansible.builtin.import_tasks:
    file: tuning.yml

- name: Run GPU readiness checks
  ansible.builtin.import_tasks:
    file: gpu_checks.yml

- name: Show onboarding summary
  ansible.builtin.import_tasks:
    file: summary.yml

View File

@ -0,0 +1,15 @@
---
# Baseline packages and directory layout for AI nodes.
# Reads: ai_node_base_packages, ai_node_directories

# The whole package list goes to apt in one call, after refreshing the cache.
- name: Install AI-node baseline packages
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: "{{ ai_node_base_packages }}"

# Each entry supplies path, owner, group and mode for one directory.
- name: Ensure AI workload directories exist
  loop: "{{ ai_node_directories }}"
  ansible.builtin.file:
    state: directory
    path: "{{ item.path }}"
    owner: "{{ item.owner }}"
    group: "{{ item.group }}"
    mode: "{{ item.mode }}"

View File

@ -0,0 +1,12 @@
---
# Final report for the ai_node_onboarding role.
# Every value produced by an earlier task carries a default, so the summary
# still renders when that task was skipped or left the variable unset.
- name: Summarize AI-node onboarding results
  ansible.builtin.debug:
    msg:
      - "=========================================="
      - "AI node onboarding complete for {{ inventory_hostname }}"
      - "=========================================="
      - "RAM (MB): {{ ansible_memtotal_mb }}"
      - "NVIDIA detected via lspci: {{ ai_node_has_nvidia_gpu | default(false) }}"
      # A missing return code is reported as "not ready".
      - "nvidia-smi ready: {{ (ai_node_nvidia_smi.rc | default(1)) == 0 }}"
      - "GPU details: {{ ai_node_gpu_lines | default(['not available']) | join('; ') }}"
      - "AI directories: {{ ai_node_directories | map(attribute='path') | list | join(', ') }}"

View File

@ -0,0 +1,8 @@
---
# Kernel tuning for AI nodes.
# Reads: ai_node_sysctl (mapping of sysctl name -> value)

# dict2items turns the mapping into a list of {key, value} pairs so each
# setting is applied as its own loop iteration.
- name: Apply sysctl settings for AI workloads
  loop: "{{ ai_node_sysctl | dict2items }}"
  ansible.posix.sysctl:
    state: present
    reload: true
    name: "{{ item.key }}"
    value: "{{ item.value }}"

View File

@ -0,0 +1,26 @@
---
# Pre-flight validation for the ai_node_onboarding role.
#
# Hard requirements: Debian-family OS, enough usable RAM.
# Advisory only:     NVIDIA GPU visible on the PCI bus.
#
# Registers:  ai_node_lspci
# Sets fact:  ai_node_has_nvidia_gpu
# Reads:      ai_node_min_memory_mb (optional override, in MB)

- name: Assert supported operating system family
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
    fail_msg: "ai_node_onboarding currently supports Debian/Ubuntu only."

# ansible_memtotal_mb is the kernel's MemTotal, which is always somewhat below
# the installed RAM (firmware/kernel reservations). Comparing against 16384
# would reject real 16 GB hosts, so the default floor is 15 GiB. Override with
# ai_node_min_memory_mb when a different floor is wanted.
- name: Assert minimum RAM for AI node profile
  ansible.builtin.assert:
    that:
      - (ansible_memtotal_mb | int) >= (ai_node_min_memory_mb | default(15360) | int)
    fail_msg: >-
      AI node profile expects at least 16 GB RAM
      (detected {{ ansible_memtotal_mb }} MB usable, minimum
      {{ ai_node_min_memory_mb | default(15360) }} MB).

# This file is imported before prereqs.yml, which is where pciutils (lspci)
# gets installed, so lspci can be missing on a fresh host. The check is
# advisory, so a missing or failing lspci must not abort the play.
# `check_mode: false` lets this read-only command run under --check too.
- name: Detect NVIDIA GPU via lspci
  ansible.builtin.command: lspci
  register: ai_node_lspci
  changed_when: false
  failed_when: false
  check_mode: false

- name: Derive GPU detection flag
  ansible.builtin.set_fact:
    ai_node_has_nvidia_gpu: "{{ 'NVIDIA' in (ai_node_lspci.stdout | default('')) }}"

- name: Warn when lspci could not be run
  ansible.builtin.debug:
    msg: >-
      lspci is unavailable or failed, so GPU detection was skipped.
      lspci is provided by the pciutils package.
  when: (ai_node_lspci.rc | default(1)) != 0

# Only claim "no GPU" when lspci actually produced a device listing.
- name: Warn when no NVIDIA GPU is detected
  ansible.builtin.debug:
    msg: "No NVIDIA GPU was detected via lspci; continuing because this check is advisory."
  when:
    - (ai_node_lspci.rc | default(1)) == 0
    - not ai_node_has_nvidia_gpu | bool

View File

@ -0,0 +1,31 @@
# nvidia_runtime_setup
Ansible role to configure NVIDIA driver/runtime readiness on Debian-family hosts.
## What it does
- Detects NVIDIA GPU hardware via `lspci`
- Auto-selects a recommended driver on Ubuntu (or uses an explicit package pin)
- Installs the NVIDIA driver package
- Optionally installs CUDA toolkit and NVIDIA container toolkit
- Handles optional reboot logic
- Verifies readiness with `nvidia-smi`
## Safe defaults
- Reboot is disabled by default (`nvidia_runtime_reboot_if_needed: false`)
- CUDA and container toolkit installs are disabled by default
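- Validation is enabled by default and fails if `nvidia-smi` is unavailable; right after a fresh driver install this usually means a reboot is still pending (automatic reboot is disabled by default) or the selected driver is incompatible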
## Example
```yaml
---
- name: Configure NVIDIA runtime for AI nodes
hosts: ai_nodes
become: true
roles:
- role: nvidia_runtime_setup
vars:
nvidia_runtime_reboot_if_needed: true
```

View File

@ -0,0 +1,25 @@
---
# Defaults for the nvidia_runtime_setup role.

# Abort the role when lspci shows no NVIDIA hardware.
nvidia_runtime_require_gpu: true

# Install (or repair) the NVIDIA driver package.
nvidia_runtime_install_driver: true

# Explicit driver package pin, e.g. nvidia-driver-550.
# Leave empty on Ubuntu to let the role auto-detect the recommended package.
nvidia_runtime_driver_package: ''

# CUDA toolkit from the distro repository (off by default).
nvidia_runtime_install_cuda_toolkit: false
nvidia_runtime_cuda_package: nvidia-cuda-toolkit

# NVIDIA container toolkit, if the configured repos provide it (off by default).
nvidia_runtime_install_container_toolkit: false
nvidia_runtime_container_toolkit_package: nvidia-container-toolkit

# Reboot handling: whether an automatic reboot is allowed, and the value
# passed to the reboot task as reboot_timeout.
nvidia_runtime_reboot_if_needed: false
nvidia_runtime_reboot_timeout: 900

# Fail the role when nvidia-smi does not work after installation.
nvidia_runtime_validate_after_install: true

View File

@ -0,0 +1,64 @@
---
# Hardware detection and driver selection for the nvidia_runtime_setup role.
#
# Registers:  nvidia_runtime_lspci, nvidia_runtime_ubuntu_drivers
# Sets facts: nvidia_runtime_has_gpu, nvidia_runtime_selected_driver
# Reads:      nvidia_runtime_require_gpu, nvidia_runtime_driver_package,
#             nvidia_runtime_install_driver

- name: Ensure hardware detection utilities are present
  ansible.builtin.apt:
    name:
      - pciutils
      - ubuntu-drivers-common
    state: present
    update_cache: true
  when: ansible_distribution == "Ubuntu"

- name: Ensure hardware detection utilities are present (non-Ubuntu)
  ansible.builtin.apt:
    name:
      - pciutils
    state: present
    update_cache: true
  when: ansible_distribution != "Ubuntu"

# Read-only command; `check_mode: false` lets it run under --check so the
# registered result always carries stdout.
- name: Detect PCI devices
  ansible.builtin.command: lspci
  register: nvidia_runtime_lspci
  changed_when: false
  check_mode: false

- name: Set hardware detection fact
  ansible.builtin.set_fact:
    nvidia_runtime_has_gpu: "{{ 'NVIDIA' in (nvidia_runtime_lspci.stdout | default('')) }}"

- name: Stop when GPU is required but missing
  ansible.builtin.fail:
    msg: "No NVIDIA GPU detected on this host."
  when:
    - nvidia_runtime_require_gpu | bool
    - not nvidia_runtime_has_gpu | bool

# Only consulted when no explicit package pin was supplied. A failure is
# tolerated here; the selection below then ends up empty and is rejected by
# the validation task at the end of this file.
- name: Detect recommended Ubuntu NVIDIA driver
  ansible.builtin.command: ubuntu-drivers devices
  register: nvidia_runtime_ubuntu_drivers
  changed_when: false
  failed_when: false
  check_mode: false
  when:
    - ansible_distribution == "Ubuntu"
    - nvidia_runtime_driver_package | length == 0

# Selection order:
#   1. the explicit pin, if set;
#   2. the driver on the output line flagged "recommended" (full package
#      name, including suffixes such as -server or -open);
#   3. the first nvidia-driver-<N> mentioned anywhere in the output.
# Taking only the first match would pick whichever driver happens to be
# listed first, which need not be the recommended one.
# NOTE(review): assumes `ubuntu-drivers devices` prints lines shaped like
# "driver : nvidia-driver-535 - distro non-free recommended" - confirm on the
# target releases.
# `default('', true)` also replaces the None that regex_search returns on no
# match, so the `| length` test below always receives a string.
- name: Derive auto-selected driver package
  vars:
    nvidia_runtime_drivers_output: "{{ nvidia_runtime_ubuntu_drivers.stdout | default('') }}"
    nvidia_runtime_recommended_drivers: >-
      {{
        nvidia_runtime_drivers_output
        | regex_findall('(nvidia-driver-[0-9]+(?:-[a-z]+)*) +- .*recommended')
      }}
  ansible.builtin.set_fact:
    nvidia_runtime_selected_driver: >-
      {{
        nvidia_runtime_driver_package
        if (nvidia_runtime_driver_package | length > 0)
        else (
          (nvidia_runtime_recommended_drivers | first)
          if (nvidia_runtime_recommended_drivers | length > 0)
          else (
            nvidia_runtime_drivers_output
            | regex_search('nvidia-driver-[0-9]+')
            | default('', true)
          )
        )
      }}

- name: Validate selected driver package
  ansible.builtin.fail:
    msg: >-
      Could not determine an NVIDIA driver package automatically.
      Set nvidia_runtime_driver_package explicitly.
  when:
    - nvidia_runtime_install_driver | bool
    - nvidia_runtime_selected_driver | length == 0

View File

@ -0,0 +1,19 @@
---
# Package installation for the nvidia_runtime_setup role.
# Each task is gated by its own toggle; only the driver task refreshes the
# apt cache.
# Reads: nvidia_runtime_selected_driver, nvidia_runtime_cuda_package,
#        nvidia_runtime_container_toolkit_package and the three install toggles

- name: Install NVIDIA driver package
  when: nvidia_runtime_install_driver | bool
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: "{{ nvidia_runtime_selected_driver }}"

- name: Install CUDA toolkit package
  when: nvidia_runtime_install_cuda_toolkit | bool
  ansible.builtin.apt:
    state: present
    name: "{{ nvidia_runtime_cuda_package }}"

- name: Install NVIDIA container toolkit package
  when: nvidia_runtime_install_container_toolkit | bool
  ansible.builtin.apt:
    state: present
    name: "{{ nvidia_runtime_container_toolkit_package }}"

View File

@ -0,0 +1,18 @@
---
# Entry point for the nvidia_runtime_setup role.
# Task files are imported statically and run in the order listed below.

- name: Validate role inputs
  ansible.builtin.import_tasks:
    file: validate.yml

- name: Detect NVIDIA hardware and tooling
  ansible.builtin.import_tasks:
    file: detect.yml

- name: Install driver and optional runtime packages
  ansible.builtin.import_tasks:
    file: install.yml

- name: Handle reboot requirements
  ansible.builtin.import_tasks:
    file: reboot.yml

- name: Validate NVIDIA runtime state
  ansible.builtin.import_tasks:
    file: verify.yml

- name: Print runtime summary
  ansible.builtin.import_tasks:
    file: summary.yml

View File

@ -0,0 +1,22 @@
---
# Reboot handling for the nvidia_runtime_setup role.
# Registers: nvidia_runtime_reboot_required (stat of the reboot marker file)
# Reads:     nvidia_runtime_reboot_if_needed, nvidia_runtime_reboot_timeout

# The presence of this marker file is the only reboot signal consulted.
- name: Check whether reboot is required
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: nvidia_runtime_reboot_required

# Marker present but automatic reboot not allowed: tell the operator.
- name: Warn when reboot is required but disabled
  when: >-
    nvidia_runtime_reboot_required.stat.exists
    and not (nvidia_runtime_reboot_if_needed | bool)
  ansible.builtin.debug:
    msg: >-
      NVIDIA packages were installed but reboot is required.
      Set nvidia_runtime_reboot_if_needed=true to allow automatic reboot.

# Marker present and automatic reboot allowed: reboot the host.
- name: Reboot host when required and enabled
  when: >-
    nvidia_runtime_reboot_required.stat.exists
    and (nvidia_runtime_reboot_if_needed | bool)
  ansible.builtin.reboot:
    reboot_timeout: "{{ nvidia_runtime_reboot_timeout }}"
    msg: "Reboot triggered by nvidia_runtime_setup role"

View File

@ -0,0 +1,12 @@
---
# Final report for the nvidia_runtime_setup role.
# Every value produced by an earlier task carries a default, so the summary
# still renders when that task was skipped or left the variable unset.
- name: Print NVIDIA runtime summary
  ansible.builtin.debug:
    msg:
      - "=========================================="
      - "NVIDIA runtime setup complete for {{ inventory_hostname }}"
      - "=========================================="
      - "GPU detected via lspci: {{ nvidia_runtime_has_gpu | default(false) }}"
      - "Driver package selected: {{ nvidia_runtime_selected_driver | default('not set') }}"
      - "Reboot required: {{ nvidia_runtime_reboot_required.stat.exists | default(false) }}"
      # A missing return code is reported as "not ready".
      - "nvidia-smi ready: {{ (nvidia_runtime_smi.rc | default(1)) == 0 }}"
      - "GPU details: {{ nvidia_runtime_gpu_lines | default(['not available']) | join('; ') }}"

View File

@ -0,0 +1,20 @@
---
# Input validation for the nvidia_runtime_setup role.
# Reads: nvidia_runtime_install_driver, nvidia_runtime_driver_package,
#        nvidia_runtime_install_cuda_toolkit, nvidia_runtime_cuda_package,
#        nvidia_runtime_install_container_toolkit,
#        nvidia_runtime_container_toolkit_package

- name: Assert supported OS family
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
    fail_msg: "nvidia_runtime_setup currently supports Debian-family distributions only."

# Auto-detection only exists on Ubuntu, so other distributions need a pin.
# The pin is only needed when a driver will actually be installed, so this
# is skipped when nvidia_runtime_install_driver is false.
- name: Assert explicit driver package on non-Ubuntu systems
  ansible.builtin.assert:
    that:
      - ansible_distribution == "Ubuntu" or nvidia_runtime_driver_package | length > 0
    fail_msg: >-
      On non-Ubuntu systems set nvidia_runtime_driver_package explicitly to a valid package name.
  when: nvidia_runtime_install_driver | bool

# `| bool` matches how these toggles are evaluated elsewhere in the role and
# keeps string values such as "false" (e.g. from -e on the command line)
# from being treated as truthy.
- name: Assert optional package names are not empty when enabled
  ansible.builtin.assert:
    that:
      - >-
        not (nvidia_runtime_install_cuda_toolkit | bool)
        or nvidia_runtime_cuda_package | length > 0
      - >-
        not (nvidia_runtime_install_container_toolkit | bool)
        or nvidia_runtime_container_toolkit_package | length > 0
    fail_msg: "Optional package toggles are enabled but package names are missing."

View File

@ -0,0 +1,20 @@
---
# Post-install verification for the nvidia_runtime_setup role.
# Registers: nvidia_runtime_smi       (raw command result)
# Sets fact: nvidia_runtime_gpu_lines (only when nvidia-smi succeeded)
# Reads:     nvidia_runtime_validate_after_install

# Read-only query. `check_mode: false` makes it run under --check as well;
# otherwise the task is skipped there and the registered result has no `rc`.
# `failed_when: false` defers the pass/fail decision to the next task.
- name: Check nvidia-smi status
  ansible.builtin.command:
    argv:
      - nvidia-smi
      - --query-gpu=name,memory.total,driver_version
      - --format=csv,noheader
  register: nvidia_runtime_smi
  changed_when: false
  failed_when: false
  check_mode: false

# Not enforced in check mode: nothing was installed during a dry run, so a
# missing nvidia-smi says nothing about the role's outcome.
- name: Fail when post-install validation is required and nvidia-smi is unavailable
  ansible.builtin.fail:
    msg: >-
      nvidia-smi is unavailable after installation.
      This usually means a reboot is still required or the selected driver is incompatible.
  when:
    - nvidia_runtime_validate_after_install | bool
    - not ansible_check_mode
    - (nvidia_runtime_smi.rc | default(1)) != 0

# One CSV line per GPU: name, total memory, driver version.
- name: Capture GPU info lines
  ansible.builtin.set_fact:
    nvidia_runtime_gpu_lines: "{{ nvidia_runtime_smi.stdout_lines | default([]) }}"
  when: (nvidia_runtime_smi.rc | default(1)) == 0