feat: add onboarding and setup for AI nodes with NVIDIA runtime configuration
This commit is contained in:
parent
689d5a3710
commit
e61bf3d5c7
@ -33,6 +33,9 @@ heimdall ansible_host=10.0.0.151 ansible_user=chester
|
||||
waldorf ansible_host=10.0.0.251 ansible_user=chester
|
||||
ai-p410 ansible_host=10.0.0.202 ansible_user=chester
|
||||
|
||||
[ai_nodes]
|
||||
ai-p410 ansible_host=10.0.0.202 ansible_user=chester
|
||||
|
||||
[raspberry_pi]
|
||||
watchtower ansible_host=10.0.0.200 ansible_user=chester
|
||||
|
||||
|
||||
11
ansible/playbooks/onboard-ai-node.yml
Normal file
11
ansible/playbooks/onboard-ai-node.yml
Normal file
@ -0,0 +1,11 @@
|
||||
---
# Playbook: baseline onboarding for hosts in the ai_nodes inventory group.
#
# Run it against a single host, prompting for the sudo password:
#   ansible-playbook playbooks/onboard-ai-node.yml -K --limit ai-p410

- name: Onboard and baseline AI nodes
  hosts: ai_nodes
  become: true
  # Facts are required: the role asserts on ansible_os_family and
  # ansible_memtotal_mb.
  gather_facts: true

  roles:
    - ai_node_onboarding
|
||||
11
ansible/playbooks/setup-ai-gpu-runtime.yml
Normal file
11
ansible/playbooks/setup-ai-gpu-runtime.yml
Normal file
@ -0,0 +1,11 @@
|
||||
---
# Playbook: NVIDIA driver/runtime configuration for hosts in the ai_nodes
# inventory group.
#
# Run it against a single host, prompting for the sudo password:
#   ansible-playbook playbooks/setup-ai-gpu-runtime.yml -K --limit ai-p410

- name: Configure NVIDIA runtime on AI nodes
  hosts: ai_nodes
  become: true
  # Facts are required: the role branches on ansible_os_family and
  # ansible_distribution.
  gather_facts: true

  roles:
    - nvidia_runtime_setup
|
||||
36
ansible/roles/ai_node_onboarding/defaults/main.yml
Normal file
36
ansible/roles/ai_node_onboarding/defaults/main.yml
Normal file
@ -0,0 +1,36 @@
|
||||
---
# Default variables for the ai_node_onboarding role.

# Strict mode for the GPU checks. While false, a failing nvidia-smi only
# produces a warning; set to true to abort onboarding instead.
ai_node_require_nvidia_tooling: false

# Packages installed with apt on every AI node (observability and build tools).
ai_node_base_packages:
  - ca-certificates
  - curl
  - git
  - htop
  - nvtop
  - pciutils
  - python3
  - python3-pip
  - python3-venv
  - tmux

# Kernel parameters applied via sysctl, as "name: value" pairs.
# Intended as conservative tuning for mixed service + AI workloads.
ai_node_sysctl:
  vm.swappiness: '10'
  vm.max_map_count: '262144'

# Directories created for AI workloads. Models and data should live on
# persistent storage. Modes are quoted so they stay strings.
ai_node_directories:
  - path: /srv/ai
    mode: '0755'
    owner: root
    group: root
  - path: /srv/ai/models
    mode: '0755'
    owner: root
    group: root
  # Group-writable, unlike the two directories above.
  - path: /srv/ai/workspaces
    mode: '0775'
    owner: root
    group: root
|
||||
27
ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml
Normal file
27
ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml
Normal file
@ -0,0 +1,27 @@
|
||||
---
# GPU readiness probe for the ai_node_onboarding role.
# Queries nvidia-smi without failing the play, then either aborts (strict
# mode), prints a warning, or stores the reported GPU lines as a fact.

- name: Check nvidia-smi availability
  ansible.builtin.command:
    cmd: nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
  register: ai_node_nvidia_smi
  # Read-only query: never report "changed", and leave the interpretation of
  # a non-zero return code to the tasks below.
  failed_when: false
  changed_when: false

- name: Optionally fail when NVIDIA tooling is required but unavailable
  when: (ai_node_require_nvidia_tooling | bool) and (ai_node_nvidia_smi.rc != 0)
  ansible.builtin.fail:
    msg: >-
      NVIDIA GPU tooling is unavailable.
      Install a compatible NVIDIA driver and nvidia-utils package,
      then re-run onboarding.

# Only reached with a non-zero rc when strict mode is off.
- name: Warn when nvidia-smi is unavailable
  when: ai_node_nvidia_smi.rc != 0
  ansible.builtin.debug:
    msg: >-
      nvidia-smi is not available yet.
      This is common on fresh hosts before driver install.
      Continue onboarding now,
      then install validated drivers separately.

# One list entry per line printed by nvidia-smi; read by summary.yml.
- name: Capture GPU info lines
  when: ai_node_nvidia_smi.rc == 0
  ansible.builtin.set_fact:
    ai_node_gpu_lines: "{{ ai_node_nvidia_smi.stdout_lines | default([]) }}"
|
||||
15
ansible/roles/ai_node_onboarding/tasks/main.yml
Normal file
15
ansible/roles/ai_node_onboarding/tasks/main.yml
Normal file
@ -0,0 +1,15 @@
|
||||
---
# Entry point of the ai_node_onboarding role.
# The task files are statically imported and run in the order listed.

# 1. OS family / RAM assertions and the advisory lspci probe.
- name: Validate AI node prerequisites
  ansible.builtin.import_tasks: validate.yml

# 2. apt packages and the AI workload directories.
- name: Install baseline packages
  ansible.builtin.import_tasks: prereqs.yml

# 3. sysctl settings from ai_node_sysctl.
- name: Apply kernel tuning
  ansible.builtin.import_tasks: tuning.yml

# 4. nvidia-smi probe (warning or failure depending on role defaults).
- name: Run GPU readiness checks
  ansible.builtin.import_tasks: gpu_checks.yml

# 5. Human-readable recap of what was found.
- name: Show onboarding summary
  ansible.builtin.import_tasks: summary.yml
|
||||
15
ansible/roles/ai_node_onboarding/tasks/prereqs.yml
Normal file
15
ansible/roles/ai_node_onboarding/tasks/prereqs.yml
Normal file
@ -0,0 +1,15 @@
|
||||
---
# Baseline software and filesystem layout for AI nodes.

# Installs every package in ai_node_base_packages in a single apt transaction,
# refreshing the package index first.
- name: Install AI-node baseline packages
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: "{{ ai_node_base_packages }}"

# Each entry of ai_node_directories supplies path, owner, group and mode.
- name: Ensure AI workload directories exist
  loop: "{{ ai_node_directories }}"
  ansible.builtin.file:
    state: directory
    path: "{{ item.path }}"
    mode: "{{ item.mode }}"
    owner: "{{ item.owner }}"
    group: "{{ item.group }}"
|
||||
12
ansible/roles/ai_node_onboarding/tasks/summary.yml
Normal file
12
ansible/roles/ai_node_onboarding/tasks/summary.yml
Normal file
@ -0,0 +1,12 @@
|
||||
---
# Final recap for the ai_node_onboarding role.
# Every value that an earlier task file may not have set carries a default(),
# so this task can render even when the GPU facts are absent.

- name: Summarize AI-node onboarding results
  ansible.builtin.debug:
    msg:
      - '=========================================='
      - "AI node onboarding complete for {{ inventory_hostname }}"
      - '=========================================='
      - "RAM (MB): {{ ansible_memtotal_mb }}"
      # Set in validate.yml from the lspci output.
      - "NVIDIA detected via lspci: {{ ai_node_has_nvidia_gpu | default(false) }}"
      # Registered / set in gpu_checks.yml.
      - "nvidia-smi ready: {{ (ai_node_nvidia_smi.rc | default(1)) == 0 }}"
      - "GPU details: {{ ai_node_gpu_lines | default(['not available']) | join('; ') }}"
      - "AI directories: {{ ai_node_directories | map(attribute='path') | list | join(', ') }}"
|
||||
8
ansible/roles/ai_node_onboarding/tasks/tuning.yml
Normal file
8
ansible/roles/ai_node_onboarding/tasks/tuning.yml
Normal file
@ -0,0 +1,8 @@
|
||||
---
# Kernel tuning for the ai_node_onboarding role.

# ai_node_sysctl is a mapping of parameter name -> value; dict2items turns it
# into a list of {key, value} pairs so each parameter is applied in turn.
- name: Apply sysctl settings for AI workloads
  loop: "{{ ai_node_sysctl | dict2items }}"
  ansible.posix.sysctl:
    state: present
    reload: true
    name: "{{ item.key }}"
    value: "{{ item.value }}"
|
||||
26
ansible/roles/ai_node_onboarding/tasks/validate.yml
Normal file
26
ansible/roles/ai_node_onboarding/tasks/validate.yml
Normal file
@ -0,0 +1,26 @@
|
||||
---
# Pre-flight checks for the ai_node_onboarding role:
#   - hard requirements: Debian-family OS and enough memory
#   - advisory check: NVIDIA hardware visible in lspci output
# Sets the fact ai_node_has_nvidia_gpu, which summary.yml reads.

- name: Assert supported operating system family
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
    fail_msg: "ai_node_onboarding currently supports Debian/Ubuntu only."

- name: Assert minimum RAM for AI node profile
  ansible.builtin.assert:
    that:
      # ansible_memtotal_mb reports the memory usable by the kernel, which is
      # lower than the installed amount, so a host with 16 GB installed reports
      # less than 16384. The threshold therefore defaults to 15 GiB and can be
      # overridden with ai_node_min_memtotal_mb.
      - ansible_memtotal_mb | int >= (ai_node_min_memtotal_mb | default(15360) | int)
    fail_msg: "AI node profile expects at least 16 GB RAM."

- name: Detect NVIDIA GPU via lspci
  ansible.builtin.command: lspci
  register: ai_node_lspci
  changed_when: false
  # This file runs before prereqs.yml installs pciutils, so lspci may not
  # exist yet. The check is advisory and must not abort onboarding.
  failed_when: false

- name: Derive GPU detection flag
  ansible.builtin.set_fact:
    # default('') keeps the expression valid when lspci produced no stdout.
    ai_node_has_nvidia_gpu: "{{ 'NVIDIA' in (ai_node_lspci.stdout | default('')) }}"

- name: Warn when no NVIDIA GPU is detected
  ansible.builtin.debug:
    msg: "No NVIDIA GPU was detected via lspci; continuing because this check is advisory."
  when: not ai_node_has_nvidia_gpu | bool
|
||||
31
ansible/roles/nvidia_runtime_setup/README.md
Normal file
31
ansible/roles/nvidia_runtime_setup/README.md
Normal file
@ -0,0 +1,31 @@
|
||||
# nvidia_runtime_setup
|
||||
|
||||
Ansible role to configure NVIDIA driver/runtime readiness on Debian-family hosts.
|
||||
|
||||
## What it does
|
||||
|
||||
- Detects NVIDIA GPU hardware via `lspci`
|
||||
- Auto-selects a recommended driver on Ubuntu (or uses an explicit package pin)
|
||||
- Installs the NVIDIA driver package
|
||||
- Optionally installs CUDA toolkit and NVIDIA container toolkit
|
||||
- Handles optional reboot logic
|
||||
- Verifies readiness with `nvidia-smi`
|
||||
|
||||
## Safe defaults
|
||||
|
||||
- Reboot is disabled by default (`nvidia_runtime_reboot_if_needed: false`)
|
||||
- CUDA and container toolkit installs are disabled by default
|
||||
- Validation is enabled by default and fails if `nvidia-smi` is unavailable, which can happen after a fresh driver install until the host is rebooted
|
||||
|
||||
## Example
|
||||
|
||||
```yaml
|
||||
---
|
||||
- name: Configure NVIDIA runtime for AI nodes
|
||||
hosts: ai_nodes
|
||||
become: true
|
||||
roles:
|
||||
- role: nvidia_runtime_setup
|
||||
vars:
|
||||
nvidia_runtime_reboot_if_needed: true
|
||||
```
|
||||
25
ansible/roles/nvidia_runtime_setup/defaults/main.yml
Normal file
25
ansible/roles/nvidia_runtime_setup/defaults/main.yml
Normal file
@ -0,0 +1,25 @@
|
||||
---
# Default variables for the nvidia_runtime_setup role.

# Abort the role when lspci shows no NVIDIA hardware.
nvidia_runtime_require_gpu: true

# Install (or repair) the NVIDIA driver package.
nvidia_runtime_install_driver: true

# Explicit driver package to install, for example nvidia-driver-550.
# Leave empty on Ubuntu to let the role auto-detect the recommended package.
nvidia_runtime_driver_package: ''

# CUDA toolkit from the distribution repository (off by default).
nvidia_runtime_install_cuda_toolkit: false
nvidia_runtime_cuda_package: nvidia-cuda-toolkit

# NVIDIA container toolkit, if the package is available in the configured
# repositories (off by default).
nvidia_runtime_install_container_toolkit: false
nvidia_runtime_container_toolkit_package: nvidia-container-toolkit

# Reboot handling: automatic reboot is opt-in; the timeout is passed to the
# reboot task as reboot_timeout.
nvidia_runtime_reboot_if_needed: false
nvidia_runtime_reboot_timeout: 900

# Fail the role when nvidia-smi does not work after installation.
nvidia_runtime_validate_after_install: true
|
||||
64
ansible/roles/nvidia_runtime_setup/tasks/detect.yml
Normal file
64
ansible/roles/nvidia_runtime_setup/tasks/detect.yml
Normal file
@ -0,0 +1,64 @@
|
||||
---
# Hardware and driver discovery for the nvidia_runtime_setup role.
# Facts produced for later task files:
#   nvidia_runtime_has_gpu         - true when lspci output mentions NVIDIA
#   nvidia_runtime_selected_driver - explicit pin, or the auto-detected package
#                                    name (empty string when none was found)

- name: Ensure hardware detection utilities are present
  ansible.builtin.apt:
    name:
      - pciutils
      - ubuntu-drivers-common
    state: present
    update_cache: true
  when: ansible_distribution == "Ubuntu"

- name: Ensure hardware detection utilities are present (non-Ubuntu)
  ansible.builtin.apt:
    name:
      - pciutils
    state: present
    update_cache: true
  when: ansible_distribution != "Ubuntu"

- name: Detect PCI devices
  ansible.builtin.command: lspci
  register: nvidia_runtime_lspci
  changed_when: false

- name: Set hardware detection fact
  ansible.builtin.set_fact:
    nvidia_runtime_has_gpu: "{{ 'NVIDIA' in nvidia_runtime_lspci.stdout }}"

- name: Stop when GPU is required but missing
  ansible.builtin.fail:
    msg: "No NVIDIA GPU detected on this host."
  when:
    - nvidia_runtime_require_gpu | bool
    - not nvidia_runtime_has_gpu | bool

# Only needed when no explicit package was pinned. Failures are tolerated
# here; an empty selection is reported by the validation task at the end.
- name: Detect recommended Ubuntu NVIDIA driver
  ansible.builtin.command: ubuntu-drivers devices
  register: nvidia_runtime_ubuntu_drivers
  changed_when: false
  failed_when: false
  when:
    - ansible_distribution == "Ubuntu"
    - nvidia_runtime_driver_package | length == 0

- name: Derive auto-selected driver package
  vars:
    # Empty when the detection task above was skipped or printed nothing.
    nvidia_runtime_drivers_report: "{{ nvidia_runtime_ubuntu_drivers.stdout | default('') }}"
    # Full package name, including suffixes such as -server or -open, on the
    # line flagged "recommended". '.' does not match a newline, so the
    # lookahead cannot reach into the next line of the report.
    nvidia_runtime_recommended_driver_re: 'nvidia-driver-[0-9]+(?:-[a-z]+)*(?=.*recommended)'
    # Fallback: first driver package listed, when nothing is flagged.
    nvidia_runtime_any_driver_re: 'nvidia-driver-[0-9]+(?:-[a-z]+)*'
  ansible.builtin.set_fact:
    # Precedence: explicit pin, then the recommended driver, then the first
    # listed driver, then ''. regex_search returns None on no match, so the
    # "or" chain (rather than default('')) guarantees a string result.
    nvidia_runtime_selected_driver: >-
      {{
        nvidia_runtime_driver_package
        if (nvidia_runtime_driver_package | length > 0)
        else (
          (nvidia_runtime_drivers_report | regex_search(nvidia_runtime_recommended_driver_re))
          or (nvidia_runtime_drivers_report | regex_search(nvidia_runtime_any_driver_re))
          or ''
        )
      }}

- name: Validate selected driver package
  ansible.builtin.fail:
    msg: >-
      Could not determine an NVIDIA driver package automatically.
      Set nvidia_runtime_driver_package explicitly.
  when:
    - nvidia_runtime_install_driver | bool
    - nvidia_runtime_selected_driver | length == 0
|
||||
19
ansible/roles/nvidia_runtime_setup/tasks/install.yml
Normal file
19
ansible/roles/nvidia_runtime_setup/tasks/install.yml
Normal file
@ -0,0 +1,19 @@
|
||||
---
# Package installation for the nvidia_runtime_setup role.
# Each step is gated by its own boolean toggle from the role defaults.

# nvidia_runtime_selected_driver is computed in detect.yml.
- name: Install NVIDIA driver package
  when: nvidia_runtime_install_driver | bool
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: "{{ nvidia_runtime_selected_driver }}"

- name: Install CUDA toolkit package
  when: nvidia_runtime_install_cuda_toolkit | bool
  ansible.builtin.apt:
    state: present
    name: "{{ nvidia_runtime_cuda_package }}"

- name: Install NVIDIA container toolkit package
  when: nvidia_runtime_install_container_toolkit | bool
  ansible.builtin.apt:
    state: present
    name: "{{ nvidia_runtime_container_toolkit_package }}"
|
||||
18
ansible/roles/nvidia_runtime_setup/tasks/main.yml
Normal file
18
ansible/roles/nvidia_runtime_setup/tasks/main.yml
Normal file
@ -0,0 +1,18 @@
|
||||
---
# Entry point of the nvidia_runtime_setup role.
# The task files are statically imported and run in the order listed.

# 1. Assertions on OS family and role variables.
- name: Validate role inputs
  ansible.builtin.import_tasks: validate.yml

# 2. lspci probe and driver package selection.
- name: Detect NVIDIA hardware and tooling
  ansible.builtin.import_tasks: detect.yml

# 3. Driver, plus optional CUDA / container toolkit packages.
- name: Install driver and optional runtime packages
  ansible.builtin.import_tasks: install.yml

# 4. Reboot (or warn) when the host signals that a reboot is required.
- name: Handle reboot requirements
  ansible.builtin.import_tasks: reboot.yml

# 5. nvidia-smi check after installation.
- name: Validate NVIDIA runtime state
  ansible.builtin.import_tasks: verify.yml

# 6. Human-readable recap.
- name: Print runtime summary
  ansible.builtin.import_tasks: summary.yml
|
||||
22
ansible/roles/nvidia_runtime_setup/tasks/reboot.yml
Normal file
22
ansible/roles/nvidia_runtime_setup/tasks/reboot.yml
Normal file
@ -0,0 +1,22 @@
|
||||
---
# Reboot handling for the nvidia_runtime_setup role.
# The decision is based solely on the presence of /var/run/reboot-required.
# NOTE(review): presumably not every driver install creates that flag file;
# confirm on the target distribution.

- name: Check whether reboot is required
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: nvidia_runtime_reboot_required

# Flag present, automatic reboot not allowed: tell the operator.
- name: Warn when reboot is required but disabled
  when: >-
    nvidia_runtime_reboot_required.stat.exists
    and not (nvidia_runtime_reboot_if_needed | bool)
  ansible.builtin.debug:
    msg: >-
      NVIDIA packages were installed but reboot is required.
      Set nvidia_runtime_reboot_if_needed=true
      to allow automatic reboot.

# Flag present, automatic reboot allowed: reboot, waiting up to
# nvidia_runtime_reboot_timeout for the host.
- name: Reboot host when required and enabled
  when: >-
    nvidia_runtime_reboot_required.stat.exists
    and (nvidia_runtime_reboot_if_needed | bool)
  ansible.builtin.reboot:
    reboot_timeout: "{{ nvidia_runtime_reboot_timeout }}"
    msg: "Reboot triggered by nvidia_runtime_setup role"
|
||||
12
ansible/roles/nvidia_runtime_setup/tasks/summary.yml
Normal file
12
ansible/roles/nvidia_runtime_setup/tasks/summary.yml
Normal file
@ -0,0 +1,12 @@
|
||||
---
# Final recap for the nvidia_runtime_setup role.
# Every value carries a default() so the summary can render even when an
# earlier task file did not set it.

- name: Print NVIDIA runtime summary
  ansible.builtin.debug:
    msg:
      - '=========================================='
      - "NVIDIA runtime setup complete for {{ inventory_hostname }}"
      - '=========================================='
      # Both set in detect.yml.
      - "GPU detected via lspci: {{ nvidia_runtime_has_gpu | default(false) }}"
      - "Driver package selected: {{ nvidia_runtime_selected_driver | default('not set') }}"
      # Registered in reboot.yml.
      - "Reboot required: {{ nvidia_runtime_reboot_required.stat.exists | default(false) }}"
      # Registered / set in verify.yml.
      - "nvidia-smi ready: {{ (nvidia_runtime_smi.rc | default(1)) == 0 }}"
      - "GPU details: {{ nvidia_runtime_gpu_lines | default(['not available']) | join('; ') }}"
|
||||
20
ansible/roles/nvidia_runtime_setup/tasks/validate.yml
Normal file
20
ansible/roles/nvidia_runtime_setup/tasks/validate.yml
Normal file
@ -0,0 +1,20 @@
|
||||
---
# Input validation for the nvidia_runtime_setup role.
# Runs before any detection or installation so that misconfiguration is
# reported without touching the host.

- name: Assert supported OS family
  ansible.builtin.assert:
    that:
      - ansible_os_family == "Debian"
    fail_msg: "nvidia_runtime_setup currently supports Debian-family distributions only."

# Auto-detection of the driver package only exists for Ubuntu. An explicit
# package is required elsewhere, but only when a driver is going to be
# installed, which matches the driver check at the end of detect.yml.
- name: Assert explicit driver package on non-Ubuntu systems
  ansible.builtin.assert:
    that:
      - >-
        not (nvidia_runtime_install_driver | bool)
        or ansible_distribution == "Ubuntu"
        or nvidia_runtime_driver_package | length > 0
    fail_msg: >-
      On non-Ubuntu systems set nvidia_runtime_driver_package explicitly to a valid package name.

# Toggles are normalised with "| bool", as install.yml does, so string values
# such as "false" passed on the command line are interpreted consistently.
- name: Assert optional package names are not empty when enabled
  ansible.builtin.assert:
    that:
      - not (nvidia_runtime_install_cuda_toolkit | bool) or nvidia_runtime_cuda_package | length > 0
      - not (nvidia_runtime_install_container_toolkit | bool) or nvidia_runtime_container_toolkit_package | length > 0
    fail_msg: "Optional package toggles are enabled but package names are missing."
|
||||
20
ansible/roles/nvidia_runtime_setup/tasks/verify.yml
Normal file
20
ansible/roles/nvidia_runtime_setup/tasks/verify.yml
Normal file
@ -0,0 +1,20 @@
|
||||
---
# Post-install verification for the nvidia_runtime_setup role.
# Queries nvidia-smi without failing the play, then fails explicitly when
# validation is enabled, or records the reported GPU lines on success.

- name: Check nvidia-smi status
  ansible.builtin.command:
    cmd: nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
  register: nvidia_runtime_smi
  # Read-only query; the return code is interpreted by the tasks below.
  failed_when: false
  changed_when: false

- name: Fail when post-install validation is required and nvidia-smi is unavailable
  when: (nvidia_runtime_validate_after_install | bool) and (nvidia_runtime_smi.rc != 0)
  ansible.builtin.fail:
    msg: >-
      nvidia-smi is unavailable after installation.
      This usually means a reboot is still required
      or the selected driver is incompatible.

# One list entry per line printed by nvidia-smi; read by summary.yml.
- name: Capture GPU info lines
  when: nvidia_runtime_smi.rc == 0
  ansible.builtin.set_fact:
    nvidia_runtime_gpu_lines: "{{ nvidia_runtime_smi.stdout_lines | default([]) }}"
|
||||
Loading…
x
Reference in New Issue
Block a user