From e61bf3d5c769dc9891bb08a50fbb54e0509c27ae Mon Sep 17 00:00:00 2001
From: Nathan
Date: Tue, 21 Apr 2026 12:01:58 -0400
Subject: [PATCH] feat: add onboarding and setup for AI nodes with NVIDIA runtime configuration

---
 ansible/inventory/hosts.ini                   |  3 +
 ansible/playbooks/onboard-ai-node.yml         | 11 ++++
 ansible/playbooks/setup-ai-gpu-runtime.yml    | 11 ++++
 .../ai_node_onboarding/defaults/main.yml      | 36 +++++++++++
 .../ai_node_onboarding/tasks/gpu_checks.yml   | 27 ++++++++
 .../roles/ai_node_onboarding/tasks/main.yml   | 15 +++++
 .../ai_node_onboarding/tasks/prereqs.yml      | 15 +++++
 .../ai_node_onboarding/tasks/summary.yml      | 12 ++++
 .../roles/ai_node_onboarding/tasks/tuning.yml |  8 +++
 .../ai_node_onboarding/tasks/validate.yml     | 26 ++++++++
 ansible/roles/nvidia_runtime_setup/README.md  | 31 +++++++++
 .../nvidia_runtime_setup/defaults/main.yml    | 25 ++++++++
 .../nvidia_runtime_setup/tasks/detect.yml     | 64 +++++++++++++++++++
 .../nvidia_runtime_setup/tasks/install.yml    | 19 ++++++
 .../roles/nvidia_runtime_setup/tasks/main.yml | 18 ++++++
 .../nvidia_runtime_setup/tasks/reboot.yml     | 22 +++++++
 .../nvidia_runtime_setup/tasks/summary.yml    | 12 ++++
 .../nvidia_runtime_setup/tasks/validate.yml   | 20 ++++++
 .../nvidia_runtime_setup/tasks/verify.yml     | 20 ++++++
 19 files changed, 395 insertions(+)
 create mode 100644 ansible/playbooks/onboard-ai-node.yml
 create mode 100644 ansible/playbooks/setup-ai-gpu-runtime.yml
 create mode 100644 ansible/roles/ai_node_onboarding/defaults/main.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/main.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/prereqs.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/summary.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/tuning.yml
 create mode 100644 ansible/roles/ai_node_onboarding/tasks/validate.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/README.md
 create mode 100644 ansible/roles/nvidia_runtime_setup/defaults/main.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/detect.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/install.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/main.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/reboot.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/summary.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/validate.yml
 create mode 100644 ansible/roles/nvidia_runtime_setup/tasks/verify.yml

diff --git a/ansible/inventory/hosts.ini b/ansible/inventory/hosts.ini
index 1690718..211d750 100644
--- a/ansible/inventory/hosts.ini
+++ b/ansible/inventory/hosts.ini
@@ -33,6 +33,9 @@ heimdall ansible_host=10.0.0.151 ansible_user=chester
 waldorf ansible_host=10.0.0.251 ansible_user=chester
 ai-p410 ansible_host=10.0.0.202 ansible_user=chester
 
+[ai_nodes]
+ai-p410 ansible_host=10.0.0.202 ansible_user=chester
+
 [raspberry_pi]
 watchtower ansible_host=10.0.0.200 ansible_user=chester
 
diff --git a/ansible/playbooks/onboard-ai-node.yml b/ansible/playbooks/onboard-ai-node.yml
new file mode 100644
index 0000000..deafaad
--- /dev/null
+++ b/ansible/playbooks/onboard-ai-node.yml
@@ -0,0 +1,11 @@
+---
+# Dedicated onboarding workflow for AI-focused nodes.
+# Usage: ansible-playbook playbooks/onboard-ai-node.yml -K --limit ai-p410
+
+- name: Onboard and baseline AI nodes
+  hosts: ai_nodes
+  gather_facts: true
+  become: true
+
+  roles:
+    - role: ai_node_onboarding
diff --git a/ansible/playbooks/setup-ai-gpu-runtime.yml b/ansible/playbooks/setup-ai-gpu-runtime.yml
new file mode 100644
index 0000000..8ce1ce5
--- /dev/null
+++ b/ansible/playbooks/setup-ai-gpu-runtime.yml
@@ -0,0 +1,11 @@
+---
+# Configure NVIDIA driver/runtime for AI nodes.
+# Usage: ansible-playbook playbooks/setup-ai-gpu-runtime.yml -K --limit ai-p410
+
+- name: Configure NVIDIA runtime on AI nodes
+  hosts: ai_nodes
+  gather_facts: true
+  become: true
+
+  roles:
+    - role: nvidia_runtime_setup
diff --git a/ansible/roles/ai_node_onboarding/defaults/main.yml b/ansible/roles/ai_node_onboarding/defaults/main.yml
new file mode 100644
index 0000000..a538766
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/defaults/main.yml
@@ -0,0 +1,36 @@
+---
+# Toggle to true only if you intentionally want to hard-fail when NVIDIA tooling is missing.
+ai_node_require_nvidia_tooling: false
+
+# OS packages useful for AI-node observability and build workloads.
+ai_node_base_packages:
+  - ca-certificates
+  - curl
+  - git
+  - htop
+  - nvtop
+  - pciutils
+  - python3
+  - python3-pip
+  - python3-venv
+  - tmux
+
+# Conservative kernel tuning for mixed service + AI workloads.
+ai_node_sysctl:
+  vm.swappiness: "10"
+  vm.max_map_count: "262144"
+
+# AI workload directories. Keep models/data on persistent storage.
+ai_node_directories:
+  - path: /srv/ai
+    owner: root
+    group: root
+    mode: "0755"
+  - path: /srv/ai/models
+    owner: root
+    group: root
+    mode: "0755"
+  - path: /srv/ai/workspaces
+    owner: root
+    group: root
+    mode: "0775"
diff --git a/ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml b/ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml
new file mode 100644
index 0000000..f9757a2
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/gpu_checks.yml
@@ -0,0 +1,27 @@
+---
+- name: Check nvidia-smi availability
+  ansible.builtin.command: nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
+  register: ai_node_nvidia_smi
+  changed_when: false
+  failed_when: false
+
+- name: Optionally fail when NVIDIA tooling is required but unavailable
+  ansible.builtin.fail:
+    msg: >-
+      NVIDIA GPU tooling is unavailable. Install a compatible NVIDIA driver and
+      nvidia-utils package, then re-run onboarding.
+  when:
+    - ai_node_require_nvidia_tooling | bool
+    - ai_node_nvidia_smi.rc != 0
+
+- name: Warn when nvidia-smi is unavailable
+  ansible.builtin.debug:
+    msg: >-
+      nvidia-smi is not available yet. This is common on fresh hosts before driver install.
+      Continue onboarding now, then install validated drivers separately.
+  when: ai_node_nvidia_smi.rc != 0
+
+- name: Capture GPU info lines
+  ansible.builtin.set_fact:
+    ai_node_gpu_lines: "{{ ai_node_nvidia_smi.stdout_lines | default([]) }}"
+  when: ai_node_nvidia_smi.rc == 0
diff --git a/ansible/roles/ai_node_onboarding/tasks/main.yml b/ansible/roles/ai_node_onboarding/tasks/main.yml
new file mode 100644
index 0000000..567508d
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/main.yml
@@ -0,0 +1,15 @@
+---
+- name: Validate AI node prerequisites
+  ansible.builtin.import_tasks: validate.yml
+
+- name: Install baseline packages
+  ansible.builtin.import_tasks: prereqs.yml
+
+- name: Apply kernel tuning
+  ansible.builtin.import_tasks: tuning.yml
+
+- name: Run GPU readiness checks
+  ansible.builtin.import_tasks: gpu_checks.yml
+
+- name: Show onboarding summary
+  ansible.builtin.import_tasks: summary.yml
diff --git a/ansible/roles/ai_node_onboarding/tasks/prereqs.yml b/ansible/roles/ai_node_onboarding/tasks/prereqs.yml
new file mode 100644
index 0000000..bcddb45
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/prereqs.yml
@@ -0,0 +1,15 @@
+---
+- name: Install AI-node baseline packages
+  ansible.builtin.apt:
+    name: "{{ ai_node_base_packages }}"
+    state: present
+    update_cache: true
+
+- name: Ensure AI workload directories exist
+  ansible.builtin.file:
+    path: "{{ item.path }}"
+    state: directory
+    owner: "{{ item.owner }}"
+    group: "{{ item.group }}"
+    mode: "{{ item.mode }}"
+  loop: "{{ ai_node_directories }}"
diff --git a/ansible/roles/ai_node_onboarding/tasks/summary.yml b/ansible/roles/ai_node_onboarding/tasks/summary.yml
new file mode 100644
index 0000000..ff4d38a
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/summary.yml
@@ -0,0 +1,12 @@
+---
+- name: Summarize AI-node onboarding results
+  ansible.builtin.debug:
+    msg:
+      - "=========================================="
+      - "AI node onboarding complete for {{ inventory_hostname }}"
+      - "=========================================="
+      - "RAM (MB): {{ ansible_memtotal_mb }}"
+      - "NVIDIA detected via lspci: {{ ai_node_has_nvidia_gpu | default(false) }}"
+      - "nvidia-smi ready: {{ (ai_node_nvidia_smi.rc | default(1)) == 0 }}"
+      - "GPU details: {{ ai_node_gpu_lines | default(['not available']) | join('; ') }}"
+      - "AI directories: {{ ai_node_directories | map(attribute='path') | list | join(', ') }}"
diff --git a/ansible/roles/ai_node_onboarding/tasks/tuning.yml b/ansible/roles/ai_node_onboarding/tasks/tuning.yml
new file mode 100644
index 0000000..5e200e5
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/tuning.yml
@@ -0,0 +1,8 @@
+---
+- name: Apply sysctl settings for AI workloads
+  ansible.posix.sysctl:
+    name: "{{ item.key }}"
+    value: "{{ item.value }}"
+    state: present
+    reload: true
+  loop: "{{ ai_node_sysctl | dict2items }}"
diff --git a/ansible/roles/ai_node_onboarding/tasks/validate.yml b/ansible/roles/ai_node_onboarding/tasks/validate.yml
new file mode 100644
index 0000000..0b283d0
--- /dev/null
+++ b/ansible/roles/ai_node_onboarding/tasks/validate.yml
@@ -0,0 +1,29 @@
+---
+- name: Assert supported operating system family
+  ansible.builtin.assert:
+    that:
+      - ansible_os_family == "Debian"
+    fail_msg: "ai_node_onboarding currently supports Debian/Ubuntu only."
+
+- name: Assert minimum RAM for AI node profile
+  ansible.builtin.assert:
+    that:
+      - ansible_memtotal_mb | int >= 16384
+    fail_msg: "AI node profile expects at least 16 GB RAM."
+
+# lspci ships in pciutils, which prereqs.yml installs only after this file has
+# run; tolerate a missing binary so this advisory check cannot abort the play.
+- name: Detect NVIDIA GPU via lspci
+  ansible.builtin.command: lspci
+  register: ai_node_lspci
+  changed_when: false
+  failed_when: false
+
+- name: Derive GPU detection flag
+  ansible.builtin.set_fact:
+    ai_node_has_nvidia_gpu: "{{ 'NVIDIA' in (ai_node_lspci.stdout | default('')) }}"
+
+- name: Warn when no NVIDIA GPU is detected
+  ansible.builtin.debug:
+    msg: "No NVIDIA GPU was detected via lspci; continuing because this check is advisory."
+  when: not ai_node_has_nvidia_gpu | bool
diff --git a/ansible/roles/nvidia_runtime_setup/README.md b/ansible/roles/nvidia_runtime_setup/README.md
new file mode 100644
index 0000000..f3f695b
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/README.md
@@ -0,0 +1,31 @@
+# nvidia_runtime_setup
+
+Ansible role to configure NVIDIA driver/runtime readiness on Debian-family hosts.
+
+## What it does
+
+- Detects NVIDIA GPU hardware via `lspci`
+- Auto-selects a recommended driver on Ubuntu (or uses an explicit package pin)
+- Installs the NVIDIA driver package
+- Optionally installs CUDA toolkit and NVIDIA container toolkit
+- Handles optional reboot logic
+- Verifies readiness with `nvidia-smi`
+
+## Safe defaults
+
+- Reboot is disabled by default (`nvidia_runtime_reboot_if_needed: false`)
+- CUDA and container toolkit installs are disabled by default
+- Validation is enabled by default and fails if `nvidia-smi` is unavailable
+
+## Example
+
+```yaml
+---
+- name: Configure NVIDIA runtime for AI nodes
+  hosts: ai_nodes
+  become: true
+  roles:
+    - role: nvidia_runtime_setup
+      vars:
+        nvidia_runtime_reboot_if_needed: true
+```
diff --git a/ansible/roles/nvidia_runtime_setup/defaults/main.yml b/ansible/roles/nvidia_runtime_setup/defaults/main.yml
new file mode 100644
index 0000000..721d68b
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/defaults/main.yml
@@ -0,0 +1,25 @@
+---
+# Fail if no NVIDIA hardware is detected.
+nvidia_runtime_require_gpu: true
+
+# Install/repair NVIDIA driver packages.
+nvidia_runtime_install_driver: true
+
+# Optional explicit driver package pin (for example: nvidia-driver-550).
+# When empty on Ubuntu, the role will auto-detect the recommended package.
+nvidia_runtime_driver_package: ""
+
+# Install CUDA toolkit from distro repository.
+nvidia_runtime_install_cuda_toolkit: false
+nvidia_runtime_cuda_package: nvidia-cuda-toolkit
+
+# Install NVIDIA container runtime package if available in configured repos.
+nvidia_runtime_install_container_toolkit: false
+nvidia_runtime_container_toolkit_package: nvidia-container-toolkit
+
+# Reboot handling
+nvidia_runtime_reboot_if_needed: false
+nvidia_runtime_reboot_timeout: 900
+
+# Post-install validation behavior
+nvidia_runtime_validate_after_install: true
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/detect.yml b/ansible/roles/nvidia_runtime_setup/tasks/detect.yml
new file mode 100644
index 0000000..0562f74
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/detect.yml
@@ -0,0 +1,64 @@
+---
+- name: Ensure hardware detection utilities are present
+  ansible.builtin.apt:
+    name:
+      - pciutils
+      - ubuntu-drivers-common
+    state: present
+    update_cache: true
+  when: ansible_distribution == "Ubuntu"
+
+- name: Ensure hardware detection utilities are present (non-Ubuntu)
+  ansible.builtin.apt:
+    name:
+      - pciutils
+    state: present
+    update_cache: true
+  when: ansible_distribution != "Ubuntu"
+
+- name: Detect PCI devices
+  ansible.builtin.command: lspci
+  register: nvidia_runtime_lspci
+  changed_when: false
+
+- name: Set hardware detection fact
+  ansible.builtin.set_fact:
+    nvidia_runtime_has_gpu: "{{ 'NVIDIA' in nvidia_runtime_lspci.stdout }}"
+
+- name: Stop when GPU is required but missing
+  ansible.builtin.fail:
+    msg: "No NVIDIA GPU detected on this host."
+  when:
+    - nvidia_runtime_require_gpu | bool
+    - not nvidia_runtime_has_gpu | bool
+
+- name: Detect recommended Ubuntu NVIDIA driver
+  ansible.builtin.command: ubuntu-drivers devices
+  register: nvidia_runtime_ubuntu_drivers
+  changed_when: false
+  failed_when: false
+  when:
+    - ansible_distribution == "Ubuntu"
+    - nvidia_runtime_driver_package | length == 0
+
+- name: Derive auto-selected driver package  # explicit pin wins; otherwise the entry flagged "recommended"
+  ansible.builtin.set_fact:  # default('', true) also maps a no-match None to '' for the length check below
+    nvidia_runtime_selected_driver: >-
+      {{
+        nvidia_runtime_driver_package
+        if (nvidia_runtime_driver_package | length > 0)
+        else (
+          (nvidia_runtime_ubuntu_drivers.stdout | default(''))
+          | regex_search('nvidia-driver-[0-9]+(?:-[a-z]+)*(?= .*recommended)')
+          | default('', true)
+        )
+      }}
+
+- name: Validate selected driver package
+  ansible.builtin.fail:
+    msg: >-
+      Could not determine an NVIDIA driver package automatically.
+      Set nvidia_runtime_driver_package explicitly.
+  when:
+    - nvidia_runtime_install_driver | bool
+    - nvidia_runtime_selected_driver | length == 0
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/install.yml b/ansible/roles/nvidia_runtime_setup/tasks/install.yml
new file mode 100644
index 0000000..d710852
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/install.yml
@@ -0,0 +1,19 @@
+---
+- name: Install NVIDIA driver package
+  ansible.builtin.apt:
+    name: "{{ nvidia_runtime_selected_driver }}"
+    state: present
+    update_cache: true
+  when: nvidia_runtime_install_driver | bool
+
+- name: Install CUDA toolkit package
+  ansible.builtin.apt:
+    name: "{{ nvidia_runtime_cuda_package }}"
+    state: present
+  when: nvidia_runtime_install_cuda_toolkit | bool
+
+- name: Install NVIDIA container toolkit package
+  ansible.builtin.apt:
+    name: "{{ nvidia_runtime_container_toolkit_package }}"
+    state: present
+  when: nvidia_runtime_install_container_toolkit | bool
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/main.yml b/ansible/roles/nvidia_runtime_setup/tasks/main.yml
new file mode 100644
index 0000000..45e319b
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/main.yml
@@ -0,0 +1,18 @@
+---
+- name: Validate role inputs
+  ansible.builtin.import_tasks: validate.yml
+
+- name: Detect NVIDIA hardware and tooling
+  ansible.builtin.import_tasks: detect.yml
+
+- name: Install driver and optional runtime packages
+  ansible.builtin.import_tasks: install.yml
+
+- name: Handle reboot requirements
+  ansible.builtin.import_tasks: reboot.yml
+
+- name: Validate NVIDIA runtime state
+  ansible.builtin.import_tasks: verify.yml
+
+- name: Print runtime summary
+  ansible.builtin.import_tasks: summary.yml
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/reboot.yml b/ansible/roles/nvidia_runtime_setup/tasks/reboot.yml
new file mode 100644
index 0000000..073584a
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/reboot.yml
@@ -0,0 +1,22 @@
+---
+- name: Check whether reboot is required
+  ansible.builtin.stat:
+    path: /var/run/reboot-required
+  register: nvidia_runtime_reboot_required
+
+- name: Warn when reboot is required but disabled
+  ansible.builtin.debug:
+    msg: >-
+      NVIDIA packages were installed but reboot is required.
+      Set nvidia_runtime_reboot_if_needed=true to allow automatic reboot.
+  when:
+    - nvidia_runtime_reboot_required.stat.exists
+    - not nvidia_runtime_reboot_if_needed | bool
+
+- name: Reboot host when required and enabled
+  ansible.builtin.reboot:
+    msg: "Reboot triggered by nvidia_runtime_setup role"
+    reboot_timeout: "{{ nvidia_runtime_reboot_timeout }}"
+  when:
+    - nvidia_runtime_reboot_required.stat.exists
+    - nvidia_runtime_reboot_if_needed | bool
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/summary.yml b/ansible/roles/nvidia_runtime_setup/tasks/summary.yml
new file mode 100644
index 0000000..a59e658
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/summary.yml
@@ -0,0 +1,12 @@
+---
+- name: Print NVIDIA runtime summary
+  ansible.builtin.debug:
+    msg:
+      - "=========================================="
+      - "NVIDIA runtime setup complete for {{ inventory_hostname }}"
+      - "=========================================="
+      - "GPU detected via lspci: {{ nvidia_runtime_has_gpu | default(false) }}"
+      - "Driver package selected: {{ nvidia_runtime_selected_driver | default('not set') }}"
+      - "Reboot required: {{ nvidia_runtime_reboot_required.stat.exists | default(false) }}"
+      - "nvidia-smi ready: {{ (nvidia_runtime_smi.rc | default(1)) == 0 }}"
+      - "GPU details: {{ nvidia_runtime_gpu_lines | default(['not available']) | join('; ') }}"
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/validate.yml b/ansible/roles/nvidia_runtime_setup/tasks/validate.yml
new file mode 100644
index 0000000..cd73325
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/validate.yml
@@ -0,0 +1,20 @@
+---
+- name: Assert supported OS family
+  ansible.builtin.assert:
+    that:
+      - ansible_os_family == "Debian"
+    fail_msg: "nvidia_runtime_setup currently supports Debian-family distributions only."
+
+- name: Assert explicit driver package on non-Ubuntu systems
+  ansible.builtin.assert:
+    that:  # a pin is only needed when this role is actually asked to install a driver
+      - ansible_distribution == "Ubuntu" or not nvidia_runtime_install_driver | bool or nvidia_runtime_driver_package | length > 0
+    fail_msg: >-
+      On non-Ubuntu systems set nvidia_runtime_driver_package explicitly to a valid package name.
+
+- name: Assert optional package names are not empty when enabled
+  ansible.builtin.assert:
+    that:  # cast toggles with | bool so string values such as "false" passed via -e are honoured
+      - not nvidia_runtime_install_cuda_toolkit | bool or nvidia_runtime_cuda_package | length > 0
+      - not nvidia_runtime_install_container_toolkit | bool or nvidia_runtime_container_toolkit_package | length > 0
+    fail_msg: "Optional package toggles are enabled but package names are missing."
diff --git a/ansible/roles/nvidia_runtime_setup/tasks/verify.yml b/ansible/roles/nvidia_runtime_setup/tasks/verify.yml
new file mode 100644
index 0000000..072f7a6
--- /dev/null
+++ b/ansible/roles/nvidia_runtime_setup/tasks/verify.yml
@@ -0,0 +1,20 @@
+---
+- name: Check nvidia-smi status
+  ansible.builtin.command: nvidia-smi --query-gpu=name,memory.total,driver_version --format=csv,noheader
+  register: nvidia_runtime_smi
+  changed_when: false
+  failed_when: false
+
+- name: Fail when post-install validation is required and nvidia-smi is unavailable
+  ansible.builtin.fail:
+    msg: >-
+      nvidia-smi is unavailable after installation.
+      This usually means a reboot is still required or the selected driver is incompatible.
+  when:
+    - nvidia_runtime_validate_after_install | bool
+    - nvidia_runtime_smi.rc != 0
+
+- name: Capture GPU info lines
+  ansible.builtin.set_fact:
+    nvidia_runtime_gpu_lines: "{{ nvidia_runtime_smi.stdout_lines | default([]) }}"
+  when: nvidia_runtime_smi.rc == 0