# File-viewer metadata (not part of the playbook): 349 lines, 13 KiB, YAML

---
# ============================================================================
# AI WORKSTATION BOOTSTRAP PLAYBOOK
# ============================================================================
# Purpose: Prepare fresh Ubuntu installations for AI/ML workloads
# Targets: ai_grid inventory group (NVIDIA GPU-equipped machines)
# ============================================================================
# Play definition: runs against the ai_grid inventory group with privilege
# escalation enabled. The vars below are consumed by the tasks of this play.
- name: Bootstrap AI workstation (GPU + Ollama + Storage)
  hosts: ai_grid
  become: true
  vars:
    # Ollama network configuration.
    # ollama_port is the single source of truth for the TCP port; ollama_host
    # is derived from it so the value written to the systemd override and the
    # port polled by the readiness check cannot drift apart.
    ollama_port: 11434
    # NOTE(review): 0.0.0.0 binds every interface - confirm the hosts sit on a
    # trusted or firewalled network.
    ollama_host: "0.0.0.0:{{ ollama_port }}"  # Listen on all interfaces

    # Essential packages for AI workstations
    essential_packages:
      - build-essential  # Compiler and build tools
      - git              # Version control
      - curl             # HTTP client
      - wget             # Download utility
      - htop             # System monitoring
      - nvtop            # GPU monitoring (NVIDIA)
      - python3-pip      # Python package manager
      - python3-venv     # Python virtual environments
      - net-tools        # Network utilities
      - nfs-common       # NFS client support
tasks:
# ========================================================================
# PHASE 1: SYSTEM BASELINE
# ========================================================================
# Refresh the apt package index. cache_valid_time (seconds) lets the module
# skip the refresh when the existing cache is still recent enough.
- name: Update apt cache
  tags:
    - baseline
    - update
  ansible.builtin.apt:
    cache_valid_time: 3600
    update_cache: true
# Perform a dist-upgrade and drop packages that are no longer needed.
# The outcome is stored in upgrade_result.
- name: Upgrade all installed packages
  ansible.builtin.apt:
    upgrade: dist
    autoremove: true
  register: upgrade_result
  tags: [baseline, update]

# autoclean runs as its own task: when it is combined with `upgrade` in a
# single call, the apt module finishes after the upgrade step and the
# autoclean step is not executed (TODO confirm against the ansible-core
# version in use).
- name: Clean obsolete package files from the apt cache
  ansible.builtin.apt:
    autoclean: true
  tags: [baseline, update]
# ========================================================================
# PHASE 2: ESSENTIAL UTILITIES
# ========================================================================
# Install every package listed in the essential_packages play variable.
- name: Install essential utilities and development tools
  tags:
    - baseline
    - utilities
  ansible.builtin.apt:
    state: present
    name: "{{ essential_packages }}"
# ========================================================================
# PHASE 2.5: IDENTITY MANAGEMENT
# ========================================================================
# Purpose: Ensure the 'chester' admin user exists with proper access
# Why: Allows the playbook to bootstrap from a fresh Ubuntu install
# without manual user creation
# ========================================================================
# Provision the 'chester' administrator account: sudo package, primary group,
# user with a locked password, SSH public key, and a passwordless sudoers
# drop-in. The tags on the block apply to every task inside it.
- name: Create chester identity and access
  tags: [identity, baseline]
  block:
    - name: Install sudo package
      ansible.builtin.apt:
        name: sudo
        state: present
        update_cache: false

    - name: Ensure chester group exists
      ansible.builtin.group:
        name: chester
        state: present

    - name: Create chester user with sudo access
      ansible.builtin.user:
        name: chester
        group: chester
        groups: sudo
        # append keeps supplementary groups the account already belongs to;
        # without it the user module resets membership to exactly `groups`,
        # removing any group added to the account outside this playbook.
        append: true
        shell: /bin/bash
        # Password login is disabled; access is via the SSH key deployed below.
        password: '!'
        password_lock: true
        comment: "Homelab Administrator"

    # The lookup reads the public key from the machine running Ansible.
    - name: Deploy SSH key to chester user
      ansible.posix.authorized_key:
        user: chester
        state: present
        key: "{{ lookup('file', '~/.ssh/id_ed25519.pub') }}"

    # visudo validates the generated file before it is moved into place, so a
    # syntax error cannot break sudo on the host.
    - name: Allow chester to use sudo without password
      ansible.builtin.copy:
        dest: /etc/sudoers.d/chester
        content: "chester ALL=(ALL) NOPASSWD: ALL\n"
        mode: '0440'
        owner: root
        group: root
        validate: '/usr/sbin/visudo -cf %s'
# ========================================================================
# PHASE 3: NVIDIA DRIVERS
# ========================================================================
# Make sure the ubuntu-drivers-common package is installed.
- name: Install ubuntu-drivers-common package
  tags:
    - gpu
    - nvidia
  ansible.builtin.apt:
    state: present
    name: ubuntu-drivers-common
# Run `ubuntu-drivers autoinstall` unless /usr/bin/nvidia-smi already exists.
# `creates` gives idempotence: the command is skipped (reported "ok") when the
# file is present and reported "changed" when it actually runs, so the result
# no longer hides a real driver installation behind changed_when: false.
- name: Detect and install recommended NVIDIA drivers
  ansible.builtin.command:
    cmd: ubuntu-drivers autoinstall
    creates: /usr/bin/nvidia-smi
  register: nvidia_install
  tags: [gpu, nvidia]
# Probe the driver by running nvidia-smi. The probe never fails the play
# (failed_when: false) and never reports a change; its result is stored in
# nvidia_check for the task that follows.
- name: Verify NVIDIA driver installation
  ansible.builtin.command:
    cmd: nvidia-smi
  register: nvidia_check
  changed_when: false
  failed_when: false
  tags:
    - gpu
    - nvidia
    - verify

# Show the captured nvidia-smi output, but only when the probe exited with 0.
- name: Display NVIDIA driver status
  when: nvidia_check.rc == 0
  ansible.builtin.debug:
    msg: "{{ nvidia_check.stdout_lines }}"
  tags:
    - gpu
    - nvidia
    - verify
# ========================================================================
# PHASE 3.5: LAPTOP TUNING & SAFETY
# ========================================================================
# Set the default kernel command line in /etc/default/grub and notify the
# "Update Grub" handler when the file changes.
# NOTE(review): lineinfile replaces the whole line matched by `regexp`, so any
# other parameters already on GRUB_CMDLINE_LINUX_DEFAULT are overwritten -
# confirm nothing else needs to be preserved.
- name: Configure GRUB for ASPM & Intel hybrid cores
  tags:
    - laptop
    - tuning
  notify: Update Grub
  ansible.builtin.lineinfile:
    path: /etc/default/grub
    line: 'GRUB_CMDLINE_LINUX_DEFAULT="quiet pcie_aspm=force intel_pstate=passive"'
    regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
# For each key below, replace the (optionally commented-out) setting in
# logind.conf with "<key>=<value>", then notify the "Restart Logind" handler
# when the file changed.
- name: Configure logind to ignore lid-close events
  tags:
    - laptop
    - safety
  notify: Restart Logind
  ansible.builtin.lineinfile:
    path: /etc/systemd/logind.conf
    line: "{{ item.key }}={{ item.value }}"
    regexp: "^#?{{ item.key }}="
  loop:
    - key: "HandleLidSwitch"
      value: "ignore"
    - key: "HandleLidSwitchExternalPower"
      value: "ignore"
# Mask each of the listed systemd targets.
- name: Mask sleep targets to keep workloads running
  tags:
    - laptop
    - safety
  ansible.builtin.systemd:
    masked: true
    name: "{{ item }}"
  loop: [sleep.target, suspend.target, hibernate.target, hybrid-sleep.target]
# Turn swap off for the running system. Runs only when the gathered facts
# report swap in use; it always changes state when it runs, so it is reported
# as changed, and a swapoff failure now fails the task instead of being masked
# by a following command.
- name: Disable swap to protect NVMe under sustained load
  ansible.builtin.command:
    cmd: swapoff -a
  when: ansible_swaptotal_mb > 0
  changed_when: true
  tags: [storage, tuning]

# Keep swap off across reboots by commenting out swap entries in /etc/fstab.
# The regexp skips lines that are already commented (so repeated runs do not
# stack '#' characters) and accepts tabs as well as spaces around the "swap"
# field. The replace module reports "changed" only when a line was edited.
- name: Comment out swap entries in /etc/fstab
  ansible.builtin.replace:
    path: /etc/fstab
    regexp: '^(?![ \t]*#)(.*[ \t]swap[ \t].*)$'
    replace: '#\1'
  when: ansible_swaptotal_mb > 0
  tags: [storage, tuning]
# Search the kernel log for "Hardware Feedback Interface" messages
# (case-insensitive). Purely informational: the task never fails or reports a
# change, and the output is kept in hfi_check.
- name: Check Intel Thread Director support messages
  tags:
    - verify
    - laptop
  ansible.builtin.shell:
    cmd: "dmesg | grep -i 'Hardware Feedback Interface'"
  register: hfi_check
  changed_when: false
  failed_when: false
# ========================================================================
# PHASE 4: OLLAMA INSTALLATION
# ========================================================================
# Record whether the Ollama binary is already present; the download and
# install tasks below are skipped when it is.
- name: Check if Ollama is already installed
  ansible.builtin.stat:
    path: /usr/local/bin/ollama
  register: ollama_binary
  tags: [ollama]

# force: true makes get_url fetch the script even if a file already exists at
# dest. Without it, a stale or pre-existing /tmp/ollama-install.sh would be
# left untouched and then executed with root privileges by the next task.
- name: Download Ollama installation script
  ansible.builtin.get_url:
    url: https://ollama.ai/install.sh
    dest: /tmp/ollama-install.sh
    mode: '0755'
    force: true
  when: not ollama_binary.stat.exists
  tags: [ollama]

# The installer only runs when the binary was missing, so a run is always a
# real change and is reported as one.
- name: Install Ollama
  ansible.builtin.command:
    cmd: /tmp/ollama-install.sh
  when: not ollama_binary.stat.exists
  changed_when: true
  tags: [ollama]
# Ensure the drop-in directory for the ollama unit exists.
- name: Create systemd override directory for Ollama
  tags:
    - ollama
    - network
  ansible.builtin.file:
    state: directory
    path: /etc/systemd/system/ollama.service.d
    mode: '0755'

# Write a drop-in that sets OLLAMA_HOST from the ollama_host variable and
# notify the "Restart ollama" handler when the file content changes.
- name: Configure Ollama to listen on all network interfaces
  tags:
    - ollama
    - network
  notify: Restart ollama
  ansible.builtin.copy:
    dest: /etc/systemd/system/ollama.service.d/override.conf
    mode: '0644'
    content: |
      [Service]
      Environment="OLLAMA_HOST={{ ollama_host }}"
# Reload systemd unit definitions, enable the service at boot and make sure it
# is running.
- name: Ensure Ollama service is enabled and started
  ansible.builtin.systemd:
    name: ollama
    state: started
    enabled: true
    daemon_reload: true
  tags: [ollama]

# Run any notified handlers now. A changed override.conf notifies the
# "Restart ollama" handler, so the new binding is applied at this point and
# only when the configuration actually changed. No unconditional restart
# follows: it would restart the service on every playbook run and a second
# time right after the handler.
- name: Apply pending Ollama handler changes before readiness check
  ansible.builtin.meta: flush_handlers
  tags: [ollama]
# Poll the Ollama TCP port on ansible_host: wait 5 seconds before the first
# attempt and give up after 30 seconds.
- name: Wait for Ollama service to be ready
  tags:
    - ollama
    - verify
  ansible.builtin.wait_for:
    port: "{{ ollama_port }}"
    host: "{{ ansible_host }}"
    timeout: 30
    delay: 5
# ========================================================================
# PHASE 5: NFS STORAGE MOUNTS (TODO)
# ========================================================================
# Instructions:
# 1. Define NFS server variables in group_vars/ai_grid.yml:
# nfs_server: "10.0.0.249"
# nfs_export: "/volume1/ai-datasets"
# nfs_mount_point: "/mnt/ai-datasets"
#
# 2. Uncomment the tasks below and customize paths
# ========================================================================
# - name: Create NFS mount point directory
# ansible.builtin.file:
# path: "{{ nfs_mount_point }}"
# state: directory
# owner: "{{ ansible_user }}"
# group: "{{ ansible_user }}"
# mode: '0755'
# tags: [storage, nfs]
#
# - name: Mount NFS share for AI datasets
# ansible.posix.mount:
# path: "{{ nfs_mount_point }}"
# src: "{{ nfs_server }}:{{ nfs_export }}"
# fstype: nfs
# opts: defaults,nfsvers=4
# state: mounted
# tags: [storage, nfs]
#
# - name: Verify NFS mount is accessible
# ansible.builtin.command: "ls -la {{ nfs_mount_point }}"
# register: nfs_verify
# changed_when: false
# tags: [storage, nfs, verify]
# ========================================================================
# PHASE 6: POST-INSTALL VERIFICATION
# ========================================================================
# Look for the /var/run/reboot-required marker file and keep the stat result
# in reboot_required.
- name: Check if system reboot is required
  tags:
    - verify
    - reboot
  ansible.builtin.stat:
    path: /var/run/reboot-required
  register: reboot_required

# Print the reboot banner only when the marker file exists.
- name: Display reboot notification if needed
  tags:
    - verify
    - reboot
  when: reboot_required.stat.exists
  ansible.builtin.debug:
    msg: |
      ╔════════════════════════════════════════════════════════════════╗
      ║ WARNING: System reboot is required to complete installation ║
      ║ Reason: Kernel or driver updates ║
      ║ Action: Please reboot this host manually ║
      ╚════════════════════════════════════════════════════════════════╝
# Print the end-of-run summary banner. The API line is templated from
# ansible_host and ollama_port.
- name: Display bootstrap completion summary
  tags:
    - verify
  ansible.builtin.debug:
    msg:
      - "╔════════════════════════════════════════════════════════════════╗"
      - "║ AI Workstation Bootstrap Complete! ║"
      - "╠════════════════════════════════════════════════════════════════╣"
      - "║ ✓ System updated and essential utilities installed ║"
      - "║ ✓ NVIDIA drivers installed (verify with nvidia-smi) ║"
      - "║ ✓ Ollama installed and network-accessible ║"
      - "║ → Ollama API: http://{{ ansible_host }}:{{ ollama_port }} ║"
      - "╠════════════════════════════════════════════════════════════════╣"
      - "║ Next Steps: ║"
      - "║ 1. Reboot if required (check above) ║"
      - "║ 2. Pull models: ollama pull llama3.1:8b ║"
      - "║ 3. Configure NFS mounts (see Phase 5 in playbook) ║"
      - "╚════════════════════════════════════════════════════════════════╝"
# ==========================================================================
# HANDLERS
# ==========================================================================
handlers:
# Handler: reload systemd unit definitions and restart the ollama service.
# Triggered via `notify: Restart ollama`.
- name: Restart ollama
  ansible.builtin.systemd:
    daemon_reload: true
    state: restarted
    name: ollama
# Handler: regenerate the GRUB configuration. It only runs after a task that
# notified "Update Grub" reported a change, so every execution is a real
# change and is reported as one.
- name: Update Grub
  ansible.builtin.command:
    cmd: update-grub
  changed_when: true
# Handler: restart systemd-logind. Triggered via `notify: Restart Logind`.
- name: Restart Logind
  ansible.builtin.systemd:
    state: restarted
    name: systemd-logind