---
# ============================================================================
# AI WORKSTATION BOOTSTRAP PLAYBOOK
# ============================================================================
# Purpose: Prepare fresh Ubuntu installations for AI/ML workloads
# Targets: ai_grid inventory group (NVIDIA GPU-equipped machines)
# ============================================================================

- name: Bootstrap AI workstation (GPU + Ollama + Storage)
  hosts: ai_grid
  become: true

  vars:
    # Ollama network configuration
    ollama_port: 11434
    # Listen on all interfaces. Derived from ollama_port so that the systemd
    # override, the readiness check and the summary always agree on the port.
    ollama_host: "0.0.0.0:{{ ollama_port }}"

    # Essential packages for AI workstations
    essential_packages:
      - build-essential  # Compiler and build tools
      - git              # Version control
      - curl             # HTTP client
      - wget             # Download utility
      - htop             # System monitoring
      - nvtop            # GPU monitoring (NVIDIA)
      - python3-pip      # Python package manager
      - python3-venv     # Python virtual environments
      - net-tools        # Network utilities
      - nfs-common       # NFS client support

  tasks:
    # ========================================================================
    # PHASE 1: SYSTEM BASELINE
    # ========================================================================

    - name: Update apt cache
      ansible.builtin.apt:
        update_cache: true
        # Skip the refresh when the cache is younger than one hour.
        cache_valid_time: 3600
      tags: [baseline, update]

    - name: Upgrade all installed packages
      ansible.builtin.apt:
        upgrade: dist
        autoremove: true
        autoclean: true
      register: upgrade_result
      tags: [baseline, update]

    # ========================================================================
    # PHASE 2: ESSENTIAL UTILITIES
    # ========================================================================

    - name: Install essential utilities and development tools
      ansible.builtin.apt:
        name: "{{ essential_packages }}"
        state: present
      tags: [baseline, utilities]

    # ========================================================================
    # PHASE 2.5: IDENTITY MANAGEMENT
    # ========================================================================
    # Purpose: Ensure the 'chester' admin user exists with proper access
    # Why: Allows the playbook to bootstrap from a fresh Ubuntu install
    #      without manual user creation
    # ========================================================================

    - name: Create chester identity and access
      block:
        - name: Install sudo package
          ansible.builtin.apt:
            name: sudo
            state: present
            update_cache: false

        - name: Ensure chester group exists
          ansible.builtin.group:
            name: chester
            state: present

        - name: Create chester user with sudo access
          ansible.builtin.user:
            name: chester
            group: chester
            groups: sudo
            shell: /bin/bash
            # Locked password: the account is reachable only through the
            # SSH key deployed below.
            password: '!'
            password_lock: true
            comment: "Homelab Administrator"

        - name: Deploy SSH key to chester user
          ansible.posix.authorized_key:
            user: chester
            state: present
            # Lookups are evaluated on the control node, so this is the
            # public key of the user running ansible-playbook.
            key: "{{ lookup('file', '~/.ssh/id_ed25519.pub') }}"

        - name: Allow chester to use sudo without password
          ansible.builtin.copy:
            dest: /etc/sudoers.d/chester
            content: "chester ALL=(ALL) NOPASSWD: ALL\n"
            mode: '0440'
            owner: root
            group: root
            # Refuse to install a drop-in that visudo cannot parse.
            validate: '/usr/sbin/visudo -cf %s'

      tags: [identity, baseline]

    # ========================================================================
    # PHASE 3: NVIDIA DRIVERS
    # ========================================================================

    - name: Install ubuntu-drivers-common package
      ansible.builtin.apt:
        name: ubuntu-drivers-common
        state: present
      tags: [gpu, nvidia]

    - name: Detect and install recommended NVIDIA drivers
      ansible.builtin.command:
        cmd: ubuntu-drivers autoinstall
        # Skipped once nvidia-smi exists. When the command does run it is
        # reported as "changed", so a driver install shows up in the recap.
        creates: /usr/bin/nvidia-smi
      register: nvidia_install
      tags: [gpu, nvidia]

    - name: Verify NVIDIA driver installation
      ansible.builtin.command: nvidia-smi
      register: nvidia_check
      # Best-effort probe: a non-zero exit must not abort the play; the
      # result is only used by the display task below.
      failed_when: false
      changed_when: false
      tags: [gpu, nvidia, verify]

    - name: Display NVIDIA driver status
      ansible.builtin.debug:
        msg: "{{ nvidia_check.stdout_lines }}"
      when: nvidia_check.rc == 0
      tags: [gpu, nvidia, verify]

    # ========================================================================
    # PHASE 3.5: LAPTOP TUNING & SAFETY
    # ========================================================================

    - name: Configure GRUB for ASPM & Intel hybrid cores
      ansible.builtin.lineinfile:
        path: /etc/default/grub
        # NOTE: replaces the whole GRUB_CMDLINE_LINUX_DEFAULT line, so any
        # other kernel parameters previously set there are dropped.
        regexp: '^GRUB_CMDLINE_LINUX_DEFAULT='
        line: 'GRUB_CMDLINE_LINUX_DEFAULT="quiet pcie_aspm=force intel_pstate=passive"'
      notify: Update Grub
      tags: [laptop, tuning]

    - name: Configure logind to ignore lid-close events
      ansible.builtin.lineinfile:
        path: /etc/systemd/logind.conf
        # Matches the setting whether it is still commented out or already set.
        regexp: "^#?{{ item.key }}="
        line: "{{ item.key }}={{ item.value }}"
      loop:
        - { key: "HandleLidSwitch", value: "ignore" }
        - { key: "HandleLidSwitchExternalPower", value: "ignore" }
      notify: Restart Logind
      tags: [laptop, safety]

    - name: Mask sleep targets to keep workloads running
      ansible.builtin.systemd:
        name: "{{ item }}"
        masked: true
      loop:
        - sleep.target
        - suspend.target
        - hibernate.target
        - hybrid-sleep.target
      tags: [laptop, safety]

    - name: Disable swap to protect NVMe under sustained load
      ansible.builtin.command: swapoff -a
      # Only runs (and reports "changed") while swap is actually active.
      when: ansible_swaptotal_mb > 0
      tags: [storage, tuning]

    - name: Comment out swap entries in /etc/fstab
      ansible.builtin.replace:
        path: /etc/fstab
        # Matches only lines that are not already commented, so repeated
        # runs never stack additional '#' characters. Fields may be
        # separated by spaces or tabs.
        regexp: '^([ \t]*[^#\s].*[ \t]swap[ \t].*)$'
        replace: '#\1'
      tags: [storage, tuning]

    - name: Check Intel Thread Director support messages
      ansible.builtin.shell: "dmesg | grep -i 'Hardware Feedback Interface'"
      register: hfi_check
      # Informational only: grep exits non-zero when nothing matches.
      failed_when: false
      changed_when: false
      tags: [verify, laptop]

    # ========================================================================
    # PHASE 4: OLLAMA INSTALLATION
    # ========================================================================

    - name: Check if Ollama is already installed
      ansible.builtin.stat:
        path: /usr/local/bin/ollama
      register: ollama_binary
      tags: [ollama]

    - name: Download Ollama installation script
      ansible.builtin.get_url:
        url: https://ollama.ai/install.sh
        dest: /tmp/ollama-install.sh
        mode: '0755'
      when: not ollama_binary.stat.exists
      tags: [ollama]

    - name: Install Ollama
      ansible.builtin.command:
        cmd: /tmp/ollama-install.sh
        # Same guard as the stat check above; when the installer does run
        # the task is reported as "changed".
        creates: /usr/local/bin/ollama
      when: not ollama_binary.stat.exists
      tags: [ollama]

    - name: Create systemd override directory for Ollama
      ansible.builtin.file:
        path: /etc/systemd/system/ollama.service.d
        state: directory
        mode: '0755'
      tags: [ollama, network]

    - name: Configure Ollama to listen on all network interfaces
      ansible.builtin.copy:
        dest: /etc/systemd/system/ollama.service.d/override.conf
        content: |
          [Service]
          Environment="OLLAMA_HOST={{ ollama_host }}"
        mode: '0644'
      notify: Restart ollama
      tags: [ollama, network]

    - name: Ensure Ollama service is enabled and started
      ansible.builtin.systemd:
        name: ollama
        state: started
        enabled: true
        daemon_reload: true
      tags: [ollama]

    # Run the notified "Restart ollama" handler now, so a changed override
    # is in effect before the port check below. When nothing changed, no
    # restart happens and the play stays idempotent; no unconditional
    # restart task is needed here.
    - name: Apply pending Ollama handler changes before readiness check
      ansible.builtin.meta: flush_handlers
      tags: [ollama]

    - name: Wait for Ollama service to be ready
      ansible.builtin.wait_for:
        host: "{{ ansible_host }}"
        port: "{{ ollama_port }}"
        delay: 5
        timeout: 30
      tags: [ollama, verify]

    # ========================================================================
    # PHASE 5: NFS STORAGE MOUNTS (TODO)
    # ========================================================================
    # Instructions:
    # 1. Define NFS server variables in group_vars/ai_grid.yml:
    #    nfs_server: "10.0.0.249"
    #    nfs_export: "/volume1/ai-datasets"
    #    nfs_mount_point: "/mnt/ai-datasets"
    #
    # 2. Uncomment the tasks below and customize paths
    # ========================================================================

    # - name: Create NFS mount point directory
    #   ansible.builtin.file:
    #     path: "{{ nfs_mount_point }}"
    #     state: directory
    #     owner: "{{ ansible_user }}"
    #     group: "{{ ansible_user }}"
    #     mode: '0755'
    #   tags: [storage, nfs]
    #
    # - name: Mount NFS share for AI datasets
    #   ansible.posix.mount:
    #     path: "{{ nfs_mount_point }}"
    #     src: "{{ nfs_server }}:{{ nfs_export }}"
    #     fstype: nfs
    #     opts: defaults,nfsvers=4
    #     state: mounted
    #   tags: [storage, nfs]
    #
    # - name: Verify NFS mount is accessible
    #   ansible.builtin.command: "ls -la {{ nfs_mount_point }}"
    #   register: nfs_verify
    #   changed_when: false
    #   tags: [storage, nfs, verify]

    # ========================================================================
    # PHASE 6: POST-INSTALL VERIFICATION
    # ========================================================================

    - name: Check if system reboot is required
      ansible.builtin.stat:
        path: /var/run/reboot-required
      register: reboot_required
      tags: [verify, reboot]

    - name: Display reboot notification if needed
      ansible.builtin.debug:
        msg: |
          ╔════════════════════════════════════════════════════════════════╗
          ║ WARNING: System reboot is required to complete installation ║
          ║ Reason: Kernel or driver updates                               ║
          ║ Action: Please reboot this host manually                       ║
          ╚════════════════════════════════════════════════════════════════╝
      when: reboot_required.stat.exists
      tags: [verify, reboot]

    - name: Display bootstrap completion summary
      ansible.builtin.debug:
        msg:
          - "╔════════════════════════════════════════════════════════════════╗"
          - "║ AI Workstation Bootstrap Complete! ║"
          - "╠════════════════════════════════════════════════════════════════╣"
          - "║ ✓ System updated and essential utilities installed ║"
          - "║ ✓ NVIDIA drivers installed (verify with nvidia-smi) ║"
          - "║ ✓ Ollama installed and network-accessible ║"
          - "║ → Ollama API: http://{{ ansible_host }}:{{ ollama_port }} ║"
          - "╠════════════════════════════════════════════════════════════════╣"
          - "║ Next Steps: ║"
          - "║ 1. Reboot if required (check above) ║"
          - "║ 2. Pull models: ollama pull llama3.1:8b ║"
          - "║ 3. Configure NFS mounts (see Phase 5 in playbook) ║"
          - "╚════════════════════════════════════════════════════════════════╝"
      tags: [verify]

  # ==========================================================================
  # HANDLERS
  # ==========================================================================
  handlers:
    - name: Restart ollama
      ansible.builtin.systemd:
        name: ollama
        state: restarted
        # Pick up a changed override.conf before restarting.
        daemon_reload: true

    - name: Update Grub
      ansible.builtin.command: update-grub
      # Only runs when /etc/default/grub was modified, so the regenerated
      # boot configuration is a real change.
      changed_when: true

    - name: Restart Logind
      ansible.builtin.systemd:
        name: systemd-logind
        state: restarted