Various fixes and optimizations to the k3s deployment (#140)
Fixes a bunch of issues we've seen doing installs on different
kinds of systems:

1. Include poetry.lock to keep dependency versions consistent.
2. Install curl and jq if necessary.
3. Use the HelmChart CRD to install the NVIDIA GPU Operator so we don't
   have to install and configure Helm.
4. Include `nvidia-utils` in the NVIDIA driver install path, since it's
   required for the operator to initialize.
5. Be more patient waiting for the GPU capacity to be recognized by the
   node (a quick manual check is sketched after this list).
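
For reference, the capacity mentioned in item 5 can also be checked by hand. The following is only a sketch, not part of this change, and assumes a single-node k3s cluster like the one these scripts set up:

# Print the node's advertised nvidia.com/gpu capacity; empty output means no GPU has been registered yet.
NODE=$(k3s kubectl get nodes -o name)
k3s kubectl get "$NODE" -o jsonpath='{.status.capacity.nvidia\.com/gpu}'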
tomfaulhaber authored Nov 21, 2024
1 parent 8265846 commit befdcbb
Showing 5 changed files with 2,989 additions and 28 deletions.
3 changes: 0 additions & 3 deletions .gitignore
@@ -1,6 +1,3 @@
# Don't check in the exact python library versions.
poetry.lock

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
23 changes: 23 additions & 0 deletions deploy/bin/helm-nvidia-operator.yaml
@@ -0,0 +1,23 @@
apiVersion: v1
kind: Namespace
metadata:
  name: gpu-operator
---
apiVersion: helm.cattle.io/v1
kind: HelmChart
metadata:
  name: nvidia-gpu-operator
  namespace: kube-system
  annotations:
    helm.cattle.io/helm-controller: "true"
spec:
  repo: https://nvidia.github.io/gpu-operator
  chart: gpu-operator
  targetNamespace: gpu-operator
  bootstrap: true # Add this to ensure it's processed during bootstrap
  # https://github.com/NVIDIA/gpu-operator/blob/main/deployments/gpu-operator/values.yaml
  valuesContent: |-
    driver:
      enabled: false # Disable NVIDIA driver installation since we have it pre-installed
    toolkit:
      enabled: false # Disable NVIDIA Container Toolkit installation since we have it pre-installed
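
One way to confirm that k3s's bundled helm-controller has processed this manifest is to query the HelmChart object and the operator pods it should create. This is a hedged sketch rather than part of the change; the resource and namespace names come from the manifest above:

k3s kubectl -n kube-system get helmchart nvidia-gpu-operator
k3s kubectl -n gpu-operator get pods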
57 changes: 33 additions & 24 deletions deploy/bin/install-k3s-nvidia.sh
@@ -17,7 +17,7 @@ check_nvidia_drivers_and_container_runtime() {

    if ! command -v nvidia-smi &> /dev/null; then
        echo "NVIDIA drivers are not installed (nvidia-smi not found). Installing..."
        sudo apt update && sudo apt install -y "nvidia-headless-$NVIDIA_VERSION-server"
        sudo apt update && sudo apt install -y "nvidia-headless-$NVIDIA_VERSION-server" "nvidia-utils-$NVIDIA_VERSION-server"
    else
        echo "NVIDIA drivers for version $NVIDIA_VERSION are installed."
    fi
@@ -28,6 +28,11 @@ check_nvidia_drivers_and_container_runtime() {
    # Get distribution information
    DISTRIBUTION=$(. /etc/os-release; echo "$ID$VERSION_ID")

    if ! command -v curl &> /dev/null; then
        echo "Installing curl to retrieve NVIDIA repository info"
        sudo apt update -y && sudo apt install -y curl
    fi

    # Add NVIDIA Docker repository
    echo "Adding NVIDIA Docker repository..."
    curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
@@ -39,13 +44,6 @@
    fi
}

# Install Helm if it's not available since the Nvidia operator comes packaged as a Helm chart.
if ! command -v helm &> /dev/null
then
echo "Helm not found, installing Helm..."
curl https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 | bash
fi

K="k3s kubectl"
SCRIPT_DIR=$(dirname "$0")

@@ -54,27 +52,38 @@ check_nvidia_drivers_and_container_runtime
# Install k3s using our standard script
$SCRIPT_DIR/install-k3s.sh

# Add the NVIDIA GPU Operator Helm repository
helm repo add nvidia https://nvidia.github.io/gpu-operator
helm repo update

# Get the latest version of the GPU Operator (the second field on the second line of the search result)
LATEST_VERSION=$(helm search repo nvidia/gpu-operator --devel --versions | awk 'NR == 2 {print $2}')
$K apply -f ${SCRIPT_DIR}/helm-nvidia-operator.yaml

# Install the GPU Operator using Helm
echo "Installing NVIDIA GPU Operator version $LATEST_VERSION..."
helm install \
    --wait \
    --generate-name \
    -n gpu-operator \
    --create-namespace \
    --version "$LATEST_VERSION" \
    nvidia/gpu-operator

echo "NVIDIA GPU Operator installation completed."

# Verify that we actually added GPU capacity to the node
capacity=$($K get $($K get nodes -o name) -o=jsonpath='{.status.capacity.nvidia\.com/gpu}')
capacity=0
elapsed=0
timeout=120

set +x # Don't echo the commands while we poll in the loop

echo
echo "Waiting up to two minutes for the GPU capacity to come online:"

while [ "$elapsed" -lt "$timeout" ]; do
# Run the command and capture its output
capacity=$($K get $($K get nodes -o name) -o=jsonpath='{.status.capacity.nvidia\.com/gpu}')

# Check if the command output is non-zero
if [ -n "$capacity" ] && [ "$capacity" -ne 0 ]; then
break
fi

echo -n "."

# Wait for 1 second
sleep 1
((elapsed++)) || true # Increment elapsed time (returns a non-zero code??)
done

echo
if [ "$capacity" = "1" ]; then
echo "GPU capacity successfully added"
else
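
Because install-k3s.sh now installs jq (see the next file), the capacity check above could also be written with jq. This is only a sketch under that assumption, not part of the change; it sums nvidia.com/gpu capacity across all nodes, treating nodes without the resource as zero:

k3s kubectl get nodes -o json \
    | jq '[.items[] | .status.capacity["nvidia.com/gpu"] // "0" | tonumber] | add'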
2 changes: 1 addition & 1 deletion deploy/bin/install-k3s.sh
@@ -6,7 +6,7 @@ set -ex
K="k3s kubectl"

# Update system
sudo apt update && sudo apt upgrade -y
sudo apt update && sudo apt upgrade -y && sudo apt install -y jq curl

# Check cgroup setup
./check_cgroup.sh
