From 72f9ec2f668959a806c0cfe357be9ac50d94ec62 Mon Sep 17 00:00:00 2001
From: whoix <151130171+whoix@users.noreply.github.com>
Date: Fri, 1 Nov 2024 16:01:54 -0600
Subject: [PATCH] feat(al2023): Bug fix - Fix repo for Nvidia AL2023 repo, install Nvidia CTK for isolated partitions (#2033)

* Bug fix - Fix repo formatting errors for Nvidia AL2023 repo, install Nvidia ctk

* Fix lint errors

---------

Co-authored-by: Wyatt Hicken
---
Notes (below the "---" marker, so not part of the commit message):

  rpm_install <rpm-file>...
      Copies each named file from s3://${BINARY_BUCKET_NAME}/rpms/ into
      ${WORKING_DIR} and installs it with `dnf localinstall -y`, in
      argument order.

  install-nvidia-container-toolkit
      Downloads the four 1.16.2-1 x86_64 container-toolkit RPMs from the
      same bucket path and installs each with `rpm -ivh`, in the listed
      order.

  Expansions on the added lines are double-quoted and the RPM/RPMS
  variables are function-local; the post-image blob id on the "index"
  line predates that change.

 .../provisioners/install-nvidia-driver.sh     | 42 ++++++++++++++-----
 1 file changed, 32 insertions(+), 10 deletions(-)

diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh
index 9d9d8c8df..a27c20103 100755
--- a/templates/al2023/provisioners/install-nvidia-driver.sh
+++ b/templates/al2023/provisioners/install-nvidia-driver.sh
@@ -20,6 +20,26 @@ function is-isolated-partition() {
   return 0
 }
 
+function rpm_install() { # usage: rpm_install <rpm-file>...
+  local RPM RPMS=("$@")
+  echo "Pulling and installing local rpms from s3 bucket"
+  for RPM in "${RPMS[@]}"; do
+    aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/rpms/${RPM}" "${WORKING_DIR}/${RPM}"
+    sudo dnf localinstall -y "${WORKING_DIR}/${RPM}"
+  done
+}
+
+function install-nvidia-container-toolkit() {
+  # The order of these RPMs is important, as they have dependencies on each other
+  local RPM RPMS=("libnvidia-container1-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm" "libnvidia-container-tools-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-1.16.2-1.x86_64.rpm")
+  for RPM in "${RPMS[@]}"; do
+    echo "pulling and installing rpms: (${RPM}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})"
+    aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/rpms/${RPM}" "${WORKING_DIR}/${RPM}"
+    echo "installing rpm: ${WORKING_DIR}/${RPM}"
+    sudo rpm -ivh "${WORKING_DIR}/${RPM}"
+  done
+}
+
 echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."
 
 ################################################################################
@@ -27,15 +47,10 @@ echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."
 ################################################################################
 # Determine the domain based on the region
 if is-isolated-partition; then
-  echo '[amzn2023-nvidia]
-  name=Amazon Linux 2023 Nvidia repository
-  mirrorlist=https://al2023-repos-$awsregion-de612dc2.s3.$awsregion.$awsdomain/nvidia/mirrors/$releasever/$basearch/mirror.list
-  priority=20
-  enabled=1
-  repo_gpgcheck=0
-  type=rpm
-  gpgcheck=0
-  gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo
+  aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/amzn2023-nvidia.repo" "${WORKING_DIR}/amzn2023-nvidia.repo"
+
+  sudo dnf config-manager --add-repo "${WORKING_DIR}/amzn2023-nvidia.repo"
+  rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm"
 
 else
   if [[ $AWS_REGION == cn-* ]]; then
@@ -102,7 +117,14 @@ sudo systemctl enable set-nvidia-clocks.service
 ################################################################################
 ### Install other dependencies #################################################
 ################################################################################
-sudo dnf -y install nvidia-fabric-manager nvidia-container-toolkit
+sudo dnf -y install nvidia-fabric-manager
+
+# NVIDIA Container toolkit needs to be locally installed for isolated partitions
+if is-isolated-partition; then
+  install-nvidia-container-toolkit
+else
+  sudo dnf -y install nvidia-container-toolkit
+fi
 
 sudo systemctl enable nvidia-fabricmanager
 sudo systemctl enable nvidia-persistenced