Skip to content

Commit

Permalink
feat(al2023): Bug fix - Fix repo for Nvidia AL2023 repo, install Nvid…
Browse files Browse the repository at this point in the history
…ia CTK for isolated partitions (#2033)

* Bug fix - Fix repo formatting errors for Nvidia AL2023 repo, install Nvidia ctk

* Fix lint errors

---------

Co-authored-by: Wyatt Hicken <[email protected]>
  • Loading branch information
whoix and Wyatt Hicken authored Nov 1, 2024
1 parent c63ff1c commit 72f9ec2
Showing 1 changed file with 32 additions and 10 deletions.
42 changes: 32 additions & 10 deletions templates/al2023/provisioners/install-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,37 @@ function is-isolated-partition() {
return 0
}

#######################################
# Download RPMs from the binary S3 bucket and install each one with dnf.
# Globals:   BINARY_BUCKET_REGION, BINARY_BUCKET_NAME, WORKING_DIR (read)
# Arguments: one or more RPM file names found under rpms/ in the bucket;
#            they are processed in the order given
# Returns:   0 on success, otherwise the status of the first failing command
#######################################
function rpm_install() {
  # "$@" is quoted so that a file name is never word-split or glob-expanded.
  local RPMS=("$@")
  local RPM
  echo "Pulling and installing local rpms from s3 bucket"
  for RPM in "${RPMS[@]}"; do
    # Stop at the first failure instead of trying to install a missing file.
    aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/rpms/${RPM}" "${WORKING_DIR}/${RPM}" || return
    sudo dnf localinstall -y "${WORKING_DIR}/${RPM}" || return
  done
}

#######################################
# Download the NVIDIA container toolkit RPMs (version 1.16.2-1, x86_64) from
# the binary S3 bucket and install them one at a time with rpm.
# Globals:   BINARY_BUCKET_REGION, BINARY_BUCKET_NAME, WORKING_DIR (read)
# Arguments: none
# Returns:   0 on success, otherwise the status of the first failing command
#######################################
function install-nvidia-container-toolkit() {
  # The order of these RPMs is important, as they have dependencies on each other
  # Declared local so the caller's RPMS/RPM variables are not overwritten.
  local RPMS=(
    "libnvidia-container1-1.16.2-1.x86_64.rpm"
    "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm"
    "libnvidia-container-tools-1.16.2-1.x86_64.rpm"
    "nvidia-container-toolkit-1.16.2-1.x86_64.rpm"
  )
  local RPM
  for RPM in "${RPMS[@]}"; do
    echo "pulling and installing rpms: (${RPM}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})"
    # Later packages depend on earlier ones, so stop at the first failure.
    aws s3 cp --region "${BINARY_BUCKET_REGION}" "s3://${BINARY_BUCKET_NAME}/rpms/${RPM}" "${WORKING_DIR}/${RPM}" || return
    echo "installing rpm: ${WORKING_DIR}/${RPM}"
    sudo rpm -ivh "${WORKING_DIR}/${RPM}" || return
  done
}

echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."

################################################################################
### Add repository #############################################################
################################################################################
# Determine the domain based on the region
if is-isolated-partition; then
echo '[amzn2023-nvidia]
name=Amazon Linux 2023 Nvidia repository
mirrorlist=https://al2023-repos-$awsregion-de612dc2.s3.$awsregion.$awsdomain/nvidia/mirrors/$releasever/$basearch/mirror.list
priority=20
enabled=1
repo_gpgcheck=0
type=rpm
gpgcheck=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo
aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/amzn2023-nvidia.repo ${WORKING_DIR}/amzn2023-nvidia.repo

sudo dnf config-manager --add-repo ${WORKING_DIR}/amzn2023-nvidia.repo
rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm"

else
if [[ $AWS_REGION == cn-* ]]; then
Expand Down Expand Up @@ -102,7 +117,14 @@ sudo systemctl enable set-nvidia-clocks.service
################################################################################
### Install other dependencies #################################################
################################################################################
# NOTE(review): the next two lines both install nvidia-fabric-manager. The
# first one also installs nvidia-container-toolkit unconditionally and looks
# like the pre-change side of the diff — confirm only the second is kept.
sudo dnf -y install nvidia-fabric-manager nvidia-container-toolkit
sudo dnf -y install nvidia-fabric-manager

# NVIDIA Container toolkit needs to be locally installed for isolated partitions
# (install-nvidia-container-toolkit copies the RPMs from S3 and installs them with rpm)
if is-isolated-partition; then
install-nvidia-container-toolkit
else
# Otherwise install the toolkit through dnf.
sudo dnf -y install nvidia-container-toolkit
fi

# Enable the fabric manager and persistence daemon units.
sudo systemctl enable nvidia-fabricmanager
sudo systemctl enable nvidia-persistenced

0 comments on commit 72f9ec2

Please sign in to comment.