Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(al2023): Bug fix - Fix repo for Nvidia AL2023 repo, install Nvidia CTK for isolated partitions #2033

Merged
merged 2 commits into from
Nov 1, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 32 additions & 10 deletions templates/al2023/provisioners/install-nvidia-driver.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,22 +20,37 @@ function is-isolated-partition() {
return 0
}

#######################################
# Download RPMs from the binary S3 bucket and install each one with dnf.
# Globals:   BINARY_BUCKET_REGION, BINARY_BUCKET_NAME, WORKING_DIR (read)
# Arguments: RPM file names found under rpms/ in the bucket, in install order
# Outputs:   progress message on stdout
# Returns:   0 on success; otherwise the status of the first failing command
#######################################
function rpm_install() {
  local rpm
  echo "Pulling and installing local rpms from s3 bucket"
  # Iterating over "$@" keeps every file name a single word (no word-splitting
  # or glob expansion) and is safe when no arguments are given.
  for rpm in "$@"; do
    # Stop at the first failure so a missing download is never handed to dnf.
    aws s3 cp --region "${BINARY_BUCKET_REGION}" \
      "s3://${BINARY_BUCKET_NAME}/rpms/${rpm}" "${WORKING_DIR}/${rpm}" || return
    sudo dnf localinstall -y "${WORKING_DIR}/${rpm}" || return
  done
}

#######################################
# Install the NVIDIA container toolkit from RPMs stored in the binary S3
# bucket, using rpm directly rather than a dnf repository.
# Globals:   BINARY_BUCKET_REGION, BINARY_BUCKET_NAME, WORKING_DIR (read)
# Arguments: none
# Outputs:   progress messages on stdout
# Returns:   0 on success; otherwise the status of the first failing command
#######################################
function install-nvidia-container-toolkit() {
  # The order of these RPMs is important, as they have dependencies on each other
  local -r rpms=(
    "libnvidia-container1-1.16.2-1.x86_64.rpm"
    "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm"
    "libnvidia-container-tools-1.16.2-1.x86_64.rpm"
    "nvidia-container-toolkit-1.16.2-1.x86_64.rpm"
  )
  local rpm
  for rpm in "${rpms[@]}"; do
    echo "pulling and installing rpms: (${rpm}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})"
    # Later packages depend on earlier ones, so stop at the first failure
    # instead of attempting installs that cannot succeed.
    aws s3 cp --region "${BINARY_BUCKET_REGION}" \
      "s3://${BINARY_BUCKET_NAME}/rpms/${rpm}" "${WORKING_DIR}/${rpm}" || return
    echo "installing rpm: ${WORKING_DIR}/${rpm}"
    sudo rpm -ivh "${WORKING_DIR}/${rpm}" || return
  done
}

# Progress message. NVIDIA_DRIVER_MAJOR_VERSION is not assigned in the visible
# part of this script — presumably supplied by the build environment; confirm.
echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..."

################################################################################
### Add repository #############################################################
################################################################################
# Determine the domain based on the region
if is-isolated-partition; then
echo '[amzn2023-nvidia]
name=Amazon Linux 2023 Nvidia repository
mirrorlist=https://al2023-repos-$awsregion-de612dc2.s3.$awsregion.$awsdomain/nvidia/mirrors/$releasever/$basearch/mirror.list
priority=20
enabled=1
repo_gpgcheck=0
type=rpm
gpgcheck=0
gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo
aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/amzn2023-nvidia.repo ${WORKING_DIR}/amzn2023-nvidia.repo
Issacwww marked this conversation as resolved.
Show resolved Hide resolved

sudo dnf config-manager --add-repo ${WORKING_DIR}/amzn2023-nvidia.repo
rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm"

else
if [[ $AWS_REGION == cn-* ]]; then
Expand Down Expand Up @@ -102,7 +117,14 @@ sudo systemctl enable set-nvidia-clocks.service
################################################################################
### Install other dependencies #################################################
################################################################################
# nvidia-fabric-manager is installed through dnf regardless of partition.
# nvidia-container-toolkit is deliberately NOT part of this unconditional
# install: it is handled by the partition-specific branch below.
sudo dnf -y install nvidia-fabric-manager

# NVIDIA Container toolkit needs to be locally installed for isolated partitions
if is-isolated-partition; then
  install-nvidia-container-toolkit
else
  sudo dnf -y install nvidia-container-toolkit
fi

# Enable the fabric manager and persistence daemon units.
sudo systemctl enable nvidia-fabricmanager
sudo systemctl enable nvidia-persistenced
Loading