From b21f0179e96ebd2ef6010ce37c6a3b58c3b2f175 Mon Sep 17 00:00:00 2001 From: Wyatt Hicken Date: Thu, 31 Oct 2024 13:40:40 -0600 Subject: [PATCH 1/4] feat(al2023): NVIDIA variant in isolated partitions - fetch deps --- .../al2023/provisioners/install-nvidia-driver.sh | 14 ++++++++++++++ templates/al2023/template.json | 2 ++ 2 files changed, 16 insertions(+) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index 9d9d8c8df..7e17cdd52 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -20,6 +20,16 @@ function is-isolated-partition() { return 0 } +function rpm_install() { + local RPMS=($@) + echo "pulling and installing rpms: (${RPMS[@]}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" + for RPM in ${RPMS[@]}; do + aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} + sudo yum localinstall -y ${WORKING_DIR}/${RPM} + done +} + + echo "Installing NVIDIA ${NVIDIA_DRIVER_MAJOR_VERSION} drivers..." 
################################################################################ @@ -37,6 +47,10 @@ if is-isolated-partition; then gpgcheck=0 gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo + # these are required in order to build kmod-nvidia-open-dkms, and would + # normally be available from epel but that isn't reachable in isolated partitions + rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm" + else if [[ $AWS_REGION == cn-* ]]; then DOMAIN="nvidia.cn" diff --git a/templates/al2023/template.json b/templates/al2023/template.json index 10e6ef64d..c22fd2645 100644 --- a/templates/al2023/template.json +++ b/templates/al2023/template.json @@ -237,6 +237,8 @@ "environment_vars": [ "AWS_REGION={{user `aws_region`}}", "ENABLE_ACCELERATOR={{user `enable_accelerator`}}", + "BINARY_BUCKET_NAME={{user `binary_bucket_name`}}", + "BINARY_BUCKET_REGION={{user `binary_bucket_region`}}", "NVIDIA_DRIVER_MAJOR_VERSION={{user `nvidia_driver_major_version`}}", "WORKING_DIR={{user `working_dir`}}" ] From 1122f5fc4026e43985cd0c47bc2712cefcf35131 Mon Sep 17 00:00:00 2001 From: Wyatt Hicken Date: Thu, 31 Oct 2024 14:16:59 -0600 Subject: [PATCH 2/4] Fetch local rpms for NVIDIA variant in isolated partition --- .../provisioners/install-nvidia-driver.sh | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index 7e17cdd52..60594976a 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -25,7 +25,18 @@ function rpm_install() { echo "pulling and installing rpms: (${RPMS[@]}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" for RPM in ${RPMS[@]}; do aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} 
${WORKING_DIR}/${RPM} - sudo yum localinstall -y ${WORKING_DIR}/${RPM} + sudo dnf localinstall -y ${WORKING_DIR}/${RPM} + done +} + +function patch-nvidia-container-toolkit(){ + # The order of these RPMs is important, as they have dependencies on each other + RPMS=("libnvidia-container1-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm" "libnvidia-container-tools-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-1.16.2-1.x86_64.rpm") + for RPM in ${RPMS[@]}; do + echo "pulling and installing rpms: (${RPM}) from s3 bucket: (${BINARY_BUCKET_NAME}) in region: (${BINARY_BUCKET_REGION})" + aws s3 cp --region ${BINARY_BUCKET_REGION} s3://${BINARY_BUCKET_NAME}/rpms/${RPM} ${WORKING_DIR}/${RPM} + echo "installing rpm: ${WORKING_DIR}/${RPM}" + sudo rpm -ivh ${WORKING_DIR}/${RPM} done } @@ -47,7 +58,7 @@ if is-isolated-partition; then gpgcheck=0 gpgkey=file:///etc/pki/rpm-gpg/RPM-GPG-KEY-amazon-linux-2023' | sudo tee /etc/yum.repos.d/amzn2023-nvidia.repo - # these are required in order to build kmod-nvidia-open-dkms, and would + # these are required in order to build kmod-nvidia-open-dkms and would # normally be available from epel but that isn't reachable in isolated partitions rpm_install "opencl-filesystem-1.0-5.el7.noarch.rpm" "ocl-icd-2.2.12-1.el7.x86_64.rpm" @@ -116,7 +127,14 @@ sudo systemctl enable set-nvidia-clocks.service ################################################################################ ### Install other dependencies ################################################# ################################################################################ -sudo dnf -y install nvidia-fabric-manager nvidia-container-toolkit +sudo dnf -y install nvidia-fabric-manager + +# NVIDIA Container toolkit needs to be locally installed for isolated partitions +if is-isolated-partition; then + patch-nvidia-container-toolkit +else + sudo dnf -y install nvidia-container-toolkit +fi sudo systemctl enable nvidia-fabricmanager sudo systemctl enable nvidia-persistenced From 
163317db24ab03f22557d70b8093cffc9ac0488d Mon Sep 17 00:00:00 2001 From: whoix <151130171+whoix@users.noreply.github.com> Date: Thu, 31 Oct 2024 14:53:55 -0600 Subject: [PATCH 3/4] Update templates/al2023/provisioners/install-nvidia-driver.sh Co-authored-by: Sichaow --- templates/al2023/provisioners/install-nvidia-driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index 60594976a..4535874d8 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -29,7 +29,7 @@ function rpm_install() { done } -function patch-nvidia-container-toolkit(){ +function install-nvidia-container-toolkit(){ # The order of these RPMs is important, as they have dependencies on each other RPMS=("libnvidia-container1-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-base-1.16.2-1.x86_64.rpm" "libnvidia-container-tools-1.16.2-1.x86_64.rpm" "nvidia-container-toolkit-1.16.2-1.x86_64.rpm") for RPM in ${RPMS[@]}; do From af11ade27cda387998d6cb11cd031e81dd9f4f55 Mon Sep 17 00:00:00 2001 From: Sichaow Date: Thu, 31 Oct 2024 13:56:29 -0700 Subject: [PATCH 4/4] Update templates/al2023/provisioners/install-nvidia-driver.sh --- templates/al2023/provisioners/install-nvidia-driver.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/templates/al2023/provisioners/install-nvidia-driver.sh b/templates/al2023/provisioners/install-nvidia-driver.sh index 4535874d8..acadafc8e 100755 --- a/templates/al2023/provisioners/install-nvidia-driver.sh +++ b/templates/al2023/provisioners/install-nvidia-driver.sh @@ -131,7 +131,7 @@ sudo dnf -y install nvidia-fabric-manager # NVIDIA Container toolkit needs to be locally installed for isolated partitions if is-isolated-partition; then - patch-nvidia-container-toolkit + install-nvidia-container-toolkit else sudo dnf -y install nvidia-container-toolkit fi