
Merge remote-tracking branch 'upstream/main'
ChughShilpa committed Feb 5, 2025
2 parents 9cf7cac + 92c8408 commit b519083
Showing 4 changed files with 12 additions and 34 deletions.
4 changes: 2 additions & 2 deletions .tekton/training-rocm-push.yaml
@@ -24,9 +24,9 @@ spec:
     - name: revision
       value: '{{revision}}'
     - name: output-image
-      value: quay.io/modh/training:py311-rocm61-torch241
+      value: quay.io/modh/training:py311-rocm62-torch241
     - name: additional-tag
-      value: py311-rocm61-torch241-{{revision}}
+      value: py311-rocm62-torch241-{{revision}}
     - name: dockerfile
       value: Dockerfile
     - name: path-context
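For orientation only: the parameters in the hunk above belong to a Tekton Pipelines-as-Code PipelineRun. A minimal, hypothetical sketch of the surrounding structure (everything other than the params shown in the diff is assumed, not taken from this file):

    apiVersion: tekton.dev/v1
    kind: PipelineRun
    metadata:
      name: training-rocm-push                        # assumed name, not shown in the diff
      annotations:
        pipelinesascode.tekton.dev/on-event: "[push]"  # assumed push trigger
    spec:
      params:
        - name: revision
          value: '{{revision}}'                        # resolved by Pipelines-as-Code
        - name: output-image
          value: quay.io/modh/training:py311-rocm62-torch241
        - name: additional-tag
          value: py311-rocm62-torch241-{{revision}}
        - name: dockerfile
          value: Dockerfile
      pipelineRef:
        name: docker-build                             # assumed pipeline name

The change itself only bumps the ROCm tag in the two image parameters from rocm61 to rocm62.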
18 changes: 9 additions & 9 deletions images/runtime/training/rocm/Dockerfile
@@ -1,15 +1,15 @@
 ## Global Args ######################################################
-ARG IMAGE_TAG=1-77.1729776556
+ARG IMAGE_TAG=9.5-1737537151
 ARG PYTHON_VERSION=311
 
 # use UBI9 latest
 FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base
 
-LABEL name="training:py311-rocm61-torch241" \
-      summary="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      io.k8s.display-name="ROCm 6.1 Python 3.11 PyTorch 2.4.1 base image for Training" \
-      io.k8s.description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+LABEL name="training:py311-rocm62-torch241" \
+      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      io.k8s.display-name="ROCm 6.2 Python 3.11 PyTorch 2.4.1 base image for Training" \
+      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
       authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads"
 
 # Copy license
@@ -25,8 +25,8 @@ RUN pip install --no-cache-dir --upgrade requests==2.32.3
 # Install ROCm
 WORKDIR /opt/app-root/bin
 
-ARG ROCM_VERSION=6.1.2
-ARG AMDGPU_VERSION=6.1.2
+ARG ROCM_VERSION=6.2.4
+ARG AMDGPU_VERSION=6.2.4
 
 RUN <<EOF
 cat <<EOD > /etc/yum.repos.d/rocm.repo
@@ -48,7 +48,7 @@ gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
 EOD
 EOF
 
-RUN dnf -y install rocm && dnf clean all && rm -rf /var/cache/dnf
+RUN dnf install -y rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf
 
 # Install Python packages
 
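The heredoc collapsed in the hunk above writes /etc/yum.repos.d/rocm.repo, which the RUN dnf install line then consumes. As a rough, hypothetical sketch only (the real repository definitions sit in the hidden portion of the diff; the baseurl paths below are assumptions based on the usual repo.radeon.com layout, with ROCM_VERSION and AMDGPU_VERSION expanded by the shell inside the unquoted heredoc), such a repo file typically looks like:

    # Hypothetical sketch of the generated rocm.repo; actual contents are collapsed in this diff.
    [ROCm]
    name=ROCm
    baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main
    enabled=1
    gpgcheck=1
    gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

    [amdgpu]
    name=amdgpu
    baseurl=https://repo.radeon.com/amdgpu/${AMDGPU_VERSION}/rhel/9.4/main/x86_64
    enabled=1
    gpgcheck=1
    gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key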
2 changes: 1 addition & 1 deletion images/runtime/training/rocm/README.md
@@ -5,5 +5,5 @@ ROCm enabled container image for Training in OpenShift AI.
 It includes the following layers:
 * UBI 9
 * Python 3.11
-* ROCm 6.1
+* ROCm 6.2
 * PyTorch 2.4.1
22 changes: 0 additions & 22 deletions tests/kfto/kfto_mnist_training_test.go
@@ -20,7 +20,6 @@ import (
     "bytes"
     "fmt"
     "testing"
-    "time"
 
     kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
     . "github.com/onsi/gomega"
@@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
         Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
-    // Verify GPU utilization
-    if IsOpenShift(test) && accelerator == NVIDIA {
-        trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
-        test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
-
-        for _, trainingPod := range trainingPods {
-            // Check that GPUs for training pods were utilized recently
-            test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
-                Should(
-                    And(
-                        HaveLen(numProcPerNode),
-                        ContainElement(
-                            // Check that at least some GPU was utilized on more than 20%
-                            HaveField("Value", BeNumerically(">", 20)),
-                        ),
-                    ),
-                )
-        }
-        test.T().Log("All GPUs were successfully utilized")
-    }
-
     // Make sure the PyTorch job succeeded
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
     test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
