diff --git a/.tekton/training-rocm-push.yaml b/.tekton/training-rocm-push.yaml
index 38e06d9b..148cece9 100644
--- a/.tekton/training-rocm-push.yaml
+++ b/.tekton/training-rocm-push.yaml
@@ -24,9 +24,9 @@ spec:
     - name: revision
       value: '{{revision}}'
    - name: output-image
-      value: quay.io/modh/training:py311-rocm61-torch241
+      value: quay.io/modh/training:py311-rocm62-torch241
    - name: additional-tag
-      value: py311-rocm61-torch241-{{revision}}
+      value: py311-rocm62-torch241-{{revision}}
    - name: dockerfile
      value: Dockerfile
    - name: path-context
diff --git a/images/runtime/training/rocm/Dockerfile b/images/runtime/training/rocm/Dockerfile
index d6145ea5..722afdb3 100644
--- a/images/runtime/training/rocm/Dockerfile
+++ b/images/runtime/training/rocm/Dockerfile
@@ -1,15 +1,15 @@
 ## Global Args ######################################################
-ARG IMAGE_TAG=1-77.1729776556
+ARG IMAGE_TAG=9.5-1737537151
 ARG PYTHON_VERSION=311
 
 # use UBI9 latest
 FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base
 
-LABEL name="training:py311-rocm61-torch241" \
-      summary="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      io.k8s.display-name="ROCm 6.1 Python 3.11 PyTorch 2.4.1 base image for Training" \
-      io.k8s.description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+LABEL name="training:py311-rocm62-torch241" \
+      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      io.k8s.display-name="ROCm 6.2 Python 3.11 PyTorch 2.4.1 base image for Training" \
+      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
       authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads"
 
 # Copy license
@@ -25,8 +25,8 @@ RUN pip install --no-cache-dir --upgrade requests==2.32.3
 
 # Install ROCm
 WORKDIR /opt/app-root/bin
-ARG ROCM_VERSION=6.1.2
-ARG AMDGPU_VERSION=6.1.2
+ARG ROCM_VERSION=6.2.4
+ARG AMDGPU_VERSION=6.2.4
 
 RUN <<EOF
 cat <<EOD > /etc/yum.repos.d/rocm.repo
@@ -48,7 +48,7 @@ gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
 EOD
 EOF
 
-RUN dnf -y install rocm && dnf clean all && rm -rf /var/cache/dnf
+RUN dnf install -y rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf
 
 
 # Install Python packages
diff --git a/images/runtime/training/rocm/README.md b/images/runtime/training/rocm/README.md
index 8e41dea1..a59c85fe 100644
--- a/images/runtime/training/rocm/README.md
+++ b/images/runtime/training/rocm/README.md
@@ -5,5 +5,5 @@ ROCm enabled container image for Training in OpenShift AI.
 It includes the following layers:
 * UBI 9
 * Python 3.11
-* ROCm 6.1
+* ROCm 6.2
 * PyTorch 2.4.1
\ No newline at end of file
diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go
index 186a288e..22122184 100644
--- a/tests/kfto/kfto_mnist_training_test.go
+++ b/tests/kfto/kfto_mnist_training_test.go
@@ -20,7 +20,6 @@ import (
 	"bytes"
 	"fmt"
 	"testing"
-	"time"
 
 	kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
 	. "github.com/onsi/gomega"
@@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
-	// Verify GPU utilization
-	if IsOpenShift(test) && accelerator == NVIDIA {
-		trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
-		test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
-
-		for _, trainingPod := range trainingPods {
-			// Check that GPUs for training pods were utilized recently
-			test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
-				Should(
-					And(
-						HaveLen(numProcPerNode),
-						ContainElement(
-							// Check that at least some GPU was utilized on more than 20%
-							HaveField("Value", BeNumerically(">", 20)),
-						),
-					),
-				)
-		}
-		test.T().Log("All GPUs were successfully utilized")
-	}
-
 	// Make sure the PyTorch job succeeded
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)