
Merge remote-tracking branch 'upstream/main'
ChughShilpa committed Feb 5, 2025
2 parents 9cf7cac + 92c8408 commit b519083
Showing 4 changed files with 12 additions and 34 deletions.
4 changes: 2 additions & 2 deletions .tekton/training-rocm-push.yaml
@@ -24,9 +24,9 @@ spec:
     - name: revision
       value: '{{revision}}'
     - name: output-image
-      value: quay.io/modh/training:py311-rocm61-torch241
+      value: quay.io/modh/training:py311-rocm62-torch241
     - name: additional-tag
-      value: py311-rocm61-torch241-{{revision}}
+      value: py311-rocm62-torch241-{{revision}}
     - name: dockerfile
       value: Dockerfile
     - name: path-context
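For orientation only: the parameters in the hunk above belong to a Tekton Pipelines-as-Code PipelineRun. A minimal, hypothetical sketch of the surrounding structure (everything other than the params shown in the diff is assumed, not taken from this file):

    apiVersion: tekton.dev/v1
    kind: PipelineRun
    metadata:
      name: training-rocm-push                        # assumed name, not shown in the diff
      annotations:
        pipelinesascode.tekton.dev/on-event: "[push]"  # assumed push trigger
    spec:
      params:
        - name: revision
          value: '{{revision}}'                        # resolved by Pipelines-as-Code
        - name: output-image
          value: quay.io/modh/training:py311-rocm62-torch241
        - name: additional-tag
          value: py311-rocm62-torch241-{{revision}}
        - name: dockerfile
          value: Dockerfile
      pipelineRef:
        name: docker-build                             # assumed pipeline name

The change itself only bumps the ROCm tag in the two image parameters from rocm61 to rocm62.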
18 changes: 9 additions & 9 deletions images/runtime/training/rocm/Dockerfile
@@ -1,15 +1,15 @@
 ## Global Args ######################################################
-ARG IMAGE_TAG=1-77.1729776556
+ARG IMAGE_TAG=9.5-1737537151
 ARG PYTHON_VERSION=311
 
 # use UBI9 latest
 FROM registry.access.redhat.com/ubi9/python-${PYTHON_VERSION}:${IMAGE_TAG} AS base
 
-LABEL name="training:py311-rocm61-torch241" \
-      summary="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
-      io.k8s.display-name="ROCm 6.1 Python 3.11 PyTorch 2.4.1 base image for Training" \
-      io.k8s.description="ROCm 6.1 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+LABEL name="training:py311-rocm62-torch241" \
+      summary="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
+      io.k8s.display-name="ROCm 6.2 Python 3.11 PyTorch 2.4.1 base image for Training" \
+      io.k8s.description="ROCm 6.2 Python 3.11 PyTorch 2.4.1 image based on UBI9 for Training" \
       authoritative-source-url="https://github.com/opendatahub-io/distributed-workloads"
 
 # Copy license
@@ -25,8 +25,8 @@ RUN pip install --no-cache-dir --upgrade requests==2.32.3
 # Install ROCm
 WORKDIR /opt/app-root/bin
 
-ARG ROCM_VERSION=6.1.2
-ARG AMDGPU_VERSION=6.1.2
+ARG ROCM_VERSION=6.2.4
+ARG AMDGPU_VERSION=6.2.4
 
 RUN <<EOF
 cat <<EOD > /etc/yum.repos.d/rocm.repo
@@ -48,7 +48,7 @@ gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key
 EOD
 EOF
 
-RUN dnf -y install rocm && dnf clean all && rm -rf /var/cache/dnf
+RUN dnf install -y rocm-developer-tools rocm-ml-sdk rocm-opencl-sdk rocm-openmp-sdk rocm-utils && dnf clean all && rm -rf /var/cache/dnf
 
 # Install Python packages
 
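The heredoc collapsed in the hunk above writes /etc/yum.repos.d/rocm.repo, which the RUN dnf install line then consumes. As a rough, hypothetical sketch only (the real repository definitions sit in the hidden portion of the diff; the baseurl paths below are assumptions based on the usual repo.radeon.com layout, with ROCM_VERSION and AMDGPU_VERSION expanded by the shell inside the unquoted heredoc), such a repo file typically looks like:

    # Hypothetical sketch of the generated rocm.repo; actual contents are collapsed in this diff.
    [ROCm]
    name=ROCm
    baseurl=https://repo.radeon.com/rocm/rhel9/${ROCM_VERSION}/main
    enabled=1
    gpgcheck=1
    gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key

    [amdgpu]
    name=amdgpu
    baseurl=https://repo.radeon.com/amdgpu/${AMDGPU_VERSION}/rhel/9.4/main/x86_64
    enabled=1
    gpgcheck=1
    gpgkey=https://repo.radeon.com/rocm/rocm.gpg.key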
2 changes: 1 addition & 1 deletion images/runtime/training/rocm/README.md
@@ -5,5 +5,5 @@ ROCm enabled container image for Training in OpenShift AI.
 It includes the following layers:
 * UBI 9
 * Python 3.11
-* ROCm 6.1
+* ROCm 6.2
 * PyTorch 2.4.1
22 changes: 0 additions & 22 deletions tests/kfto/kfto_mnist_training_test.go
@@ -20,7 +20,6 @@ import (
     "bytes"
     "fmt"
     "testing"
-    "time"
 
     kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
     . "github.com/onsi/gomega"
@@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
         Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
-    // Verify GPU utilization
-    if IsOpenShift(test) && accelerator == NVIDIA {
-        trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
-        test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
-
-        for _, trainingPod := range trainingPods {
-            // Check that GPUs for training pods were utilized recently
-            test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
-                Should(
-                    And(
-                        HaveLen(numProcPerNode),
-                        ContainElement(
-                            // Check that at least some GPU was utilized on more than 20%
-                            HaveField("Value", BeNumerically(">", 20)),
-                        ),
-                    ),
-                )
-        }
-        test.T().Log("All GPUs were successfully utilized")
-    }
-
     // Make sure the PyTorch job succeeded
     test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
     test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
