Skip to content

Commit

Permalink
Skip GPU usage validation for kfto pytorch multi-node tests
Browse files (browse the repository at this point in the history)
  • Loading branch information
abhijeet-dhumal authored and openshift-merge-bot[bot] committed Feb 5, 2025
1 parent 5b1258f commit 7f8cb7e
Showing 1 changed file with 0 additions and 22 deletions.
22 changes: 0 additions & 22 deletions tests/kfto/kfto_mnist_training_test.go
Original file line number | Diff line number | Diff line change
Expand Up @@ -20,7 +20,6 @@ import (
"bytes"
"fmt"
"testing"
"time"

kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
. "github.com/onsi/gomega"
Expand Down Expand Up @@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))

// Verify GPU utilization
if IsOpenShift(test) && accelerator == NVIDIA {
trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node

for _, trainingPod := range trainingPods {
// Check that GPUs for training pods were utilized recently
test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
Should(
And(
HaveLen(numProcPerNode),
ContainElement(
// Check that at least some GPU was utilized on more than 20%
HaveField("Value", BeNumerically(">", 20)),
),
),
)
}
test.T().Log("All GPUs were successfully utilized")
}

// Make sure the PyTorch job succeeded
test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)
Expand Down

0 comments on commit 7f8cb7e

Please sign in to comment.