From 7f8cb7e95c80e56f6a3e8befc5ce94a1900d3ebc Mon Sep 17 00:00:00 2001
From: abhijeet-dhumal
Date: Wed, 5 Feb 2025 16:12:44 +0530
Subject: [PATCH] Skip GPU usage validation for kfto pytorch multi-node tests

---
 tests/kfto/kfto_mnist_training_test.go | 22 ----------------------
 1 file changed, 22 deletions(-)

diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go
index 186a288e..22122184 100644
--- a/tests/kfto/kfto_mnist_training_test.go
+++ b/tests/kfto/kfto_mnist_training_test.go
@@ -20,7 +20,6 @@ import (
 	"bytes"
 	"fmt"
 	"testing"
-	"time"
 
 	kftov1 "github.com/kubeflow/training-operator/pkg/apis/kubeflow.org/v1"
 	. "github.com/onsi/gomega"
@@ -83,27 +82,6 @@ func runKFTOPyTorchMnistJob(t *testing.T, accelerator Accelerator, image string,
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).
 		Should(WithTransform(PyTorchJobConditionRunning, Equal(corev1.ConditionTrue)))
 
-	// Verify GPU utilization
-	if IsOpenShift(test) && accelerator == NVIDIA {
-		trainingPods := GetPods(test, namespace.Name, metav1.ListOptions{LabelSelector: "training.kubeflow.org/job-name=" + tuningJob.GetName()})
-		test.Expect(trainingPods).To(HaveLen(workerReplicas + 1)) // +1 is a master node
-
-		for _, trainingPod := range trainingPods {
-			// Check that GPUs for training pods were utilized recently
-			test.Eventually(OpenShiftPrometheusGpuUtil(test, trainingPod, accelerator), 15*time.Minute).
-				Should(
-					And(
-						HaveLen(numProcPerNode),
-						ContainElement(
-							// Check that at least some GPU was utilized on more than 20%
-							HaveField("Value", BeNumerically(">", 20)),
-						),
-					),
-				)
-		}
-		test.T().Log("All GPUs were successfully utilized")
-	}
-
 	// Make sure the PyTorch job succeeded
 	test.Eventually(PyTorchJob(test, namespace.Name, tuningJob.Name), TestTimeoutDouble).Should(WithTransform(PyTorchJobConditionSucceeded, Equal(corev1.ConditionTrue)))
 	test.T().Logf("PytorchJob %s/%s ran successfully", tuningJob.Namespace, tuningJob.Name)