From a439296a6fa3262eb210556d86f1c66d02032e81 Mon Sep 17 00:00:00 2001 From: abhijeet-dhumal Date: Mon, 6 Jan 2025 17:47:30 +0530 Subject: [PATCH] Add tests to run KFTO PyTorch MNIST training using multi-node/multi-GPU use cases --- tests/kfto/kfto_mnist_training_test.go | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tests/kfto/kfto_mnist_training_test.go b/tests/kfto/kfto_mnist_training_test.go index e11575ac..b1043e8f 100644 --- a/tests/kfto/kfto_mnist_training_test.go +++ b/tests/kfto/kfto_mnist_training_test.go @@ -30,17 +30,26 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -func TestPyTorchJobMnistCpu(t *testing.T) { +func TestPyTorchJobMnistMultiNodeCpu(t *testing.T) { runKFTOPyTorchMnistJob(t, 0, 2, "", GetCudaTrainingImage(), "resources/requirements.txt") } -func TestPyTorchJobMnistWithCuda(t *testing.T) { + +func TestPyTorchJobMnistMultiNodeWithCuda(t *testing.T) { runKFTOPyTorchMnistJob(t, 1, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt") } -func TestPyTorchJobMnistWithROCm(t *testing.T) { +func TestPyTorchJobMnistMultiNodeWithROCm(t *testing.T) { runKFTOPyTorchMnistJob(t, 1, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt") } +func TestPyTorchJobMnistMultiNodeMultiGpuWithCuda(t *testing.T) { + runKFTOPyTorchMnistJob(t, 2, 1, "nvidia.com/gpu", GetCudaTrainingImage(), "resources/requirements.txt") +} + +func TestPyTorchJobMnistMultiNodeMultiGpuWithROCm(t *testing.T) { + runKFTOPyTorchMnistJob(t, 2, 1, "amd.com/gpu", GetROCmTrainingImage(), "resources/requirements-rocm.txt") +} + func runKFTOPyTorchMnistJob(t *testing.T, numGpus int, workerReplicas int, gpuLabel string, image string, requirementsFile string) { test := With(t)