Adding CEL validation on the TrainingRuntime CRD
Signed-off-by: Akshay Chitneni <[email protected]>
Akshay Chitneni committed Feb 14, 2025
1 parent 3f3a8d3 commit b9ec5ff
Showing 12 changed files with 257 additions and 90 deletions.
@@ -61,6 +61,7 @@ spec:
format: int32
type: integer
runLauncherAsNode:
default: false
description: |-
Whether to run training process on the launcher Job.
Defaults to false.
@@ -585,14 +586,24 @@ spec:
type: integer
type: object
numProcPerNode:
default: auto
description: |-
Number of processes per node.
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
Supported values: `auto`, `cpu`, `gpu`, or int value.
Defaults to `auto`.
type: string
x-kubernetes-validations:
- message: NumProcPerNode must be equal to auto, cpu, gpu,
or int value
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
type: object
type: object
x-kubernetes-validations:
- message: numNodes should not be set if torch.elasticPolicy is configured
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
- message: Only one of the policy can be configured
rule: '!(has(self.torch) && has(self.mpi))'
podGroupPolicy:
description: Configuration for the PodGroup to enable gang-scheduling
via supported plugins.
@@ -602,6 +613,7 @@ spec:
for gang-scheduling.
properties:
scheduleTimeoutSeconds:
default: 60
description: |-
Time threshold to schedule PodGroup for gang-scheduling.
If the scheduling timeout is equal to 0, the default value is used.
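To make the new rules concrete, a hypothetical TrainingRuntime fragment like the one below would now be rejected at admission time, because it configures both the torch and mpi policies. This is only an illustrative sketch: the runtime name is a placeholder, the mlPolicy key is assumed from the MLPolicy type, and required fields such as the JobSet template are omitted.

apiVersion: trainer.kubeflow.org/v1alpha1
kind: TrainingRuntime
metadata:
  name: example-runtime   # placeholder name
spec:
  mlPolicy:
    numNodes: 2
    torch:
      numProcPerNode: auto
    mpi: {}   # rejected: "Only one of the policy can be configured"

Setting numNodes together with torch.elasticPolicy would likewise trip the other new rule ("numNodes should not be set if torch.elasticPolicy is configured").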
12 changes: 12 additions & 0 deletions manifests/base/crds/trainer.kubeflow.org_trainingruntimes.yaml
@@ -61,6 +61,7 @@ spec:
format: int32
type: integer
runLauncherAsNode:
default: false
description: |-
Whether to run training process on the launcher Job.
Defaults to false.
@@ -585,14 +586,24 @@ spec:
type: integer
type: object
numProcPerNode:
default: auto
description: |-
Number of processes per node.
This value is inserted into the `--nproc-per-node` argument of the `torchrun` CLI.
Supported values: `auto`, `cpu`, `gpu`, or int value.
Defaults to `auto`.
type: string
x-kubernetes-validations:
- message: NumProcPerNode must be equal to auto, cpu, gpu,
or int value
rule: self in ['auto', 'cpu', 'gpu'] || type(self) == int
type: object
type: object
x-kubernetes-validations:
- message: numNodes should not be set if torch.elasticPolicy is configured
rule: '!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))'
- message: Only one of the policy can be configured
rule: '!(has(self.torch) && has(self.mpi))'
podGroupPolicy:
description: Configuration for the PodGroup to enable gang-scheduling
via supported plugins.
@@ -602,6 +613,7 @@ spec:
for gang-scheduling.
properties:
scheduleTimeoutSeconds:
default: 60
description: |-
Time threshold to schedule PodGroup for gang-scheduling.
If the scheduling timeout is equal to 0, the default value is used.
6 changes: 6 additions & 0 deletions pkg/apis/trainer/v1alpha1/trainingruntime_types.go
@@ -142,10 +142,13 @@ type CoschedulingPodGroupPolicySource struct {
// Time threshold to schedule PodGroup for gang-scheduling.
// If the scheduling timeout is equal to 0, the default value is used.
// Defaults to 60 seconds.
// +kubebuilder:default=60
ScheduleTimeoutSeconds *int32 `json:"scheduleTimeoutSeconds,omitempty"`
}

// MLPolicy represents configuration for the model training with ML-specific parameters.
// +kubebuilder:validation:XValidation:rule="!(has(self.numNodes) && (has(self.torch) && has(self.torch.elasticPolicy)))", message="numNodes should not be set if torch.elasticPolicy is configured"
// +kubebuilder:validation:XValidation:rule="!(has(self.torch) && has(self.mpi))", message="Only one of the policy can be configured"
type MLPolicy struct {
// Number of training nodes.
// Defaults to 1.
@@ -173,6 +176,8 @@ type TorchMLPolicySource struct {
// Supported values: `auto`, `cpu`, `gpu`, or int value.
// TODO (andreyvelich): Add kubebuilder validation.
// Defaults to `auto`.
// +kubebuilder:default="auto"
// +kubebuilder:validation:XValidation:rule="self in ['auto', 'cpu', 'gpu'] || type(self) == int", message="NumProcPerNode must be equal to auto, cpu, gpu, or int value"
NumProcPerNode *string `json:"numProcPerNode,omitempty"`

// Elastic policy for the PyTorch training.
@@ -218,6 +223,7 @@ type MPIMLPolicySource struct {

// Whether to run training process on the launcher Job.
// Defaults to false.
// +kubebuilder:default=false
RunLauncherAsNode *bool `json:"runLauncherAsNode,omitempty"`
}

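Assuming standard structural-schema defaulting on the CRD, the new +kubebuilder:default markers mean that fields left unset should be persisted with the defaults filled in. A minimal sketch of the expected behavior (the coscheduling key is assumed from CoschedulingPodGroupPolicySource, and unrelated required fields are omitted):

# Submitted fragment:
spec:
  mlPolicy:
    torch: {}
  podGroupPolicy:
    coscheduling: {}

# Persisted after defaulting:
spec:
  mlPolicy:
    torch:
      numProcPerNode: auto          # from +kubebuilder:default="auto"
  podGroupPolicy:
    coscheduling:
      scheduleTimeoutSeconds: 60    # from +kubebuilder:default=60

An empty mpi policy would similarly come back with runLauncherAsNode: false.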
7 changes: 4 additions & 3 deletions pkg/runtime/core/trainingruntime_test.go
@@ -19,6 +19,7 @@ package core
import (
"context"
"fmt"
"k8s.io/utils/ptr"
"testing"

"github.com/google/go-cmp/cmp"
@@ -263,7 +264,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build JobSet with Torch values from the TrainJob": {
trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec(
testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec).
TorchPolicy(100, "auto").
TorchPolicy(100, ptr.To("auto")).
ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
Obj(),
).Obj(),
@@ -273,7 +274,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
Trainer(
testingutil.MakeTrainJobTrainerWrapper().
NumNodes(30).
NumProcPerNode("3").
NumProcPerNode(ptr.To("3")).
Obj(),
).
Obj(),
@@ -317,7 +318,7 @@ func TestTrainingRuntimeNewObjects(t *testing.T) {
"succeeded to build JobSet with Torch values from the Runtime and envs.": {
trainingRuntime: testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").RuntimeSpec(
testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "test-runtime").Spec).
TorchPolicy(100, "auto").
TorchPolicy(100, ptr.To("auto")).
ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
ContainerTrainerEnv(
[]corev1.EnvVar{
2 changes: 1 addition & 1 deletion pkg/runtime/framework/plugins/torch/torch.go
@@ -19,7 +19,6 @@ package torch
import (
"context"
"fmt"

corev1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/apimachinery/pkg/util/validation/field"
@@ -76,6 +75,7 @@ func (t *Torch) EnforceMLPolicy(info *runtime.Info, trainJob *trainer.TrainJob)
// TODO (andreyvelich): Add validation to check that TrainJob doesn't have "PET_" envs.
// TODO (andreyvelich): We should validate that envs from different plugins don't conflict with each other.
// Ref: https://github.com/kubeflow/trainer/pull/2308#discussion_r1823229940

infoEnvs := []corev1.EnvVar{
{
Name: constants.TorchEnvNumNodes,
8 changes: 4 additions & 4 deletions pkg/util/testing/wrapper.go
@@ -392,8 +392,8 @@ func (t *TrainJobTrainerWrapper) NumNodes(numNodes int32) *TrainJobTrainerWrapper {
return t
}

func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode string) *TrainJobTrainerWrapper {
t.Trainer.NumProcPerNode = &numProcPerNode
func (t *TrainJobTrainerWrapper) NumProcPerNode(numProcPerNode *string) *TrainJobTrainerWrapper {
t.Trainer.NumProcPerNode = numProcPerNode
return t
}

@@ -689,12 +689,12 @@ func (s *TrainingRuntimeSpecWrapper) NumNodes(numNodes int32) *TrainingRuntimeSpecWrapper {
return s
}

func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode string) *TrainingRuntimeSpecWrapper {
func (s *TrainingRuntimeSpecWrapper) TorchPolicy(numNodes int32, numProcPerNode *string) *TrainingRuntimeSpecWrapper {
s.MLPolicy = &trainer.MLPolicy{
NumNodes: &numNodes,
MLPolicySource: trainer.MLPolicySource{
Torch: &trainer.TorchMLPolicySource{
NumProcPerNode: &numProcPerNode,
NumProcPerNode: numProcPerNode,
},
},
}
3 changes: 3 additions & 0 deletions sdk/python/test-requirements.txt
@@ -0,0 +1,3 @@
pytest~=4.6.7 # needed for python 2.7+3.4
pytest-cov>=2.8.1
pytest-randomly==1.2.3 # needed for python 2.7+3.4
9 changes: 9 additions & 0 deletions sdk/python/tox.ini
@@ -0,0 +1,9 @@
[tox]
envlist = py27, py3

[testenv]
deps=-r{toxinidir}/requirements.txt
-r{toxinidir}/test-requirements.txt

commands=
pytest --cov=kubeflow.training
2 changes: 1 addition & 1 deletion test/integration/controller/trainjob_controller_test.go
@@ -278,7 +278,7 @@ var _ = ginkgo.Describe("TrainJob controller", ginkgo.Ordered, func() {
trainingRuntime = testingutil.MakeTrainingRuntimeWrapper(ns.Name, "alpha").
RuntimeSpec(
testingutil.MakeTrainingRuntimeSpecWrapper(testingutil.MakeTrainingRuntimeWrapper(metav1.NamespaceDefault, "alpha").Spec).
TorchPolicy(100, "auto").
TorchPolicy(100, ptr.To("auto")).
ContainerTrainer("test:runtime", []string{"runtime"}, []string{"runtime"}, resRequests).
Obj()).
Obj()
@@ -17,14 +17,13 @@ limitations under the License.
package webhooks

import (
trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
testingutil "github.com/kubeflow/trainer/pkg/util/testing"
"github.com/kubeflow/trainer/test/integration/framework"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

trainer "github.com/kubeflow/trainer/pkg/apis/trainer/v1alpha1"
testingutil "github.com/kubeflow/trainer/pkg/util/testing"
"github.com/kubeflow/trainer/test/integration/framework"
)

const clTrainingRuntimeName = "test-clustertrainingruntime"
77 changes: 0 additions & 77 deletions test/integration/webhooks/trainingruntime_test.go

This file was deleted.
