Skip to content

Commit

Permalink
Active Checks controller: nccl-tests crd
Browse files Browse the repository at this point in the history
  • Loading branch information
asteny committed Mar 7, 2025
1 parent 09fc747 commit b396e99
Show file tree
Hide file tree
Showing 5 changed files with 351 additions and 13 deletions.
65 changes: 61 additions & 4 deletions api/v1alpha1/activecheck_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,16 +17,73 @@ limitations under the License.
package v1alpha1

import (
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// ActiveCheckSpec defines the desired state of ActiveCheck.
type ActiveCheckSpec struct {
	// Important: run "make" to regenerate code after modifying this file.
	// (Kept as a detached comment so it is not merged into the doc comment of
	// the first field.)

	// Schedule defines the CronJob schedule.
	// By default, every year - at 00:00 on day-of-month 1 in January
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="0 0 1 1 *"
	Schedule string `json:"schedule,omitempty"`

	// NOTE(review): the default below is quoted on purpose. Suspend is a
	// string field, so its default has to be a string as well; an unquoted
	// boolean default does not match the field's string type. Consider
	// migrating the field to *bool in a future API version.

	// Suspend indicates whether the action is suspended.
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="true"
	Suspend string `json:"suspend,omitempty"`

	// JobType defines the type of a job.
	// +kubebuilder:validation:Enum="one_shot";"periodic"
	// +kubebuilder:default="one_shot"
	JobType string `json:"jobType,omitempty"`

	// Image defines the container image.
	// +kubebuilder:validation:Required
	Image string `json:"image"`

	// ImagePullPolicy defines the image pull policy.
	// +kubebuilder:validation:Enum=Always;Never;IfNotPresent
	// +kubebuilder:validation:Optional
	// +kubebuilder:default="IfNotPresent"
	ImagePullPolicy corev1.PullPolicy `json:"imagePullPolicy,omitempty"`

	// SpecNCCLTests defines settings for the specific type of the check.
	SpecNCCLTests SpecNCCLTests `json:"specNCCLTests"`

	// Reactions defines the reaction to a specific check.
	Reactions Reactions `json:"reactions"`
}

// Reactions defines the reaction to a specific check.
//
// All JSON field names are lowerCamelCase, matching the other API types in
// this file. Regenerate the CRD manifests ("make") after changing the tags.
type Reactions struct {
	// SetCondition enables setting a condition on the k8s node.
	SetCondition bool `json:"setCondition,omitempty"`

	// DrainSlurmNode enables draining the Slurm node if the check failed.
	DrainSlurmNode bool `json:"drainSlurmNode,omitempty"`
}

// SpecNCCLTests defines settings for the NCCL tests check.
type SpecNCCLTests struct {
	// Name + CheckType defines the name of the k8s CronJob.
	// +kubebuilder:validation:Optional
	// +kubebuilder:default=nccl-test
	Name string `json:"name,omitempty"`

	// Priority defines the priority of the k8s CronJob.
	// +kubebuilder:validation:Optional
	// +kubebuilder:default=10
	Priority int32 `json:"priority,omitempty"`

	// CheckType defines the name of the binary that will be executed during tests.
	// +kubebuilder:validation:Enum=all_gather_perf;all_reduce_perf;alltoall_perf;broadcast_perf;gather_perf;hypercube_perf;reduce_perf;reduce_scatter_perf;scatter_perf;sendrecv_perf;all_gather_perf_mpi;all_reduce_perf_mpi;alltoall_perf_mpi;broadcast_perf_mpi;gather_perf_mpi;hypercube_perf_mpi;reduce_perf_mpi;reduce_scatter_perf_mpi;scatter_perf_mpi;sendrecv_perf_mpi
	// +kubebuilder:validation:Optional
	// +kubebuilder:default=all_reduce_perf
	CheckType string `json:"checkType,omitempty"`

	// Args defines the raw params string.
	// +kubebuilder:validation:Optional
	Args string `json:"args,omitempty"`
}

// ActiveCheckStatus defines the observed state of ActiveCheck.
Expand Down
32 changes: 32 additions & 0 deletions api/v1alpha1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

89 changes: 86 additions & 3 deletions config/crd/bases/slurm.nebius.ai_activechecks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,93 @@ spec:
spec:
description: ActiveCheckSpec defines the desired state of ActiveCheck.
properties:
foo:
description: Foo is an example field of ActiveCheck. Edit activecheck_types.go
to remove/update
image:
description: Image defines the container image
type: string
imagePullPolicy:
default: IfNotPresent
description: ImagePullPolicy defines the image pull policy
enum:
- Always
- Never
- IfNotPresent
type: string
jobType:
default: one_shot
description: JobType defines the type of a job.
enum:
- one_shot
- periodic
type: string
reactions:
description: Reactions defines reaction on specific check
properties:
DrainSlurmNode:
description: DrainSlurmNode enabling slurm node draining if check
failed
type: boolean
setCondition:
description: SetCondition enabling setting condition to the k8s
node
type: boolean
type: object
schedule:
default: 0 0 1 1 *
description: |-
Schedule defines the CronJob schedule.
By default, every year - at 00:00 on day-of-month 1 in January
type: string
specNCCLTests:
description: SpecNCCLTests defines settings for specific type of the
check
properties:
args:
description: Args defines raw params string
type: string
checkType:
default: all_reduce_perf
description: CheckType defines the name of the binary that will
be executed during tests.
enum:
- all_gather_perf
- all_reduce_perf
- alltoall_perf
- broadcast_perf
- gather_perf
- hypercube_perf
- reduce_perf
- reduce_scatter_perf
- scatter_perf
- sendrecv_perf
- all_gather_perf_mpi
- all_reduce_perf_mpi
- alltoall_perf_mpi
- broadcast_perf_mpi
- gather_perf_mpi
- hypercube_perf_mpi
- reduce_perf_mpi
- reduce_scatter_perf_mpi
- scatter_perf_mpi
- sendrecv_perf_mpi
type: string
name:
default: nccl-test
description: Name + CheckType defines the name of k8s cronJob
type: string
priority:
default: 10
description: Priority defines the priority of k8s cronJob
format: int32
type: integer
type: object
suspend:
default: true
description: Suspend indicates whether the action is suspended.
type: string
required:
- image
- reactions
- specNCCLTests
type: object
status:
description: ActiveCheckStatus defines the observed state of ActiveCheck.
Expand Down
89 changes: 86 additions & 3 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -38,10 +38,93 @@ spec:
spec:
description: ActiveCheckSpec defines the desired state of ActiveCheck.
properties:
foo:
description: Foo is an example field of ActiveCheck. Edit activecheck_types.go
to remove/update
image:
description: Image defines the container image
type: string
imagePullPolicy:
default: IfNotPresent
description: ImagePullPolicy defines the image pull policy
enum:
- Always
- Never
- IfNotPresent
type: string
jobType:
default: one_shot
description: JobType defines the type of a job.
enum:
- one_shot
- periodic
type: string
reactions:
description: Reactions defines reaction on specific check
properties:
DrainSlurmNode:
description: DrainSlurmNode enabling slurm node draining if check
failed
type: boolean
setCondition:
description: SetCondition enabling setting condition to the k8s
node
type: boolean
type: object
schedule:
default: 0 0 1 1 *
description: |-
Schedule defines the CronJob schedule.
By default, every year - at 00:00 on day-of-month 1 in January
type: string
specNCCLTests:
description: SpecNCCLTests defines settings for specific type of the
check
properties:
args:
description: Args defines raw params string
type: string
checkType:
default: all_reduce_perf
description: CheckType defines the name of the binary that will
be executed during tests.
enum:
- all_gather_perf
- all_reduce_perf
- alltoall_perf
- broadcast_perf
- gather_perf
- hypercube_perf
- reduce_perf
- reduce_scatter_perf
- scatter_perf
- sendrecv_perf
- all_gather_perf_mpi
- all_reduce_perf_mpi
- alltoall_perf_mpi
- broadcast_perf_mpi
- gather_perf_mpi
- hypercube_perf_mpi
- reduce_perf_mpi
- reduce_scatter_perf_mpi
- scatter_perf_mpi
- sendrecv_perf_mpi
type: string
name:
default: nccl-test
description: Name + CheckType defines the name of k8s cronJob
type: string
priority:
default: 10
description: Priority defines the priority of k8s cronJob
format: int32
type: integer
type: object
suspend:
default: true
description: Suspend indicates whether the action is suspended.
type: string
required:
- image
- reactions
- specNCCLTests
type: object
status:
description: ActiveCheckStatus defines the observed state of ActiveCheck.
Expand Down
Loading

0 comments on commit b396e99

Please sign in to comment.