Skip to content

Commit

Permalink
Merge pull request #308 from nebius/dev
Browse files Browse the repository at this point in the history
Soperator release 1.17.0
  • Loading branch information
Uburro authored Jan 9, 2025
2 parents 3d3bfb0 + 988a816 commit 9b33f35
Show file tree
Hide file tree
Showing 103 changed files with 3,203 additions and 1,206 deletions.
2 changes: 1 addition & 1 deletion CODEOWNERS
Original file line number Diff line number Diff line change
@@ -1,2 +1,2 @@
# Global code owners (applies to the whole repo)
* @dstaroff @asteny @rdjjke @Uburro
* @dstaroff @asteny @rdjjke @Uburro @itechdima
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.23@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 AS operator_builder
FROM golang:1.23@sha256:7ea4c9dcb2b97ff8ee80a67db3d44f98c8ffa0d191399197007d8459c1453041 AS operator_builder

ARG GO_LDFLAGS=""
ARG BUILD_TIME
Expand All @@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
go build -o slurm_operator ./cmd/

#######################################################################################################################
FROM alpine:latest@sha256:21dc6063fd678b478f57c0e13f47560d0ea4eeba26dfc947b2a4f81f686b9f45 AS slurm-operator
FROM alpine:latest@sha256:b97e2a89d0b9e4011bb88c02ddf01c544b8c781acf1f4d559e7c8f12f1047ac3 AS slurm-operator

COPY --from=operator_builder /operator/slurm_operator /usr/bin/

Expand Down
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.16.1
1.17.0
64 changes: 53 additions & 11 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ import (
"k8s.io/apimachinery/pkg/api/meta"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"nebius.ai/slurm-operator/internal/consts"

mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
Expand All @@ -22,12 +23,17 @@ type SlurmClusterSpec struct {
// +kubebuilder:validation:Optional
// +kubebuilder:default="gpu"
ClusterType string `json:"clusterType,omitempty"`

// Pause defines whether to gracefully stop the cluster.
// Setting it to false after cluster has been paused starts the cluster back
// Maintenance defines the maintenance window for the cluster.
// It can have the following values:
// - none: No maintenance is performed. The cluster operates normally.
// - downscale: Scales down all components to 0.
// - downscaleAndDeletePopulateJail: Scales down all components to 0 and deletes the populateJail Kubernetes Job.
// - skipPopulateJail: Skips the execution of the populateJail job during maintenance.
//
// +kubebuilder:validation:Optional
Pause bool `json:"pause,omitempty"` // TODO cluster pausing/resuming
// +kubebuilder:validation:Enum=none;downscale;downscaleAndDeletePopulateJail;skipPopulateJail
// +kubebuilder:default="none"
Maintenance *consts.MaintenanceMode `json:"maintenance,omitempty"`

// NCCLSettings
// +kubebuilder:validation:Optional
Expand Down Expand Up @@ -77,8 +83,12 @@ type SlurmClusterSpec struct {
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose", maxJobCount: 10000, minJobAge: 86400}
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
// UseDefaultAppArmorProfile defines whether to generate and set the default AppArmor profile for the Slurm worker and login nodes. The Security Profiles Operator must be installed.
//
// +kubebuilder:default=false
UseDefaultAppArmorProfile bool `json:"useDefaultAppArmorProfile,omitempty"`
}

// SlurmConfig represents the Slurm configuration in slurm.conf
Expand Down Expand Up @@ -107,8 +117,8 @@ type SlurmConfig struct {
// Additional parameters for the task plugin
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Verbose"
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
// +kubebuilder:default=""
// +kubebuilder:validation:Pattern="^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$"
TaskPluginParam *string `json:"taskPluginParam,omitempty"`
// Keep N last jobs in controller memory
//
Expand Down Expand Up @@ -140,7 +150,7 @@ type NCCLSettings struct {

// TopologyType defines the type of NCCL GPU topology
//
// +kubebuilder:validation:Enum="H100 GPU cluster";auto;custom
// +kubebuilder:validation:Enum=auto;custom
// +kubebuilder:validation:Optional
// +kubebuilder:default="auto"
TopologyType string `json:"topologyType,omitempty"`
Expand Down Expand Up @@ -227,7 +237,7 @@ type NCCLBenchmark struct {
// FailedJobsHistoryLimit defines the number of failed finished jobs to retain
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=3
// +kubebuilder:default=16
FailedJobsHistoryLimit int32 `json:"failedJobsHistoryLimit,omitempty"`

// Image defines the nccl container image
Expand Down Expand Up @@ -585,6 +595,9 @@ type AccountingSlurmConf struct {
// +kubebuilder:default=0
PriorityWeightFairshare *int16 `json:"priorityWeightFairshare,omitempty"`
// +kubebuilder:validation:Optional
// +kubebuilder:default=0
PriorityWeightQOS *int16 `json:"priorityWeightQOS,omitempty"`
// +kubebuilder:validation:Optional
PriorityWeightTRES *string `json:"priorityWeightTRES,omitempty"`
}

Expand Down Expand Up @@ -640,6 +653,11 @@ type SlurmNodeWorker struct {
// +kubebuilder:validation:Optional
SupervisordConfigMapRefName string `json:"supervisordConfigMapRefName,omitempty"`

// SSHDConfigMapRefName is the name of the ConfigMap with the SSHD config used in the slurmd container
//
// +kubebuilder:validation:Optional
SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`

// Volumes represents the volume configurations for the worker node
//
// +kubebuilder:validation:Required
Expand Down Expand Up @@ -713,6 +731,11 @@ type SlurmNodeLogin struct {
// +kubebuilder:validation:Optional
SshdServiceAnnotations map[string]string `json:"sshdServiceAnnotations,omitempty"`

// SSHDConfigMapRefName is the name of the ConfigMap with the SSHD config used in the login container
//
// +kubebuilder:validation:Optional
SSHDConfigMapRefName string `json:"sshdConfigMapRefName,omitempty"`

// SshRootPublicKeys represents the list of public authorized_keys for SSH connection to Slurm login nodes
//
// +kubebuilder:validation:Required
Expand Down Expand Up @@ -871,11 +894,30 @@ type NodeVolumeJailSubMount struct {
// +kubebuilder:validation:Required
MountPath string `json:"mountPath"`

// SubPath points to a specific entry inside the volume.
// Corresponds to the subPath field in the K8s volumeMount structure.
// See official docs for details: https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=""
SubPath string `json:"subPath"`

// ReadOnly defines whether the mount point should be read-only
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
ReadOnly bool `json:"readOnly"`

// VolumeSourceName defines the name of the volume source for the sub-mount.
// Must correspond to the name of one of [VolumeSource]
//
// +kubebuilder:validation:Required
VolumeSourceName string `json:"volumeSourceName"`
// +kubebuilder:validation:Optional
VolumeSourceName *string `json:"volumeSourceName"`

// VolumeClaimTemplateSpec defines the [corev1.PersistentVolumeClaim] template specification
//
// +kubebuilder:validation:Optional
VolumeClaimTemplateSpec *corev1.PersistentVolumeClaimSpec `json:"volumeClaimTemplateSpec,omitempty"`
}

type Telemetry struct {
Expand Down
29 changes: 27 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ import (
mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
otelv1beta1 "github.com/open-telemetry/opentelemetry-operator/apis/v1beta1"
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
apparmor "sigs.k8s.io/security-profiles-operator/api/apparmorprofile/v1alpha1"

slurmv1 "nebius.ai/slurm-operator/api/v1"
"nebius.ai/slurm-operator/internal/check"
Expand All @@ -65,6 +66,9 @@ func init() {
if check.IsMariaDbCRDInstalled() {
utilruntime.Must(mariadbv1alpha1.AddToScheme(scheme))
}
if check.IsAppArmorCRDInstalled() {
utilruntime.Must(apparmor.AddToScheme(scheme))
}

utilruntime.Must(slurmv1.AddToScheme(scheme))

Expand Down
Loading

0 comments on commit 9b33f35

Please sign in to comment.