Skip to content

Commit

Permalink
504: customisable slurm config
Browse files Browse the repository at this point in the history
  • Loading branch information
itechdima committed Feb 28, 2025
1 parent 20891f9 commit d532666
Show file tree
Hide file tree
Showing 8 changed files with 132 additions and 5 deletions.
20 changes: 20 additions & 0 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,10 @@ type SlurmClusterSpec struct {
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", epilog: "", prolog: "", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`

// SlurmConfigRaw represents the raw Slurm configuration in slurm.conf. All options are provided in a raw string.
//
// +kubebuilder:validation:Optional
SlurmConfigRaw *SlurmConfigRaw `json:"slurmConfigRaw,omitempty"`
// MPIConfig represents the PMIx configuration in mpi.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
Expand Down Expand Up @@ -156,6 +160,22 @@ type SlurmConfig struct {
MessageTimeout *int32 `json:"messageTimeout,omitempty"`
}

// SlurmConfigRaw represents the raw Slurm configuration in slurm.conf. All options are provided in a raw string.
type SlurmConfigRaw struct {
	// NOTE: Strategy carries omitempty on purpose. Without it, a Go client that
	// leaves Strategy unset serializes `"strategy": ""`; CRD defaulting only fills
	// in absent fields, so the empty string would then be rejected by the enum
	// validation instead of being defaulted to "patch".

	// Strategy defines how the raw config is applied to the SlurmConfig.
	// patch - patches SlurmConfig values (default).
	// override - only raw config will be applied.
	//
	// +kubebuilder:validation:Enum=patch;override
	// +kubebuilder:validation:Optional
	// +kubebuilder:default=patch
	Strategy string `json:"strategy,omitempty"`
	// RawContent of the Slurm config.
	//
	// +kubebuilder:validation:Required
	RawContent string `json:"rawContent"`
}

type MPIConfig struct {
// Semicolon separated list of environment variables to be set in job environments to be used by PMIx.
// Defaults to "OMPI_MCA_btl_tcp_if_include=eth0" to avoid "lo" and "docker" interfaces to be selected by OpenMPI.
Expand Down
20 changes: 20 additions & 0 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions config/crd/bases/slurm.nebius.ai_slurmclusters.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1565,6 +1565,26 @@ spec:
pattern: ^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$
type: string
type: object
slurmConfigRaw:
description: SlurmConfigRaw represents the raw Slurm configuration
in slurm.conf. All options are provided in a raw string.
properties:
rawContent:
description: RawContent of the Slurm config.
type: string
strategy:
default: patch
description: |-
Strategy defines how is raw config should be applied to the SlurmConfig.
patch - patches SlurmConfig values (default).
override - only raw config will be applied.
enum:
- patch
- override
type: string
required:
- rawContent
type: object
slurmNodes:
description: SlurmNodes define the desired state of Slurm nodes
properties:
Expand Down
20 changes: 20 additions & 0 deletions helm/soperator-crds/templates/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6116,6 +6116,26 @@ spec:
pattern: ^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$
type: string
type: object
slurmConfigRaw:
description: SlurmConfigRaw represents the raw Slurm configuration
in slurm.conf. All options are provided in a raw string.
properties:
rawContent:
description: RawContent of the Slurm config.
type: string
strategy:
default: patch
description: |-
Strategy defines how is raw config should be applied to the SlurmConfig.
patch - patches SlurmConfig values (default).
override - only raw config will be applied.
enum:
- patch
- override
type: string
required:
- rawContent
type: object
slurmNodes:
description: SlurmNodes define the desired state of Slurm nodes
properties:
Expand Down
20 changes: 20 additions & 0 deletions helm/soperator/crds/slurmcluster-crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -6116,6 +6116,26 @@ spec:
pattern: ^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$
type: string
type: object
slurmConfigRaw:
description: SlurmConfigRaw represents the raw Slurm configuration
in slurm.conf. All options are provided in a raw string.
properties:
rawContent:
description: RawContent of the Slurm config.
type: string
strategy:
default: patch
description: |-
Strategy defines how is raw config should be applied to the SlurmConfig.
patch - patches SlurmConfig values (default).
override - only raw config will be applied.
enum:
- patch
- override
type: string
required:
- rawContent
type: object
slurmNodes:
description: SlurmNodes define the desired state of Slurm nodes
properties:
Expand Down
5 changes: 5 additions & 0 deletions internal/consts/slurm.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,8 @@ var SlurmNodeReasonsMap = map[string]struct{}{
SlurmNodeReasonNodeReplacement: {},
SlurmNodeReasonNodeReboot: {},
}

// Values of the SlurmConfigRaw strategy field; they match the
// "patch;override" enum declared on the API type.
const (
// SlurmConfigRawStrategyPatch selects the "patch" strategy: the raw config
// patches the SlurmConfig values. This is the API default.
SlurmConfigRawStrategyPatch = "patch"
// SlurmConfigRawStrategyOverride selects the "override" strategy: only the
// raw config is applied.
SlurmConfigRawStrategyOverride = "override"
)
20 changes: 20 additions & 0 deletions internal/render/common/configmap.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,26 @@ func generateSlurmConfig(cluster *values.SlurmCluster) renderutils.ConfigFile {
res.AddProperty("AuthAltParameters", "jwt_key="+consts.RESTJWTKeyPath)
}
}

// apply raw config
if cluster.SlurmConfigRaw != nil {
switch cluster.SlurmConfigRaw.Strategy {
case consts.SlurmConfigRawStrategyOverride:
multilineCfg := &renderutils.MultilineStringConfig{}
multilineCfg.AddLine(cluster.SlurmConfigRaw.RawContent)
return multilineCfg
case consts.SlurmConfigRawStrategyPatch:
fallthrough
default:
res.AddComment("")
res.AddComment("RAW CONFIG")

multilineCfg := &renderutils.MultilineStringConfig{}
multilineCfg.AddLine(res.Render())
multilineCfg.AddLine(cluster.SlurmConfigRaw.RawContent)
return multilineCfg
}
}
return res
}

Expand Down
12 changes: 7 additions & 5 deletions internal/values/slurm_cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ type SlurmCluster struct {
Telemetry *slurmv1.Telemetry
SlurmExporter SlurmExporter
SlurmConfig slurmv1.SlurmConfig
SlurmConfigRaw *slurmv1.SlurmConfigRaw
MPIConfig slurmv1.MPIConfig
}

Expand Down Expand Up @@ -71,11 +72,12 @@ func BuildSlurmClusterFrom(ctx context.Context, cluster *slurmv1.SlurmCluster) (
&cluster.Spec.NCCLSettings,
cluster.Spec.UseDefaultAppArmorProfile,
),
NodeLogin: buildSlurmLoginFrom(cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Login, cluster.Spec.UseDefaultAppArmorProfile),
Telemetry: cluster.Spec.Telemetry,
SlurmExporter: buildSlurmExporterFrom(cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Exporter),
SlurmConfig: cluster.Spec.SlurmConfig,
MPIConfig: cluster.Spec.MPIConfig,
NodeLogin: buildSlurmLoginFrom(cluster.Name, cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Login, cluster.Spec.UseDefaultAppArmorProfile),
Telemetry: cluster.Spec.Telemetry,
SlurmExporter: buildSlurmExporterFrom(cluster.Spec.Maintenance, &cluster.Spec.SlurmNodes.Exporter),
SlurmConfig: cluster.Spec.SlurmConfig,
SlurmConfigRaw: cluster.Spec.SlurmConfigRaw,
MPIConfig: cluster.Spec.MPIConfig,
}

if err := res.Validate(ctx); err != nil {
Expand Down

0 comments on commit d532666

Please sign in to comment.