Skip to content

Commit

Permalink
Merge pull request #303 from nebius/many-small-changes-aggregated/0
Browse files Browse the repository at this point in the history
Many small changes: Various fixes, fancy SSH banner, preinstall IB RDMA packages, unshare enroot runtime on login nodes, colored bash for root, keep more failed gpubench jobs, SSH debug logs
  • Loading branch information
rdjjke authored Jan 9, 2025
2 parents 21b8ac2 + 10b7bc2 commit eabe2af
Show file tree
Hide file tree
Showing 27 changed files with 1,682 additions and 80 deletions.
33 changes: 26 additions & 7 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,11 +83,11 @@ type SlurmClusterSpec struct {
// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "Verbose", maxJobCount: 10000, minJobAge: 86400}
// +kubebuilder:default={defMemPerNode: 1228800, defCpuPerGPU: 16, completeWait: 5, debugFlags: "Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs", taskPluginParam: "", maxJobCount: 10000, minJobAge: 86400}
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
// Generate and set a default AppArmor profile for the Slurm worker and login nodes. This requires the Security Profiles Operator to be installed.
//
// +kubebuilder:default=true
// +kubebuilder:default=false
UseDefaultAppArmorProfile bool `json:"useDefaultAppArmorProfile,omitempty"`
}

Expand Down Expand Up @@ -117,8 +117,8 @@ type SlurmConfig struct {
// Additional parameters for the task plugin
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Verbose"
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
// +kubebuilder:default=""
// +kubebuilder:validation:Pattern="^(|((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+)$"
TaskPluginParam *string `json:"taskPluginParam,omitempty"`
// Keep the last N jobs in controller memory
//
Expand Down Expand Up @@ -237,7 +237,7 @@ type NCCLBenchmark struct {
// FailedJobsHistoryLimit defines the number of failed finished jobs to retain
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=3
// +kubebuilder:default=16
FailedJobsHistoryLimit int32 `json:"failedJobsHistoryLimit,omitempty"`

// Image defines the nccl container image
Expand Down Expand Up @@ -894,11 +894,30 @@ type NodeVolumeJailSubMount struct {
// +kubebuilder:validation:Required
MountPath string `json:"mountPath"`

// SubPath points to a specific entry inside the volume.
// Corresponds to the subPath field in the K8s volumeMount structure.
// See official docs for details: https://kubernetes.io/docs/concepts/storage/volumes/#using-subpath
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=""
SubPath string `json:"subPath"`

// ReadOnly defines whether the mount point should be read-only
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
ReadOnly bool `json:"readOnly"`

// VolumeSourceName defines the name of the volume source for the sub-mount.
// Must correspond to the name of one of the [VolumeSource] entries.
//
// +kubebuilder:validation:Required
VolumeSourceName string `json:"volumeSourceName"`
// +kubebuilder:validation:Optional
VolumeSourceName *string `json:"volumeSourceName"`

// VolumeClaimTemplateSpec defines the [corev1.PersistentVolumeClaim] template specification
//
// +kubebuilder:validation:Optional
VolumeClaimTemplateSpec *corev1.PersistentVolumeClaimSpec `json:"volumeClaimTemplateSpec,omitempty"`
}

type Telemetry struct {
Expand Down
18 changes: 16 additions & 2 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit eabe2af

Please sign in to comment.