Skip to content

Commit

Permalink
mount topology config map to accounting
Browse files Browse the repository at this point in the history
  • Loading branch information
webconn committed Feb 28, 2025
1 parent 0e2ffd7 commit 87786f5
Show file tree
Hide file tree
Showing 9 changed files with 84 additions and 37 deletions.
5 changes: 3 additions & 2 deletions internal/consts/configmap.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
package consts

// Base identifiers for Slurm configuration objects.
// They are re-exported as volume names in volume.go (VolumeNameSlurmConfigs,
// VolumeNameSlurmTopologyConfig, VolumeNameSlurmdbdSecret).
const (
slurmConfigs = slurmPrefix + "configs"
slurmdbdSecret = "slurm-secrets"
slurmConfigs = slurmPrefix + "configs"
// slurmTopologyConfig identifies the Slurm topology config.
slurmTopologyConfig = slurmPrefix + "topology-config"
slurmdbdSecret = "slurm-secrets"
)

const (
Expand Down
50 changes: 26 additions & 24 deletions internal/consts/volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ const (

const (
VolumeNameSlurmConfigs = slurmConfigs
VolumeNameSlurmTopologyConfig = slurmTopologyConfig
VolumeNameSlurmdbdSecret = slurmdbdSecret
VolumeNameSpool = spool
VolumeNameJail = jail
Expand All @@ -49,28 +50,29 @@ const (
VolumeNameInMemorySubmount = "in-memory"
VolumeNameTmpDisk = "tmp-disk"

VolumeMountPathSlurmConfigs = "/mnt/" + slurmConfigs
VolumeMountPathSlurmdbdSecret = "/mnt/" + slurmdbdSecret
VolumeMountPathSpool = "/var/" + spool
VolumeMountPathSpoolSlurmdbd = "/var/spool/slurmdbd"
VolumeMountPathJail = "/mnt/" + jail
VolumeMountPathJailSnapshot = "/jail"
VolumeMountPathJailUpper = "/mnt/" + jail + ".upper"
VolumeMountPathMungeSocket = "/run/" + Munge
VolumeMountPathMungeKey = "/mnt/" + mungeKey
VolumeMountPathRESTJWTKey = "/mnt/" + RESTJWTKey
VolumeMountPathNvidia = "/run/" + nvidia
VolumeMountPathBoot = "/" + boot
VolumeMountPathSSHConfigs = "/mnt/" + sshConfigs
VolumeMountPathSSHRootKeys = "/root/.ssh/" + authorizedKeys
VolumeMountSubPathSSHRootKeys = authorizedKeys
VolumeMountPathSecurityLimits = "/etc/security/" + securityLimitsConfFile
VolumeMountSubPathSecurityLimits = securityLimitsConfFile
VolumeMountPathNCCLTopology = "/run/nvidia-topologyd"
VolumeMountPathSharedMemory = "/dev/shm"
VolumeMountPathSysctl = "/etc/" + sysctlConfFile
VolumeMountSubPathSysctl = sysctlConfFile
VolumeMountPathSupervisordConfig = "/etc/supervisor/conf.d/"
VolumeMountPathInMemorySubmount = VolumeMountPathJailUpper + "/mnt/memory"
VolumeMountPathTmpDisk = "/tmp"
VolumeMountPathSlurmConfigs = "/mnt/" + slurmConfigs
VolumeMountPathSlurmTopologyConfig = "/mnt/" + slurmConfigs // intended to overlay original config
VolumeMountPathSlurmdbdSecret = "/mnt/" + slurmdbdSecret
VolumeMountPathSpool = "/var/" + spool
VolumeMountPathSpoolSlurmdbd = "/var/spool/slurmdbd"
VolumeMountPathJail = "/mnt/" + jail
VolumeMountPathJailSnapshot = "/jail"
VolumeMountPathJailUpper = "/mnt/" + jail + ".upper"
VolumeMountPathMungeSocket = "/run/" + Munge
VolumeMountPathMungeKey = "/mnt/" + mungeKey
VolumeMountPathRESTJWTKey = "/mnt/" + RESTJWTKey
VolumeMountPathNvidia = "/run/" + nvidia
VolumeMountPathBoot = "/" + boot
VolumeMountPathSSHConfigs = "/mnt/" + sshConfigs
VolumeMountPathSSHRootKeys = "/root/.ssh/" + authorizedKeys
VolumeMountSubPathSSHRootKeys = authorizedKeys
VolumeMountPathSecurityLimits = "/etc/security/" + securityLimitsConfFile
VolumeMountSubPathSecurityLimits = securityLimitsConfFile
VolumeMountPathNCCLTopology = "/run/nvidia-topologyd"
VolumeMountPathSharedMemory = "/dev/shm"
VolumeMountPathSysctl = "/etc/" + sysctlConfFile
VolumeMountSubPathSysctl = sysctlConfFile
VolumeMountPathSupervisordConfig = "/etc/supervisor/conf.d/"
VolumeMountPathInMemorySubmount = VolumeMountPathJailUpper + "/mnt/memory"
VolumeMountPathTmpDisk = "/tmp"
)
1 change: 1 addition & 0 deletions internal/controller/clustercontroller/accounting.go
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,7 @@ func (r SlurmClusterReconciler) ReconcileAccounting(
&clusterValues.NodeAccounting,
clusterValues.NodeFilters,
clusterValues.VolumeSources,
clusterValues.SlurmTopologyConfigMapRefName,
)
if err != nil {
stepLogger.Error(err, "Failed to render")
Expand Down
23 changes: 15 additions & 8 deletions internal/render/accounting/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,27 @@ import (
)

// renderContainerAccounting renders [corev1.Container] for the accounting (slurmdbd) component
func renderContainerAccounting(container values.Container) corev1.Container {
func renderContainerAccounting(container values.Container, slurmTopologyConfigMapRefName string) corev1.Container {
if container.Port == 0 {
container.Port = consts.DefaultAccountingPort
}
container.NodeContainer.Resources.Storage()

// Create a copy of the container's limits and add non-CPU resources from Requests
limits := common.CopyNonCPUResources(container.Resources)

volumeMounts := []corev1.VolumeMount{
common.RenderVolumeMountSlurmConfigs(),
common.RenderVolumeMountMungeSocket(),
common.RenderVolumeMountRESTJWTKey(),
RenderVolumeMountSlurmdbdConfigs(),
RenderVolumeMountSlurmdbdSpool(),
}

// When a topology ConfigMap is referenced, mount the dedicated topology volume
// (the pod template adds that volume under the same condition). The Slurm configs
// mount is already part of volumeMounts above and must not be appended a second time.
if slurmTopologyConfigMapRefName != "" {
	volumeMounts = append(volumeMounts, common.RenderVolumeMountSlurmTopologyConfig())
}

return corev1.Container{
Name: consts.ContainerNameAccounting,
Image: container.Image,
Expand All @@ -27,13 +40,7 @@ func renderContainerAccounting(container values.Container) corev1.Container {
ContainerPort: container.Port,
Protocol: corev1.ProtocolTCP,
}},
VolumeMounts: []corev1.VolumeMount{
common.RenderVolumeMountSlurmConfigs(),
common.RenderVolumeMountMungeSocket(),
common.RenderVolumeMountRESTJWTKey(),
RenderVolumeMountSlurmdbdConfigs(),
RenderVolumeMountSlurmdbdSpool(),
},
VolumeMounts: volumeMounts,
ReadinessProbe: &corev1.Probe{
ProbeHandler: corev1.ProbeHandler{
TCPSocket: &corev1.TCPSocketAction{
Expand Down
2 changes: 2 additions & 0 deletions internal/render/accounting/deployment.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ func RenderDeployment(
accounting *values.SlurmAccounting,
nodeFilter []slurmv1.K8sNodeFilter,
volumeSources []slurmv1.VolumeSource,
slurmTopologyConfigMapRefName string,
) (deployment *appsv1.Deployment, err error) {
labels := common.RenderLabels(consts.ComponentTypeAccounting, clusterName)
matchLabels := common.RenderMatchLabels(consts.ComponentTypeAccounting, clusterName)
Expand All @@ -29,6 +30,7 @@ func RenderDeployment(
nodeFilter,
volumeSources,
matchLabels,
slurmTopologyConfigMapRefName,
)
if err != nil {
return nil, err
Expand Down
7 changes: 6 additions & 1 deletion internal/render/accounting/pod.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ func BasePodTemplateSpec(
nodeFilters []slurmv1.K8sNodeFilter,
volumeSources []slurmv1.VolumeSource,
matchLabels map[string]string,
slurmTopologyConfigMapRefName string,
) (*corev1.PodTemplateSpec, error) {
volumes := []corev1.Volume{
common.RenderVolumeJailFromSource(volumeSources, *accounting.VolumeJail.VolumeSourceName),
Expand All @@ -30,6 +31,10 @@ func BasePodTemplateSpec(
RenderVolumeSlurmdbdSpool(accounting),
}

if slurmTopologyConfigMapRefName != "" {
volumes = append(volumes, common.RenderVolumeSlurmTopologyConfig(slurmTopologyConfigMapRefName))
}

var affinity *corev1.Affinity = nil
var nodeSelector map[string]string

Expand Down Expand Up @@ -66,7 +71,7 @@ func BasePodTemplateSpec(
common.RenderContainerMunge(&accounting.ContainerMunge),
},
Containers: []corev1.Container{
renderContainerAccounting(accounting.ContainerAccounting),
renderContainerAccounting(accounting.ContainerAccounting, slurmTopologyConfigMapRefName),
},
Volumes: volumes,
},
Expand Down
4 changes: 2 additions & 2 deletions internal/render/accounting/pod_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"nebius.ai/slurm-operator/internal/consts"
accounting "nebius.ai/slurm-operator/internal/render/accounting"
"nebius.ai/slurm-operator/internal/render/accounting"
)

func Test_BasePodTemplateSpec(t *testing.T) {
Expand Down Expand Up @@ -59,7 +59,7 @@ func Test_BasePodTemplateSpec(t *testing.T) {
}

result, err := accounting.BasePodTemplateSpec(
defaultNameCluster, acc, defaultNodeFilter, defaultVolumeSources, matchLabels,
defaultNameCluster, acc, defaultNodeFilter, defaultVolumeSources, matchLabels, slurmTopologyConfigMapRefName,
)
assert.NoError(t, err)

Expand Down
2 changes: 2 additions & 0 deletions internal/render/accounting/vars_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -133,4 +133,6 @@ var (
passwordKey: []byte("test-password"),
},
}

slurmTopologyConfigMapRefName = "topology-config"
)
27 changes: 27 additions & 0 deletions internal/render/common/volume.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,33 @@ func RenderVolumeMountSlurmConfigs() corev1.VolumeMount {

// endregion Slurm configs

// region Slurm topology config

// RenderVolumeSlurmTopologyConfig builds the [corev1.Volume] that exposes the Slurm topology
// config file. The volume is named consts.VolumeNameSlurmTopologyConfig and is backed by the
// ConfigMap whose name is passed in slurmTopologyConfigMapRefName.
func RenderVolumeSlurmTopologyConfig(slurmTopologyConfigMapRefName string) corev1.Volume {
	configMapRef := corev1.LocalObjectReference{Name: slurmTopologyConfigMapRefName}
	source := corev1.VolumeSource{
		ConfigMap: &corev1.ConfigMapVolumeSource{LocalObjectReference: configMapRef},
	}

	return corev1.Volume{
		Name:         consts.VolumeNameSlurmTopologyConfig,
		VolumeSource: source,
	}
}

// RenderVolumeMountSlurmTopologyConfig renders the read-only [corev1.VolumeMount] for the
// Slurm topology config volume.
//
// The mount path comes from the dedicated consts.VolumeMountPathSlurmTopologyConfig, which is
// intentionally the same directory as the Slurm configs mount so the topology file appears
// alongside the original configs.
//
// NOTE(review): Kubernetes appears to require mountPath to be unique within a container, so
// mounting this next to RenderVolumeMountSlurmConfigs may be rejected by API validation —
// confirm, and consider a file-level SubPath mount if so.
func RenderVolumeMountSlurmTopologyConfig() corev1.VolumeMount {
	return corev1.VolumeMount{
		Name:      consts.VolumeNameSlurmTopologyConfig,
		MountPath: consts.VolumeMountPathSlurmTopologyConfig,
		ReadOnly:  true,
	}
}

// endregion Slurm topology config

// region Spool

func RenderVolumeNameSpool(componentType consts.ComponentType) string {
Expand Down

0 comments on commit 87786f5

Please sign in to comment.