Skip to content

Commit

Permalink
Merge pull request #151 from nebius/release/soperator
Browse files Browse the repository at this point in the history
Release Soperator 1.17.0-1
  • Loading branch information
Uburro authored Jan 9, 2025
2 parents b705104 + 0359986 commit d04a6d7
Show file tree
Hide file tree
Showing 16 changed files with 523 additions and 281 deletions.
2 changes: 1 addition & 1 deletion soperator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ Get the Slurm cluster IP address
```bash
export SLURM_IP=$(terraform state show module.login_script.terraform_data.connection_ip | grep 'input' | grep -oE '[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+' | head -n 1
)
ssh root@$SLURM_IP -i ~/.ssh/<public_id_rsa_key> -p <node_port_if_not_default>
ssh root@$SLURM_IP -i ~/.ssh/<private_id_rsa_key> -p <node_port_if_not_default>
```

or connect using the login script:
Expand Down
2 changes: 1 addition & 1 deletion soperator/SUBVERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
3
1
2 changes: 1 addition & 1 deletion soperator/VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.16.1
1.17.0
26 changes: 16 additions & 10 deletions soperator/installations/example/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,17 @@ locals {
}

use_node_port = var.slurm_login_service_type == "NodePort"

slurm_cluster_name = "soperator"
k8s_cluster_name = format("soperator-%s", var.company_name)
}

module "filestore" {
source = "../../modules/filestore"

iam_project_id = data.nebius_iam_v1_project.this.id

k8s_cluster_name = var.k8s_cluster_name
k8s_cluster_name = local.k8s_cluster_name

controller_spool = {
spec = var.filestore_controller_spool.spec != null ? {
Expand Down Expand Up @@ -99,8 +102,8 @@ module "k8s" {
vpc_subnet_id = data.nebius_vpc_v1_subnet.this.id

k8s_version = var.k8s_version
name = var.k8s_cluster_name
slurm_cluster_name = var.slurm_cluster_name
name = local.k8s_cluster_name
slurm_cluster_name = local.slurm_cluster_name
company_name = var.company_name

node_group_system = var.slurm_nodeset_system
Expand Down Expand Up @@ -196,7 +199,7 @@ module "slurm" {

source = "../../modules/slurm"

name = var.slurm_cluster_name
name = local.slurm_cluster_name
operator_version = var.slurm_operator_version
k8s_cluster_context = module.k8s.cluster_context

Expand Down Expand Up @@ -261,10 +264,13 @@ module "slurm" {
} : null
}

login_service_type = var.slurm_login_service_type
login_node_port = var.slurm_login_node_port
login_allocation_id = module.k8s.allocation_id
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys
login_service_type = var.slurm_login_service_type
login_node_port = var.slurm_login_node_port
login_allocation_id = module.k8s.allocation_id
login_sshd_config_map_ref_name = var.slurm_login_sshd_config_map_ref_name
login_ssh_root_public_keys = var.slurm_login_ssh_root_public_keys

worker_sshd_config_map_ref_name = var.slurm_worker_sshd_config_map_ref_name

exporter_enabled = var.slurm_exporter_enabled
rest_enabled = var.slurm_rest_enabled
Expand Down Expand Up @@ -302,7 +308,7 @@ module "slurm" {

shared_memory_size_gibibytes = var.slurm_shared_memory_size_gibibytes

nccl_topology_type = var.slurm_nodeset_workers[0].resource.platform == "gpu-h100-sxm" ? "H100 GPU cluster" : "auto"
nccl_topology_type = "auto"
nccl_benchmark_enable = var.nccl_benchmark_enable
nccl_benchmark_schedule = var.nccl_benchmark_schedule
nccl_benchmark_min_threshold = var.nccl_benchmark_min_threshold
Expand All @@ -327,7 +333,7 @@ module "login_script" {
used = local.use_node_port
port = var.slurm_login_node_port
}
slurm_cluster_name = var.slurm_cluster_name
slurm_cluster_name = local.slurm_cluster_name

k8s_cluster_context = module.k8s.cluster_context

Expand Down
55 changes: 8 additions & 47 deletions soperator/installations/example/terraform.tfvars
Original file line number Diff line number Diff line change
Expand Up @@ -101,47 +101,16 @@ filestore_accounting = {

# nfs = {
# enabled = true
# size_gibibytes = 93
# mount_path = "/mnt/nfs"
# size_gibibytes = 930
# mount_path = "/home"
# resource = {
# platform = "cpu-e2"
# preset = "16vcpu-64gb"
# preset = "32vcpu-128gb"
# }
# }

# endregion nfs-server

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
# Cloud #
# #
# #
#----------------------------------------------------------------------------------------------------------------------#
# region Cloud

# IAM token used for communicating with Nebius services.
# Token is being passed via .envrc file.
# Uncomment to override.
# ---
# iam_token = "<YOUR-IAM-TOKEN>"

# ID of the IAM project.
# Project ID is being passed via .envrc file.
# Uncomment to override.
# ---
# iam_project_id = "project-<YOUR-PROJECT-ID>"

# ID of VPC subnet.
# Subnet ID is being passed via .envrc file.
# Uncomment to override.
# ---
#vpc_subnet_id = "vpcsubnet-<YOUR-SUBNET-ID>"

# endregion Cloud

# endregion Infrastructure

#----------------------------------------------------------------------------------------------------------------------#
# #
# #
Expand All @@ -151,13 +120,9 @@ filestore_accounting = {
#----------------------------------------------------------------------------------------------------------------------#
# region Slurm

# Name of the Slurm cluster in k8s cluster.
# ---
slurm_cluster_name = "soperator"

# Version of soperator.
# ---
slurm_operator_version = "1.16.1"
slurm_operator_version = "1.17.0"

# Type of the Slurm partition config. Could be either `default` or `custom`.
# By default, "default".
Expand All @@ -183,10 +148,10 @@ slurm_partition_config_type = "default"
# Configuration of System node set for system resources created by Soperator.
# ---
slurm_nodeset_system = {
size = 1
size = 3
resource = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
preset = "8vcpu-32gb"
}
boot_disk = {
type = "NETWORK_SSD"
Expand All @@ -198,10 +163,10 @@ slurm_nodeset_system = {
# Configuration of Slurm Controller node set.
# ---
slurm_nodeset_controller = {
size = 1
size = 2
resource = {
platform = "cpu-e2"
preset = "8vcpu-32gb"
preset = "4vcpu-16gb"
}
boot_disk = {
type = "NETWORK_SSD"
Expand Down Expand Up @@ -404,10 +369,6 @@ accounting_enabled = true
# ---
k8s_version = "1.30"

# Name of the k8s cluster.
# ---
k8s_cluster_name = "soperator"

# SSH user credentials for accessing k8s nodes.
# This option adds a public IP address to every node.
# By default, empty list.
Expand Down
81 changes: 32 additions & 49 deletions soperator/installations/example/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -35,12 +35,23 @@ data "nebius_vpc_v1_subnet" "this" {
}

variable "company_name" {
description = "Name of the company. It is used for context name of the cluster in .kubeconfig file."
description = "Name of the company. It is used for naming Slurm & K8s clusters."
type = string

validation {
condition = var.company_name != ""
error_message = "Company name is not provided"
condition = (
length(var.company_name) >= 1 &&
length(var.company_name) <= 32 &&
length(regexall("^[a-z][a-z\\d\\-]*[a-z\\d]+$", var.company_name)) == 1
)
error_message = <<EOF
The company name must:
- be 1 to 32 characters long
- start with a letter
- end with a letter or digit
- consist of letters, digits, or hyphens (-)
- contain only lowercase letters
EOF
}
}

Expand Down Expand Up @@ -143,7 +154,7 @@ variable "nfs" {
type = object({
enabled = bool
size_gibibytes = number
mount_path = optional(string, "/mnt/nfs")
mount_path = optional(string, "/home")
resource = object({
platform = string
preset = string
Expand All @@ -154,7 +165,7 @@ variable "nfs" {
size_gibibytes = 93
resource = {
platform = "cpu-e2"
preset = "16vcpu-64gb"
preset = "32vcpu-128gb"
}
}

Expand All @@ -179,28 +190,6 @@ variable "k8s_version" {
}
}

variable "k8s_cluster_name" {
description = "Name of the k8s cluster."
type = string
nullable = false

validation {
condition = (
length(var.k8s_cluster_name) >= 1 &&
length(var.k8s_cluster_name) <= 64 &&
length(regexall("^[a-z][a-z\\d\\-]*[a-z\\d]+$", var.k8s_cluster_name)) == 1
)
error_message = <<EOF
The k8s cluster name must:
- be 1 to 64 characters long
- start with a letter
- end with a letter or digit
- consist of letters, digits, or hyphens (-)
- contain only lowercase letters
EOF
}
}

variable "k8s_cluster_node_ssh_access_users" {
description = "SSH user credentials for accessing k8s nodes."
type = list(object({
Expand All @@ -217,28 +206,6 @@ variable "k8s_cluster_node_ssh_access_users" {

# region Slurm

variable "slurm_cluster_name" {
description = "Name of the Slurm cluster in k8s cluster."
type = string
nullable = false

validation {
condition = (
length(var.slurm_cluster_name) >= 1 &&
length(var.slurm_cluster_name) <= 64 &&
length(regexall("^[a-z][a-z\\d\\-]*[a-z\\d]+$", var.slurm_cluster_name)) == 1
)
error_message = <<EOF
The Slurm cluster name must:
- be 1 to 64 characters long
- start with a letter
- end with a letter or digit
- consist of letters, digits, or hyphens (-)
- contain only lowercase letters
EOF
}
}

variable "slurm_operator_version" {
description = "Version of soperator."
type = string
Expand Down Expand Up @@ -462,6 +429,16 @@ resource "terraform_data" "check_slurm_nodeset" {
}
}

# region Worker

# Optional reference to a ConfigMap holding the SSHD configuration for Slurm workers.
# Defaults to an empty string.
variable "slurm_worker_sshd_config_map_ref_name" {
  type        = string
  default     = ""
  description = "Name of configmap with SSHD config, which runs in slurmd container."
}

# endregion Worker

# region Login

variable "slurm_login_service_type" {
Expand All @@ -486,6 +463,12 @@ variable "slurm_login_node_port" {
}
}

# Optional reference to a ConfigMap holding the SSHD configuration for Slurm login nodes.
# Defaults to an empty string.
# NOTE(review): the previous description was copied verbatim from the worker variable and
# named the slurmd container; login nodes presumably serve SSH from an sshd container —
# confirm against the Soperator SlurmCluster spec.
variable "slurm_login_sshd_config_map_ref_name" {
  description = "Name of configmap with SSHD config, which runs in sshd container."
  type        = string
  default     = ""
}

variable "slurm_login_ssh_root_public_keys" {
description = "Authorized keys accepted for connecting to Slurm login nodes via SSH as 'root' user."
type = list(string)
Expand Down
Loading

0 comments on commit d04a6d7

Please sign in to comment.