Skip to content

Commit

Permalink
Merge pull request #268 from nebius/dev
Browse files Browse the repository at this point in the history
Release 1.16.0
  • Loading branch information
asteny authored Dec 17, 2024
2 parents 145f189 + 948d380 commit 4c06b19
Show file tree
Hide file tree
Showing 122 changed files with 5,592 additions and 20,294 deletions.
21 changes: 0 additions & 21 deletions .github/auto_assign.yml

This file was deleted.

2 changes: 1 addition & 1 deletion .github/workflows/github_release.yml
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}

- name: Create GitHub Release with changelog
uses: softprops/action-gh-release@01570a1f39cb168c169c802c3bceb9e93fb10974 # v2.1.0
uses: softprops/action-gh-release@7b4da11513bf3f43f9999e90eabced41ab8bb048 # v2.2.0
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
with:
Expand Down
4 changes: 2 additions & 2 deletions .github/workflows/gpubench_only.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install GO
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0
with:
go-version-file: 'go.mod'

Expand All @@ -64,7 +64,7 @@ jobs:
run: make test-version-sync

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 # v3.7.1
uses: docker/setup-buildx-action@6524bf65af31da8d45b59e8c27de4bd072b392f5 # v3.8.0

- name: Log in to the Github Container registry
uses: docker/login-action@7ca345011ac4304463197fac0e56eab1bc7e6af0
Expand Down
8 changes: 6 additions & 2 deletions .github/workflows/one_job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@ on:
- 'LICENSE'
- 'PROJECT'
- 'README.md'
- 'SECURITY.md'
- 'images/jail/gpubench/**'
pull_request:
branches:
- main

permissions:
contents: read
Expand Down Expand Up @@ -58,7 +62,7 @@ jobs:
uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2

- name: Install GO
uses: actions/setup-go@41dfa10bad2bb2ae585af6ee5bb4d7d973ad74ed # v5.1.0
uses: actions/setup-go@3041bf56c941b39c61721a86cd11f3bb1338122a # v5.2.0
with:
go-version-file: 'go.mod'

Expand All @@ -71,7 +75,7 @@ jobs:
run: make test-version-sync

- name: Set up Docker Buildx
uses: docker/setup-buildx-action@c47758b77c9736f4b2ef4073d4d51994fabfe349 # v3.7.1
uses: docker/setup-buildx-action@6524bf65af31da8d45b59e8c27de4bd072b392f5 # v3.8.0

- name: Log in to the Github Container registry
uses: docker/login-action@7ca345011ac4304463197fac0e56eab1bc7e6af0
Expand Down
4 changes: 2 additions & 2 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM golang:1.22@sha256:4594271250150c1a322ed749abfd218e1a8c6eb1ade90872e325a664412e2037 AS operator_builder
FROM golang:1.23@sha256:70031844b8c225351d0bb63e2c383f80db85d92ba894e3da7e13bcf80efa9a37 AS operator_builder

ARG GO_LDFLAGS=""
ARG BUILD_TIME
Expand All @@ -16,7 +16,7 @@ RUN GOOS=$GOOS GOARCH=$GOARCH CGO_ENABLED=$CGO_ENABLED GO_LDFLAGS=$GO_LDFLAGS \
go build -o slurm_operator ./cmd/

#######################################################################################################################
FROM alpine:latest@sha256:1e42bbe2508154c9126d48c2b8a75420c3544343bf86fd041fb7527e017a4b4a AS slurm-operator
FROM alpine:latest@sha256:21dc6063fd678b478f57c0e13f47560d0ea4eeba26dfc947b2a4f81f686b9f45 AS slurm-operator

COPY --from=operator_builder /operator/slurm_operator /usr/bin/

Expand Down
17 changes: 8 additions & 9 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,12 @@ lint-fix: golangci-lint ## Run golangci-lint linter and perform fixes
$(GOLANGCI_LINT) run --fix

.PHONY: helm
helm: kustomize helmify yq ## Update soperator Helm chart
rm -rf $(CHART_OPERATOR_PATH)
$(KUSTOMIZE) build config/default | $(HELMIFY) --crd-dir $(CHART_OPERATOR_PATH)
rm -f $(CHART_PATH)/operatorAppVersion
cp -r $(CHART_OPERATOR_PATH)/crds/* $(CHART_OPERATOR_CRDS_PATH)/templates/
@$(YQ) -i ".name = \"helm-soperator\"" "$(CHART_OPERATOR_PATH)/Chart.yaml"
@$(SED_COMMAND) '/^#/d' "$(CHART_OPERATOR_PATH)/Chart.yaml"
# Regenerate the soperator Helm chart.
#
# Steps performed by the recipe:
#   1. Render the CRDs with kustomize into both chart locations
#      ($(CHART_OPERATOR_PATH)/crds and $(CHART_OPERATOR_CRDS_PATH)/templates).
#   2. Move the chart's values.yaml aside, pipe the RBAC kustomization through
#      helmify into $(CHART_OPERATOR_PATH), then move values.yaml back.
#      NOTE(review): presumably helmify would otherwise overwrite values.yaml - confirm.
#
# `kustomize` and `helmify` are listed as prerequisites because the recipe invokes
# $(KUSTOMIZE) and $(HELMIFY) directly; without them a clean checkout has no
# guarantee that the tool binaries exist before the recipe runs.
#
# The helmify step and the restore of values.yaml run in a single shell so that
# values.yaml is put back even when the pipeline fails; the pipeline's exit status
# is still what the recipe line reports.
helm: generate manifests kustomize helmify ## Update soperator Helm chart
	$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_PATH)/crds/slurmcluster-crd.yaml
	$(KUSTOMIZE) build config/crd > $(CHART_OPERATOR_CRDS_PATH)/templates/slurmcluster-crd.yaml
	mv $(CHART_OPERATOR_PATH)/values.yaml $(CHART_OPERATOR_PATH)/values.yaml.bak
	status=0; \
	$(KUSTOMIZE) build --load-restrictor LoadRestrictionsNone config/rbac/soperator-helm | $(HELMIFY) $(CHART_OPERATOR_PATH) || status=$$?; \
	mv $(CHART_OPERATOR_PATH)/values.yaml.bak $(CHART_OPERATOR_PATH)/values.yaml; \
	exit $$status

.PHONY: get-version
get-version:
Expand Down Expand Up @@ -297,11 +296,11 @@ YQ ?= $(LOCALBIN)/yq

## Tool Versions
KUSTOMIZE_VERSION ?= v5.5.0
CONTROLLER_TOOLS_VERSION ?= v0.14.0
CONTROLLER_TOOLS_VERSION ?= v0.16.4
ENVTEST_VERSION ?= release-0.17
GOLANGCI_LINT_VERSION ?= v1.57.2
HELMIFY_VERSION ?= 0.4.13
YQ_VERSION ?= 4.44.1
YQ_VERSION ?= 4.44.3

.PHONY: kustomize
kustomize: $(KUSTOMIZE) ## Download kustomize locally if necessary.
Expand Down
8 changes: 8 additions & 0 deletions PROJECT
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,12 @@ resources:
kind: SlurmCluster
path: nebius.ai/slurm-operator/api/v1
version: v1
- core: true
group: core
kind: Secret
path: k8s.io/api/core/v1
version: v1
webhooks:
validation: true
webhookVersion: v1
version: "3"
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -101,6 +101,7 @@ Slurm's accounting system records detailed job information such as:
- User and group identities
- Job start/end times
- Resource requests and allocations
- If `protectedSecret` is set to `true`, the user secret for MariaDB will not be deleted after the MariaDB CR is deleted

This helps cluster administrators and users monitor resource utilization, enforce quotas, and generate usage reports for performance optimization or billing purposes.

Expand All @@ -114,11 +115,10 @@ This helps cluster administrators and users monitor resource utilization, enforc
[22.04](https://releases.ubuntu.com/jammy/).
- Slurm: versions `23.11.6` and `24.05.3`.
- CUDA: version [12.2.2](https://developer.nvidia.com/cuda-12-2-2-download-archive).
- Kubernetes: >= [1.28](https://kubernetes.io/blog/2023/08/15/kubernetes-v1-28-release/).
- Kubernetes: >= [1.29](https://kubernetes.io/blog/2023/12/13/kubernetes-v1-29-release/).
- Versions of some preinstalled software packages can't be changed.



## 🚀 Installation
The steps required to deploy Soperator to your Kubernetes cluster depend on whether you are using Kubernetes
on premises or in a cloud.
Expand Down
11 changes: 11 additions & 0 deletions SECURITY.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Reporting Security Issues

The Nebius team takes security bugs seriously. We appreciate your efforts to responsibly disclose your findings, and will make every effort to acknowledge your contributions.

To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/nebius/soperator/security/advisories/new) tab.

The Nebius team will send a response indicating the next steps in handling your report. After the initial reply to your report, the Nebius team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance.

## Learning More About Security in Nebius

To learn more about security in Nebius, please see [this page](https://nebius.ai/docs/security).
2 changes: 1 addition & 1 deletion VERSION
Original file line number Diff line number Diff line change
@@ -1 +1 @@
1.15.3
1.16.0
74 changes: 66 additions & 8 deletions api/v1/slurmcluster_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import (
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

mariadv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
mariadbv1alpha1 "github.com/mariadb-operator/mariadb-operator/api/v1alpha1"
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
)

Expand Down Expand Up @@ -73,6 +73,40 @@ type SlurmClusterSpec struct {
// https://slurm.schedmd.com/slurm.conf.html#SECTION_PARTITION-CONFIGURATION
// +kubebuilder:validation:Optional
PartitionConfiguration PartitionConfiguration `json:"partitionConfiguration,omitempty"`

// SlurmConfig represents the Slurm configuration in slurm.conf. Not all options are supported.
//
// +kubebuilder:validation:Optional
SlurmConfig SlurmConfig `json:"slurmConfig,omitempty"`
}

// SlurmConfig represents the Slurm configuration in slurm.conf.
// Every field is optional and carries a kubebuilder default (see the markers on each field).
type SlurmConfig struct {
// Default real memory size available per allocated node in mebibytes.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=1228800
DefMemPerNode int32 `json:"defMemPerNode,omitempty"`
// Default count of CPUs allocated per allocated GPU
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=16
DefCpuPerGPU int32 `json:"defCpuPerGPU,omitempty"`
// The time to wait, in seconds, when any job is in the COMPLETING state before any additional jobs are scheduled.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=5
CompleteWait int32 `json:"completeWait,omitempty"`

// NOTE(review): in the Pattern markers on DebugFlags and TaskPluginParam the separating
// comma is optional ("(,)?"), so concatenated names such as "CgroupGres" and values with
// a trailing comma such as "Cgroup," also match. Confirm whether that is intended.

// Defines specific subsystems which should provide more detailed event logging.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Cgroup,CPU_Bind,Gres,JobComp,Priority,Script,SelectType,Steps,TraceJobs"
// +kubebuilder:validation:Pattern="^((Accrue|Agent|AuditRPCs|Backfill|BackfillMap|BurstBuffer|Cgroup|ConMgr|CPU_Bind|CpuFrequency|Data|DBD_Agent|Dependency|Elasticsearch|Energy|Federation|FrontEnd|Gres|Hetjob|Gang|GLOB_SILENCE|JobAccountGather|JobComp|JobContainer|License|Network|NetworkRaw|NodeFeatures|NO_CONF_HASH|Power|Priority|Profile|Protocol|Reservation|Route|Script|SelectType|Steps|Switch|TLS|TraceJobs|Triggers)(,)?)+$"
DebugFlags string `json:"debugFlags,omitempty"`
// TaskPluginParam value for slurm.conf, given as a list of the option names allowed by the pattern below.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default="Verbose"
// +kubebuilder:validation:Pattern="^((None|Cores|Sockets|Threads|SlurmdOffSpec|OOMKillStep|Verbose|Autobind)(,)?)+$"
TaskPluginParam string `json:"taskPluginParam,omitempty"`
}

type PartitionConfiguration struct {
Expand Down Expand Up @@ -277,7 +311,6 @@ type K8sNodeFilter struct {
Name string `json:"name"`

// Affinity defines the desired affinity for the node
//
// NOTE: Affinity could not be set if NodeSelector is specified
//
// +kubebuilder:validation:Optional
Expand Down Expand Up @@ -442,13 +475,26 @@ type MariaDbOperator struct {
// +kubebuilder:validation:Optional
Enabled bool `json:"enabled"`

// If enabled, secret cannot be deleted until custom resource slurmcluster is deleted
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
// +kubebuilder:validation:Immutable
ProtectedSecret bool `json:"protectedSecret"`

NodeContainer `json:",inline"`
PodSecurityContext *corev1.PodSecurityContext `json:"podSecurityContext,omitempty"`
SecurityContext *corev1.SecurityContext `json:"securityContext,omitempty"`
Replicas int32 `json:"replicas,omitempty"`
Metrics *mariadv1alpha1.MariadbMetrics `json:"metrics,omitempty"`
Replication *mariadv1alpha1.Replication `json:"replication,omitempty"`
Storage mariadv1alpha1.Storage `json:"storage,omitempty"`
PodSecurityContext *mariadbv1alpha1.PodSecurityContext `json:"podSecurityContext,omitempty"`
SecurityContext *mariadbv1alpha1.SecurityContext `json:"securityContext,omitempty"`
Replicas int32 `json:"replicas,omitempty"`
Metrics MariadbMetrics `json:"metrics,omitempty"`
Replication *mariadbv1alpha1.Replication `json:"replication,omitempty"`
Storage mariadbv1alpha1.Storage `json:"storage,omitempty"`
}

// NOTE(review): Enabled combines `omitempty` with a kubebuilder default of true; Go's JSON
// encoder omits a false bool under `omitempty`, so an explicit false may be re-defaulted to
// true when the object is written back. Confirm this is acceptable.

// MariadbMetrics is the type of the `metrics` field of MariaDbOperator.
type MariadbMetrics struct {
// Enabled is optional and defaults to true.
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=true
Enabled bool `json:"enabled,omitempty"`
}

type SlurmdbdConfig struct {
Expand Down Expand Up @@ -577,6 +623,11 @@ type SlurmNodeWorker struct {
// +kubebuilder:validation:Required
Munge NodeContainer `json:"munge"`

// SupervisordConfigMapRefName is the name of the supervisord config, which runs in slurmd container
//
// +kubebuilder:validation:Optional
SupervisordConfigMapRefName string `json:"supervisordConfigMapRefName,omitempty"`

// Volumes represents the volume configurations for the worker node
//
// +kubebuilder:validation:Required
Expand All @@ -587,6 +638,13 @@ type SlurmNodeWorker struct {
// +kubebuilder:default="v2"
// +kubebuilder:validation:Enum="v1";"v2"
CgroupVersion string `json:"cgroupVersion,omitempty"`

// EnableGDRCopy driver propagation into containers (this feature must also be enabled in NVIDIA GPU operator)
// https://developer.nvidia.com/gdrcopy
//
// +kubebuilder:validation:Optional
// +kubebuilder:default=false
EnableGDRCopy bool `json:"enableGDRCopy,omitempty"`
}

// SlurmNodeWorkerVolumes defines the volumes for the Slurm worker node
Expand Down
41 changes: 34 additions & 7 deletions api/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 4c06b19

Please sign in to comment.