-
Notifications
You must be signed in to change notification settings - Fork 21
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
508: Added initial setup for sconfigcontroller
- Loading branch information
Andrei Pokhila
committed
Mar 7, 2025
1 parent
2bfb4c3
commit c7c9b9b
Showing
7 changed files
with
514 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,237 @@ | ||
/* | ||
Copyright 2024. | ||
Licensed under the Apache License, Version 2.0 (the "License"); | ||
you may not use this file except in compliance with the License. | ||
You may obtain a copy of the License at | ||
http://www.apache.org/licenses/LICENSE-2.0 | ||
Unless required by applicable law or agreed to in writing, software | ||
distributed under the License is distributed on an "AS IS" BASIS, | ||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
See the License for the specific language governing permissions and | ||
limitations under the License. | ||
*/ | ||
|
||
package main | ||
|
||
import ( | ||
"crypto/tls" | ||
"flag" | ||
"os" | ||
"time" | ||
|
||
"go.uber.org/zap/zapcore" | ||
"sigs.k8s.io/controller-runtime/pkg/cache" | ||
|
||
// Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC, etc.) | ||
// to ensure that exec-entrypoint and run can make use of them. | ||
_ "k8s.io/client-go/plugin/pkg/client/auth" | ||
|
||
"k8s.io/apimachinery/pkg/labels" | ||
"k8s.io/apimachinery/pkg/runtime" | ||
"k8s.io/apimachinery/pkg/types" | ||
utilruntime "k8s.io/apimachinery/pkg/util/runtime" | ||
clientgoscheme "k8s.io/client-go/kubernetes/scheme" | ||
ctrl "sigs.k8s.io/controller-runtime" | ||
"sigs.k8s.io/controller-runtime/pkg/healthz" | ||
"sigs.k8s.io/controller-runtime/pkg/log/zap" | ||
metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server" | ||
"sigs.k8s.io/controller-runtime/pkg/webhook" | ||
|
||
slurmv1 "nebius.ai/slurm-operator/api/v1" | ||
"nebius.ai/slurm-operator/internal/controller/sconfigcontroller" | ||
"nebius.ai/slurm-operator/internal/jwt" | ||
"nebius.ai/slurm-operator/internal/slurmapi" | ||
//+kubebuilder:scaffold:imports | ||
) | ||
|
||
var ( | ||
scheme = runtime.NewScheme() | ||
) | ||
|
||
func init() { | ||
//+kubebuilder:scaffold:scheme | ||
|
||
utilruntime.Must(clientgoscheme.AddToScheme(scheme)) | ||
utilruntime.Must(slurmv1.AddToScheme(scheme)) | ||
} | ||
|
||
func getZapOpts(logFormat, logLevel string) []zap.Opts { | ||
var zapOpts []zap.Opts | ||
|
||
// Configure log format | ||
if logFormat == "json" { | ||
zapOpts = append(zapOpts, zap.UseDevMode(false)) | ||
} else { | ||
zapOpts = append(zapOpts, zap.UseDevMode(true)) | ||
} | ||
|
||
// Configure log level | ||
var level zapcore.Level | ||
switch logLevel { | ||
case "debug": | ||
level = zapcore.DebugLevel | ||
case "info": | ||
level = zapcore.InfoLevel | ||
case "warn": | ||
level = zapcore.WarnLevel | ||
case "error": | ||
level = zapcore.ErrorLevel | ||
case "dpanic": | ||
level = zapcore.DPanicLevel | ||
case "panic": | ||
level = zapcore.PanicLevel | ||
case "fatal": | ||
level = zapcore.FatalLevel | ||
default: | ||
level = zapcore.InfoLevel | ||
} | ||
zapOpts = append(zapOpts, zap.Level(level)) | ||
return zapOpts | ||
} | ||
|
||
func main() { | ||
var ( | ||
metricsAddr string | ||
enableLeaderElection bool | ||
probeAddr string | ||
secureMetrics bool | ||
enableHTTP2 bool | ||
logFormat string | ||
logLevel string | ||
|
||
reconcileTimeout time.Duration | ||
maxConcurrency int | ||
cacheSyncTimeout time.Duration | ||
) | ||
|
||
var watchNsCacheByName = make(map[string]cache.Config) | ||
|
||
ns := os.Getenv("SLURM_OPERATOR_WATCH_NAMESPACES") | ||
if ns != "" && ns != "*" { | ||
watchNsCacheByName = make(map[string]cache.Config) | ||
watchNsCacheByName[ns] = cache.Config{} | ||
} | ||
|
||
flag.StringVar(&metricsAddr, "metrics-bind-address", ":8080", "The address the metric endpoint binds to.") | ||
flag.StringVar(&probeAddr, "health-probe-bind-address", ":8081", "The address the probe endpoint binds to.") | ||
flag.BoolVar(&enableLeaderElection, "leader-elect", false, | ||
"Enable leader election for controller manager. "+ | ||
"Enabling this will ensure there is only one active controller manager.") | ||
flag.BoolVar(&secureMetrics, "metrics-secure", false, | ||
"If set the metrics endpoint is served securely") | ||
flag.BoolVar(&enableHTTP2, "enable-http2", false, | ||
"If set, HTTP/2 will be enabled for the metrics and webhook servers") | ||
flag.StringVar(&logFormat, "log-format", "json", "Log format: plain or json") | ||
flag.StringVar(&logLevel, "log-level", "debug", "Log level: debug, info, warn, error, dpanic, panic, fatal") | ||
flag.DurationVar(&reconcileTimeout, "reconcile-timeout", 5*time.Minute, "The maximum duration allowed for a single reconcile") | ||
flag.IntVar(&maxConcurrency, "max-concurrent-reconciles", 1, "Configures number of concurrent reconciles. It should improve performance for clusters with many objects.") | ||
flag.DurationVar(&cacheSyncTimeout, "cache-sync-timeout", 5*time.Minute, "The maximum duration allowed for caching sync") | ||
|
||
opts := getZapOpts(logFormat, logLevel) | ||
ctrl.SetLogger(zap.New(opts...)) | ||
setupLog := ctrl.Log.WithName("setup") | ||
|
||
// if the enable-http2 flag is false (the default), http/2 should be disabled | ||
// due to its vulnerabilities. More specifically, disabling http/2 will | ||
// prevent from being vulnerable to the HTTP/2 Stream Cancellation and | ||
// Rapid Reset CVEs. For more information see: | ||
// - https://github.com/advisories/GHSA-qppj-fm5r-hxr3 | ||
// - https://github.com/advisories/GHSA-4374-p667-p6c8 | ||
disableHTTP2 := func(c *tls.Config) { | ||
setupLog.Info("disabling http/2") | ||
c.NextProtos = []string{"http/1.1"} | ||
} | ||
|
||
tlsOpts := []func(*tls.Config){} | ||
if !enableHTTP2 { | ||
tlsOpts = append(tlsOpts, disableHTTP2) | ||
} | ||
|
||
webhookServer := webhook.NewServer(webhook.Options{ | ||
TLSOpts: tlsOpts, | ||
}) | ||
|
||
mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{ | ||
Scheme: scheme, | ||
Metrics: metricsserver.Options{ | ||
BindAddress: metricsAddr, | ||
SecureServing: secureMetrics, | ||
TLSOpts: tlsOpts, | ||
}, | ||
WebhookServer: webhookServer, | ||
HealthProbeBindAddress: probeAddr, | ||
LeaderElection: enableLeaderElection, | ||
LeaderElectionID: "vqeyz6an.nebius.ai", | ||
// LeaderElectionReleaseOnCancel defines if the leader should step down voluntarily | ||
// when the Manager ends. This requires the binary to immediately end when the | ||
// Manager is stopped, otherwise, this setting is unsafe. Setting this significantly | ||
// speeds up voluntary leader transitions as the new leader don't have to wait | ||
// LeaseDuration time first. | ||
// | ||
// In the default scaffold provided, the program ends immediately after | ||
// the manager stops, so would be fine to enable this option. However, | ||
// if you are doing or is intended to do any operation such as perform cleanups | ||
// after the manager stops then its usage might be unsafe. | ||
// LeaderElectionReleaseOnCancel: true, | ||
Cache: cache.Options{ | ||
DefaultNamespaces: watchNsCacheByName, | ||
DefaultLabelSelector: labels.SelectorFromSet(labels.Set{ // TODO: move to app config | ||
"app.kubernetes.io/component": "controller", | ||
"app.kubernetes.io/instance": "soperator", | ||
}), | ||
}, | ||
}) | ||
if err != nil { | ||
setupLog.Error(err, "unable to start manager") | ||
os.Exit(1) | ||
} | ||
|
||
slurmAPIServer := os.Getenv("SLURM_API_SERVER") | ||
if len(slurmAPIServer) == 0 { | ||
slurmAPIServer = "http://localhost:6820" | ||
} | ||
|
||
// TODO: init jwt controller | ||
slurmClusterName := types.NamespacedName{ | ||
Namespace: "soperator", | ||
Name: "soperator", | ||
} | ||
jwtToken := jwt.NewToken(mgr.GetClient()).For(slurmClusterName, "root").WithRegistry(jwt.NewTokenRegistry().Build()) | ||
slurmapiClient, err := slurmapi.NewClient(slurmAPIServer, jwtToken, slurmapi.DefaultHTTPClient()) | ||
if err != nil { | ||
os.Exit(1) | ||
} | ||
slurmapiClients := map[types.NamespacedName]slurmapi.Client{ | ||
slurmClusterName: slurmapiClient, | ||
} | ||
|
||
if err = sconfigcontroller.NewController( | ||
mgr.GetClient(), | ||
mgr.GetScheme(), | ||
mgr.GetEventRecorderFor(sconfigcontroller.SConfigControllerName), | ||
slurmapiClients, | ||
reconcileTimeout, | ||
).SetupWithManager(mgr, maxConcurrency, cacheSyncTimeout); err != nil { | ||
setupLog.Error(err, "unable to create controller", sconfigcontroller.SConfigControllerName) | ||
os.Exit(1) | ||
} | ||
//+kubebuilder:scaffold:builder | ||
|
||
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil { | ||
setupLog.Error(err, "unable to set up health check") | ||
os.Exit(1) | ||
} | ||
if err := mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil { | ||
setupLog.Error(err, "unable to set up ready check") | ||
os.Exit(1) | ||
} | ||
|
||
setupLog.Info("starting manager") | ||
if err := mgr.Start(ctrl.SetupSignalHandler()); err != nil { | ||
setupLog.Error(err, "problem running manager") | ||
os.Exit(1) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
--- | ||
apiVersion: rbac.authorization.k8s.io/v1 | ||
kind: ClusterRole | ||
metadata: | ||
name: sconfigcontroller-role | ||
rules: | ||
- apiGroups: | ||
- slurm.nebius.ai | ||
resources: | ||
- configmaps | ||
verbs: | ||
- get | ||
- list | ||
- watch |
Oops, something went wrong.