Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Maintenance window metric #2238

Merged
merged 8 commits into from
Feb 6, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 0 additions & 7 deletions .github/actions/deploy-lifecycle-manager-e2e/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -151,13 +151,6 @@ runs:
klm_image_repo: ${{ inputs.klm_image_repo }}
- name: Expose Metrics Endpoint
working-directory: lifecycle-manager
if: ${{ matrix.e2e-test == 'kyma-metrics' ||
LeelaChacha marked this conversation as resolved.
Show resolved Hide resolved
matrix.e2e-test == 'module-status-decoupling-with-statefulset' ||
matrix.e2e-test == 'module-status-decoupling-with-deployment' ||
matrix.e2e-test == 'purge-metrics' ||
matrix.e2e-test == 'self-signed-certificate-rotation' ||
matrix.e2e-test == 'mandatory-module-metrics' ||
matrix.e2e-test == 'mandatory-module-metrics-with-old-naming-pattern'}}
shell: bash
run: |
kubectl patch svc klm-controller-manager-metrics -p '{"spec": {"type": "LoadBalancer"}}' -n kcp-system
115 changes: 68 additions & 47 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,31 +143,12 @@ func pprofStartServer(addr string, timeout time.Duration, setupLog logr.Logger)
}
}

//nolint:funlen // setupManager is a main function that sets up the manager
func setupManager(flagVar *flags.FlagVar, cacheOptions cache.Options, scheme *machineryruntime.Scheme,
setupLog logr.Logger,
logger logr.Logger,
) {
config := ctrl.GetConfigOrDie()
config.QPS = float32(flagVar.ClientQPS)
config.Burst = flagVar.ClientBurst

mgr, err := ctrl.NewManager(
config, ctrl.Options{
Scheme: scheme,
Metrics: metricsserver.Options{
BindAddress: flagVar.MetricsAddr,
},
HealthProbeBindAddress: flagVar.ProbeAddr,
LeaderElection: flagVar.EnableLeaderElection,
LeaderElectionID: "893110f7.kyma-project.io",
LeaseDuration: &flagVar.LeaderElectionLeaseDuration,
RenewDeadline: &flagVar.LeaderElectionRenewDeadline,
RetryPeriod: &flagVar.LeaderElectionRetryPeriod,
Cache: cacheOptions,
},
)
mgr, err := configManager(flagVar, cacheOptions, scheme)
if err != nil {
setupLog.Error(err, "unable to start manager")
logger.Error(err, "unable to start manager")
os.Exit(bootstrapFailedExitCode)
}
kcpRestConfig := mgr.GetConfig()
Expand All @@ -179,13 +160,13 @@ func setupManager(flagVar *flags.FlagVar, cacheOptions cache.Options, scheme *ma
var options ctrlruntime.Options
if flagVar.EnableKcpWatcher {
if skrWebhookManager, err = createSkrWebhookManager(mgr, skrContextProvider, flagVar); err != nil {
setupLog.Error(err, "failed to create skr webhook manager")
logger.Error(err, "failed to create skr webhook manager")
os.Exit(bootstrapFailedExitCode)
}
setupKcpWatcherReconciler(mgr, options, eventRecorder, flagVar, setupLog)
setupKcpWatcherReconciler(mgr, options, eventRecorder, flagVar, logger)
err = istiogatewaysecret.SetupReconciler(mgr, flagVar, options)
if err != nil {
setupLog.Error(err, "unable to create controller", "controller", "Istio")
logger.Error(err, "unable to create controller", "controller", "Istio")
os.Exit(bootstrapFailedExitCode)
}
}
Expand All @@ -194,44 +175,83 @@ func setupManager(flagVar *flags.FlagVar, cacheOptions cache.Options, scheme *ma
descriptorProvider := provider.NewCachedDescriptorProvider()
kymaMetrics := metrics.NewKymaMetrics(sharedMetrics)
mandatoryModulesMetrics := metrics.NewMandatoryModulesMetrics()
maintenanceWindow := initMaintenanceWindow(logger)

maintenanceWindow, err := maintenancewindows.InitializeMaintenanceWindow(setupLog,
maintenanceWindowPoliciesDirectory,
maintenanceWindowPolicyName,
// align the configuration values before rollout
// https://github.com/kyma-project/lifecycle-manager/issues/2165
true,
minMaintenanceWindowSize)
if err != nil {
setupLog.Error(err, "unable to set maintenance windows policy")
}
setupKymaReconciler(mgr, descriptorProvider, skrContextProvider, eventRecorder, flagVar, options, skrWebhookManager,
kymaMetrics, setupLog, maintenanceWindow)
setupManifestReconciler(mgr, flagVar, options, sharedMetrics, mandatoryModulesMetrics, setupLog,
kymaMetrics, logger, maintenanceWindow)
setupManifestReconciler(mgr, flagVar, options, sharedMetrics, mandatoryModulesMetrics, logger,
eventRecorder)
setupMandatoryModuleReconciler(mgr, descriptorProvider, flagVar, options, mandatoryModulesMetrics, setupLog)
setupMandatoryModuleDeletionReconciler(mgr, descriptorProvider, eventRecorder, flagVar, options, setupLog)
setupMandatoryModuleReconciler(mgr, descriptorProvider, flagVar, options, mandatoryModulesMetrics, logger)
setupMandatoryModuleDeletionReconciler(mgr, descriptorProvider, eventRecorder, flagVar, options, logger)
if flagVar.EnablePurgeFinalizer {
setupPurgeReconciler(mgr, skrContextProvider, eventRecorder, flagVar, options, setupLog)
setupPurgeReconciler(mgr, skrContextProvider, eventRecorder, flagVar, options, logger)
}

if flagVar.EnableWebhooks {
// enable conversion webhook for CRDs here

setupLog.Info("currently no configured webhooks")
logger.Info("currently no configured webhooks")
}

addHealthChecks(mgr, setupLog)
addHealthChecks(mgr, logger)

go cleanupStoredVersions(flagVar.DropCrdStoredVersionMap, mgr, setupLog)
go scheduleMetricsCleanup(kymaMetrics, flagVar.MetricsCleanupIntervalInMinutes, mgr, setupLog)
go cleanupStoredVersions(flagVar.DropCrdStoredVersionMap, mgr, logger)
go scheduleMetricsCleanup(kymaMetrics, flagVar.MetricsCleanupIntervalInMinutes, mgr, logger)

if err = mgr.Start(ctrl.SetupSignalHandler()); err != nil {
setupLog.Error(err, "problem running manager")
logger.Error(err, "problem running manager")
os.Exit(runtimeProblemExitCode)
}
}

// initMaintenanceWindow loads the maintenance window policy and reports the
// outcome through the maintenance window metrics.
//
// A failure to load the policy is logged but is not fatal here: whatever value
// InitializeMaintenanceWindow returned alongside the error is handed back to
// the caller unchanged.
func initMaintenanceWindow(logger logr.Logger) maintenancewindows.MaintenanceWindow {
	windowMetrics := metrics.NewMaintenanceWindowMetrics()

	// align the configuration values before rollout
	// https://github.com/kyma-project/lifecycle-manager/issues/2165
	const ongoingWindow = true

	window, err := maintenancewindows.InitializeMaintenanceWindow(
		logger,
		maintenanceWindowPoliciesDirectory,
		maintenanceWindowPolicyName,
		ongoingWindow,
		minMaintenanceWindowSize,
	)

	// Record the result first, then log the failure (if any).
	windowMetrics.RecordConfigReadSuccess(err == nil)
	if err != nil {
		logger.Error(err, "unable to set maintenance windows policy")
	}
	return window
}

// configManager creates the controller-runtime manager from the given flags,
// cache options and scheme.
//
// The REST config is obtained via ctrl.GetConfigOrDie and its QPS and Burst
// are overridden from flagVar before the manager is constructed. Any error
// from ctrl.NewManager is wrapped and returned together with a nil manager.
//
//nolint:ireturn // the implementation is not a part of the public API
func configManager(flagVar *flags.FlagVar, cacheOptions cache.Options,
	scheme *machineryruntime.Scheme,
) (manager.Manager, error) {
	restConfig := ctrl.GetConfigOrDie()
	restConfig.QPS = float32(flagVar.ClientQPS)
	restConfig.Burst = flagVar.ClientBurst

	managerOptions := ctrl.Options{
		Scheme:                 scheme,
		Metrics:                metricsserver.Options{BindAddress: flagVar.MetricsAddr},
		HealthProbeBindAddress: flagVar.ProbeAddr,
		LeaderElection:         flagVar.EnableLeaderElection,
		LeaderElectionID:       "893110f7.kyma-project.io",
		LeaseDuration:          &flagVar.LeaderElectionLeaseDuration,
		RenewDeadline:          &flagVar.LeaderElectionRenewDeadline,
		RetryPeriod:            &flagVar.LeaderElectionRetryPeriod,
		Cache:                  cacheOptions,
	}

	createdManager, err := ctrl.NewManager(restConfig, managerOptions)
	if err != nil {
		return nil, fmt.Errorf("unable to create manager: %w", err)
	}
	return createdManager, nil
}

func addHealthChecks(mgr manager.Manager, setupLog logr.Logger) {
// +kubebuilder:scaffold:builder
if err := mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
Expand Down Expand Up @@ -286,7 +306,7 @@ func scheduleMetricsCleanup(kymaMetrics *metrics.KymaMetrics, cleanupIntervalInM
func setupKymaReconciler(mgr ctrl.Manager, descriptorProvider *provider.CachedDescriptorProvider,
skrContextFactory remote.SkrContextProvider, event event.Event, flagVar *flags.FlagVar, options ctrlruntime.Options,
skrWebhookManager *watcher.SKRWebhookManifestManager, kymaMetrics *metrics.KymaMetrics,
setupLog logr.Logger, maintenanceWindow *maintenancewindows.MaintenanceWindow,
setupLog logr.Logger, maintenanceWindow maintenancewindows.MaintenanceWindow,
) {
options.RateLimiter = internal.RateLimiter(flagVar.FailureBaseDelay,
flagVar.FailureMaxDelay, flagVar.RateLimiterFrequency, flagVar.RateLimiterBurst)
Expand Down Expand Up @@ -319,7 +339,8 @@ func setupKymaReconciler(mgr ctrl.Manager, descriptorProvider *provider.CachedDe
Metrics: kymaMetrics,
RemoteCatalog: remote.NewRemoteCatalogFromKyma(mgr.GetClient(), skrContextFactory,
flagVar.RemoteSyncNamespace),
TemplateLookup: templatelookup.NewTemplateLookup(mgr.GetClient(), descriptorProvider, moduleTemplateInfoLookupStrategies),
TemplateLookup: templatelookup.NewTemplateLookup(mgr.GetClient(), descriptorProvider,
moduleTemplateInfoLookupStrategies),
}).SetupWithManager(
mgr, options, kyma.SetupOptions{
ListenerAddr: flagVar.KymaListenerAddr,
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -215,6 +215,7 @@ require (
github.com/json-iterator/go v1.1.12 // indirect
github.com/klauspost/compress v1.17.11 // indirect
github.com/klauspost/pgzip v1.2.6 // indirect
github.com/kylelemons/godebug v1.1.0 // indirect
github.com/letsencrypt/boulder v0.0.0-20241010192615-6692160cedfa // indirect
github.com/liggitt/tabwriter v0.0.0-20181228230101-89fcab3d43de // indirect
github.com/magiconair/properties v1.8.7 // indirect
Expand Down
27 changes: 18 additions & 9 deletions internal/maintenancewindows/maintenance_window.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,10 @@ import (
"github.com/kyma-project/lifecycle-manager/maintenancewindows/resolver"
)

var ErrNoMaintenanceWindowPolicyConfigured = errors.New("no maintenance window policy configured")
var (
ErrNoMaintenanceWindowPolicyConfigured = errors.New("no maintenance window policy configured")
ErrPolicyFileNotFound = errors.New("maintenance window policy file not found")
)

type MaintenanceWindowPolicy interface {
Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow, error)
Expand All @@ -31,30 +34,36 @@ func InitializeMaintenanceWindow(log logr.Logger,
policyName string,
ongoingWindow bool,
minWindowSize time.Duration,
) (*MaintenanceWindow, error) {
) (MaintenanceWindow, error) {
if err := os.Setenv(resolver.PolicyPathENV, policiesDirectory); err != nil {
return nil, fmt.Errorf("failed to set the policy path env variable, %w", err)
return MaintenanceWindow{
MaintenanceWindowPolicy: nil,
}, fmt.Errorf("failed to set the policy path env variable, %w", err)
}

policyFilePath := fmt.Sprintf("%s/%s.json", policiesDirectory, policyName)
if !MaintenancePolicyFileExists(policyFilePath) {
log.Info("maintenance windows policy file does not exist")
return &MaintenanceWindow{
log.Error(ErrPolicyFileNotFound, "maintenance windows policy file does not exist")
return MaintenanceWindow{
MaintenanceWindowPolicy: nil,
}, nil
}, fmt.Errorf("maintenance windows policy file does not exist, %w", ErrPolicyFileNotFound)
}

maintenancePolicyPool, err := resolver.GetMaintenancePolicyPool()
if err != nil {
return nil, fmt.Errorf("failed to get maintenance policy pool, %w", err)
return MaintenanceWindow{
MaintenanceWindowPolicy: nil,
}, fmt.Errorf("failed to get maintenance policy pool, %w", err)
}

maintenancePolicy, err := resolver.GetMaintenancePolicy(maintenancePolicyPool, policyName)
if err != nil {
return nil, fmt.Errorf("failed to get maintenance window policy, %w", err)
return MaintenanceWindow{
MaintenanceWindowPolicy: nil,
}, fmt.Errorf("failed to get maintenance window policy, %w", err)
}

return &MaintenanceWindow{
return MaintenanceWindow{
MaintenanceWindowPolicy: maintenancePolicy,
ongoing: resolver.OngoingWindow(ongoingWindow),
minDuration: resolver.MinWindowSize(minWindowSize),
Expand Down
26 changes: 17 additions & 9 deletions internal/maintenancewindows/maintenance_window_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,26 +30,26 @@ func TestMaintenancePolicyFileExists_FileExists(t *testing.T) {
require.True(t, got)
}

func TestInitializeMaintenanceWindowsPolicy_FileNotExist_NoError(t *testing.T) {
func TestInitializeMaintenanceWindowsPolicy_FileNotExist(t *testing.T) {
got, err := maintenancewindows.InitializeMaintenanceWindow(logr.Logger{},
"testdata",
"policy-1",
true,
20*time.Minute)

require.Nil(t, got.MaintenanceWindowPolicy)
require.NoError(t, err)
require.ErrorContains(t, err, maintenancewindows.ErrPolicyFileNotFound.Error())
}

func TestInitializeMaintenanceWindowsPolicy_DirectoryNotExist_NoError(t *testing.T) {
func TestInitializeMaintenanceWindowsPolicy_DirectoryNotExist(t *testing.T) {
got, err := maintenancewindows.InitializeMaintenanceWindow(logr.Logger{},
"files",
"policy",
true,
20*time.Minute)

require.Nil(t, got.MaintenanceWindowPolicy)
require.NoError(t, err)
require.ErrorContains(t, err, maintenancewindows.ErrPolicyFileNotFound.Error())
}

func TestInitializeMaintenanceWindowsPolicy_InvalidPolicy(t *testing.T) {
Expand All @@ -59,7 +59,7 @@ func TestInitializeMaintenanceWindowsPolicy_InvalidPolicy(t *testing.T) {
true,
20*time.Minute)

require.Nil(t, got)
require.Nil(t, got.MaintenanceWindowPolicy)
require.ErrorContains(t, err, "failed to get maintenance window policy")
}

Expand Down Expand Up @@ -321,7 +321,9 @@ func Test_IsActive_Returns_False_And_Error_WhenNoPolicyConfigured(t *testing.T)

type maintenanceWindowInactiveStub struct{}

func (s maintenanceWindowInactiveStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow, error) {
func (s maintenanceWindowInactiveStub) Resolve(runtime *resolver.Runtime,
opts ...interface{},
) (*resolver.ResolvedWindow, error) {
return &resolver.ResolvedWindow{
Begin: time.Now().Add(1 * time.Hour),
End: time.Now().Add(2 * time.Hour),
Expand All @@ -330,7 +332,9 @@ func (s maintenanceWindowInactiveStub) Resolve(runtime *resolver.Runtime, opts .

type maintenanceWindowActiveStub struct{}

func (s maintenanceWindowActiveStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow, error) {
func (s maintenanceWindowActiveStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow,
error,
) {
return &resolver.ResolvedWindow{
Begin: time.Now().Add(-1 * time.Hour),
End: time.Now().Add(1 * time.Hour),
Expand All @@ -339,15 +343,19 @@ func (s maintenanceWindowActiveStub) Resolve(runtime *resolver.Runtime, opts ...

type maintenanceWindowErrorStub struct{}

func (s maintenanceWindowErrorStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow, error) {
func (s maintenanceWindowErrorStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow,
error,
) {
return &resolver.ResolvedWindow{}, errors.New("test error")
}

type maintenanceWindowRuntimeArgStub struct {
receivedRuntime *resolver.Runtime
}

func (s maintenanceWindowRuntimeArgStub) Resolve(runtime *resolver.Runtime, opts ...interface{}) (*resolver.ResolvedWindow, error) {
func (s maintenanceWindowRuntimeArgStub) Resolve(runtime *resolver.Runtime,
opts ...interface{},
) (*resolver.ResolvedWindow, error) {
*s.receivedRuntime = *runtime

return &resolver.ResolvedWindow{}, nil
Expand Down
33 changes: 33 additions & 0 deletions internal/pkg/metrics/maintance_window.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
package metrics

import (
"github.com/prometheus/client_golang/prometheus"
ctrlmetrics "sigs.k8s.io/controller-runtime/pkg/metrics"
)

const (
// MetricMaintenanceWindowConfigReadSuccess is the name of the gauge that
// reports whether the maintenance window configuration was read successfully.
MetricMaintenanceWindowConfigReadSuccess = "maintenance_window_config_read_success"
)

// MaintenanceWindowMetrics groups the Prometheus collectors that describe the
// state of the maintenance window configuration.
type MaintenanceWindowMetrics struct {
// ConfigReadSuccessGauge is set by RecordConfigReadSuccess: 1 when the
// configuration was read successfully, 0 otherwise.
ConfigReadSuccessGauge prometheus.Gauge
}

// NewMaintenanceWindowMetrics creates the maintenance window metrics and
// registers the config-read gauge with the controller-runtime metrics registry.
//
// Registration goes through MustRegister, so a failed registration (for
// example registering the same metric twice) panics instead of returning an
// error.
func NewMaintenanceWindowMetrics() *MaintenanceWindowMetrics {
	configReadSuccess := prometheus.NewGauge(prometheus.GaugeOpts{
		Name: MetricMaintenanceWindowConfigReadSuccess,
		Help: "Indicates whether the maintenance window configuration was read successfully (1 for success, 0 for failure)",
	})
	ctrlmetrics.Registry.MustRegister(configReadSuccess)

	return &MaintenanceWindowMetrics{ConfigReadSuccessGauge: configReadSuccess}
}

// RecordConfigReadSuccess sets the config-read gauge to 1 when success is
// true and to 0 when it is false.
func (m *MaintenanceWindowMetrics) RecordConfigReadSuccess(success bool) {
	if success {
		m.ConfigReadSuccessGauge.Set(1.0)
		return
	}
	m.ConfigReadSuccessGauge.Set(0.0)
}
Loading
Loading