From fcb87109487f172c206d41372b510684bb237c46 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 11:44:06 +0300 Subject: [PATCH 01/10] Updated clusterrole.yaml to include permissions for coordination.k8s.io leases, modified daemonset.yaml for rolling update strategy and added tolerations, and adjusted resource limits in values.yaml. --- chart/templates/clusterrole.yaml | 2 +- chart/templates/daemonset.yaml | 9 +++++++++ chart/values.yaml | 12 ++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/chart/templates/clusterrole.yaml b/chart/templates/clusterrole.yaml index f54b56b..a24da84 100644 --- a/chart/templates/clusterrole.yaml +++ b/chart/templates/clusterrole.yaml @@ -11,5 +11,5 @@ rules: verbs: [ "get" ] - apiGroups: [ "coordination.k8s.io" ] resources: [ "leases" ] - verbs: [ "create", "get", "delete" ] + verbs: [ "create", "delete", "get" ] {{- end }} diff --git a/chart/templates/daemonset.yaml b/chart/templates/daemonset.yaml index 9287b7f..8d11e50 100644 --- a/chart/templates/daemonset.yaml +++ b/chart/templates/daemonset.yaml @@ -8,6 +8,10 @@ spec: selector: matchLabels: app.kubernetes.io/name: {{ include "kubeip.name" . 
}} + updateStrategy: + type: RollingUpdate + rollingUpdate: + maxUnavailable: 1 template: metadata: labels: @@ -20,6 +24,11 @@ spec: {{- if .Values.daemonSet.nodeSelector }} {{- toYaml .Values.daemonSet.nodeSelector | nindent 8 }} {{- end }} + tolerations: + - operator: "Exists" + effect: "NoSchedule" + - operator: "Exists" + effect: "NoExecute" containers: - name: kubeip image: "{{ .Values.image.repository }}" diff --git a/chart/values.yaml b/chart/values.yaml index cd96d49..08dca3c 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -17,10 +17,10 @@ serviceAccount: name: kubeip-service-account annotations: gcpServiceAccountEmail: kubeip-service-account@workload-id-117715.iam.gserviceaccount.com -# annotations: -# awsRoleArn: "your-aws-role-arn" -# gcpServiceAccountEmail: "your-google-service-account-email" - + # annotations: + # awsRoleArn: "your-aws-role-arn" + # gcpServiceAccountEmail: "your-google-service-account-email" + # Role-Based Access Control (RBAC) configuration. rbac: @@ -40,3 +40,7 @@ daemonSet: resources: requests: cpu: 100m + memory: 64Mi + limits: + cpu: 100m + memory: 128Mi From 801f69da381278b76ad52d7edc388da5a5cf54ac Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 11:38:36 +0300 Subject: [PATCH 02/10] Updated main.go to introduce a non-blocking delay with loopTicker for better CPU usage efficiency. 
--- cmd/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/main.go b/cmd/main.go index 98e4b96..10a5a31 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,6 +28,7 @@ const ( unassignTimeout = 5 * time.Minute kubeipLockName = "kubeip-lock" defaultLeaseDuration = 5 + loopInterval = 100 * time.Millisecond ) var ( @@ -177,6 +178,10 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { } }() + // Create a ticker for non-blocking delay + loopTicker := time.NewTicker(loopInterval) + defer loopTicker.Stop() + for { select { case err = <-errorCh: @@ -202,6 +207,9 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { log.Infof("static public IP address released") } return nil + case <-loopTicker.C: + // Wait for the next tick before continuing the loop + // This case prevents high CPU usage by introducing a non-blocking delay } } } From 7c685f391d8bd23599a8a07200c9126335412ab2 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 11:56:03 +0300 Subject: [PATCH 03/10] Updated resource requests and limits for CPU and memory in EKS and GKE configurations. 
--- examples/aws/eks.tf | 7 ++++++- examples/gcp/gke.tf | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/aws/eks.tf b/examples/aws/eks.tf index f3aa18a..90529dc 100644 --- a/examples/aws/eks.tf +++ b/examples/aws/eks.tf @@ -260,7 +260,12 @@ resource "kubernetes_daemonset" "kubeip_daemonset" { } resources { requests = { - cpu = "100m" + cpu = "100m" + memory = "64Mi" + } + limits = { + cpu = "100m" + memory = "128Mi" } } } diff --git a/examples/gcp/gke.tf b/examples/gcp/gke.tf index a6a12b0..4275882 100644 --- a/examples/gcp/gke.tf +++ b/examples/gcp/gke.tf @@ -336,7 +336,12 @@ resource "kubernetes_daemonset" "kubeip_daemonset" { } resources { requests = { - cpu = "100m" + cpu = "100m" + memory = "64Mi" + } + limits = { + cpu = "100m" + memory = "128Mi" } } } From 24941af468fbe694c2cb7586e6bce8bfbd7d0bf8 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 12:13:22 +0300 Subject: [PATCH 04/10] Refactor error handling in main loop to correctly handle assignment errors --- cmd/main.go | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 10a5a31..56bc246 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,7 +28,6 @@ const ( unassignTimeout = 5 * time.Minute kubeipLockName = "kubeip-lock" defaultLeaseDuration = 5 - loopInterval = 100 * time.Millisecond ) var ( @@ -178,15 +177,11 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { } }() - // Create a ticker for non-blocking delay - loopTicker := time.NewTicker(loopInterval) - defer loopTicker.Stop() - for { select { - case err = <-errorCh: - if err != nil { - return errors.Wrap(err, "assigning static public IP address") + case assignErr, ok := <-errorCh: + if ok && assignErr != nil { + return errors.Wrap(assignErr, "assigning static public IP address") } case <-ctx.Done(): log.Infof("kubeip agent gracefully stopped") @@ -207,9 +202,6 @@ func run(c context.Context, log *logrus.Entry, cfg 
*config.Config) error { log.Infof("static public IP address released") } return nil - case <-loopTicker.C: - // Wait for the next tick before continuing the loop - // This case prevents high CPU usage by introducing a non-blocking delay } } } From 54d9d88f86146abc1e0927a2c2be67050eb7bd67 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 12:28:17 +0300 Subject: [PATCH 05/10] Updated resource requests and limits for CPU and memory in AWS EKS and GCP GKE configurations. --- examples/aws/eks.tf | 8 ++------ examples/gcp/gke.tf | 8 ++------ 2 files changed, 4 insertions(+), 12 deletions(-) diff --git a/examples/aws/eks.tf b/examples/aws/eks.tf index 90529dc..eaa7432 100644 --- a/examples/aws/eks.tf +++ b/examples/aws/eks.tf @@ -260,12 +260,8 @@ resource "kubernetes_daemonset" "kubeip_daemonset" { } resources { requests = { - cpu = "100m" - memory = "64Mi" - } - limits = { - cpu = "100m" - memory = "128Mi" + cpu = "10m" + memory = "32Mi" } } } diff --git a/examples/gcp/gke.tf b/examples/gcp/gke.tf index 4275882..0b37eaf 100644 --- a/examples/gcp/gke.tf +++ b/examples/gcp/gke.tf @@ -336,12 +336,8 @@ resource "kubernetes_daemonset" "kubeip_daemonset" { } resources { requests = { - cpu = "100m" - memory = "64Mi" - } - limits = { - cpu = "100m" - memory = "128Mi" + cpu = "10m" + memory = "32Mi" } } } From 25e70770bc924546d910fe951761764aab692744 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 12:44:27 +0300 Subject: [PATCH 06/10] Refactor loop interval to use loopTicker for non-blocking delay in main.go --- cmd/main.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/main.go b/cmd/main.go index 56bc246..8e906d6 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,6 +28,7 @@ const ( unassignTimeout = 5 * time.Minute kubeipLockName = "kubeip-lock" defaultLeaseDuration = 5 + loopInterval = 250 * time.Millisecond ) var ( @@ -177,6 +178,10 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { } }() + // 
Create a ticker for non-blocking delay + loopTicker := time.NewTicker(loopInterval) + defer loopTicker.Stop() + for { select { case assignErr, ok := <-errorCh: @@ -202,6 +207,9 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { log.Infof("static public IP address released") } return nil + case <-loopTicker.C: + // Wait for the next tick before continuing the loop + // This case prevents high CPU usage by introducing a non-blocking delay } } } From 2bc8c1b3f26e8acec0e2f1bad39ba4f953bcc863 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 12:46:27 +0300 Subject: [PATCH 07/10] Add nolint:gocyclo directive to the run function in main.go --- cmd/main.go | 1 + 1 file changed, 1 insertion(+) diff --git a/cmd/main.go b/cmd/main.go index 8e906d6..000feb9 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -137,6 +137,7 @@ func assignAddress(c context.Context, log *logrus.Entry, client kubernetes.Inter return errors.New("reached maximum number of retries") } +//nolint:gocyclo func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { ctx, cancel := context.WithCancel(c) defer cancel() From 2ec20cede332ad20f176cc4e08ebd1f5b85c492f Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 13:08:41 +0300 Subject: [PATCH 08/10] Refactor loopTicker to use time.Sleep instead to prevent high CPU usage --- cmd/main.go | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index 000feb9..655a415 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,7 +28,7 @@ const ( unassignTimeout = 5 * time.Minute kubeipLockName = "kubeip-lock" defaultLeaseDuration = 5 - loopInterval = 250 * time.Millisecond + pauseInterval = 3 * time.Second ) var ( @@ -179,10 +179,6 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { } }() - // Create a ticker for non-blocking delay - loopTicker := time.NewTicker(loopInterval) - defer loopTicker.Stop() - for { 
select { case assignErr, ok := <-errorCh: @@ -208,9 +204,8 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { log.Infof("static public IP address released") } return nil - case <-loopTicker.C: - // Wait for the next tick before continuing the loop - // This case prevents high CPU usage by introducing a non-blocking delay + default: + time.Sleep(pauseInterval) } } } From d7a8814f8b16deca631716045050afecc91628d3 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 13:28:00 +0300 Subject: [PATCH 09/10] Remove nolint:gocyclo directive from the run function in main.go --- cmd/main.go | 1 - 1 file changed, 1 deletion(-) diff --git a/cmd/main.go b/cmd/main.go index 655a415..c518c3d 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -137,7 +137,6 @@ func assignAddress(c context.Context, log *logrus.Entry, client kubernetes.Inter return errors.New("reached maximum number of retries") } -//nolint:gocyclo func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { ctx, cancel := context.WithCancel(c) defer cancel() From 82209e0f3400e2f56dcf4ff5243dc4fad78c5cd1 Mon Sep 17 00:00:00 2001 From: Alexei Ledenev Date: Thu, 4 Apr 2024 14:51:34 +0300 Subject: [PATCH 10/10] Refactor main.go to improve readability and error handling. Add log messages for better tracking of agent operations. 
--- cmd/main.go | 69 ++++++++++++++++++++++++----------------------------- 1 file changed, 31 insertions(+), 38 deletions(-) diff --git a/cmd/main.go b/cmd/main.go index c518c3d..3256a14 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -28,7 +28,6 @@ const ( unassignTimeout = 5 * time.Minute kubeipLockName = "kubeip-lock" defaultLeaseDuration = 5 - pauseInterval = 3 * time.Second ) var ( @@ -140,6 +139,7 @@ func assignAddress(c context.Context, log *logrus.Entry, client kubernetes.Inter func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { ctx, cancel := context.WithCancel(c) defer cancel() + // add debug mode to context if cfg.DevelopMode { ctx = context.WithValue(ctx, developModeKey, true) @@ -168,45 +168,37 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { if err != nil { return errors.Wrap(err, "initializing assigner") } - // assign static public IP address - errorCh := make(chan error, 1) // buffered channel to avoid goroutine leak - go func() { - defer close(errorCh) // close the channel when the goroutine exits to avoid goroutine leak - e := assignAddress(ctx, log, clientset, assigner, n, cfg) - if e != nil { - errorCh <- e - } - }() - for { - select { - case assignErr, ok := <-errorCh: - if ok && assignErr != nil { - return errors.Wrap(assignErr, "assigning static public IP address") - } - case <-ctx.Done(): - log.Infof("kubeip agent gracefully stopped") - if cfg.ReleaseOnExit { - log.Infof("releasing static public IP address") - err = func() error { - releaseCtx, releaseCancel := context.WithTimeout(context.Background(), unassignTimeout) // release the static public IP address within 5 minutes - defer releaseCancel() - // use a different context for releasing the static public IP address since the main context is canceled - if err = assigner.Unassign(releaseCtx, n.Instance, n.Zone); err != nil { - return errors.Wrap(err, "failed to release static public IP address") - } - return nil - }() - if err != nil { - 
return err //nolint:wrapcheck - } - log.Infof("static public IP address released") - } - return nil - default: - time.Sleep(pauseInterval) + err = assignAddress(ctx, log, clientset, assigner, n, cfg) + if err != nil { + return errors.Wrap(err, "assigning static public IP address") + } + + // pause the agent to prevent it from exiting immediately after assigning the static public IP address + // wait for the context to be done: SIGTERM, SIGINT + <-ctx.Done() + log.Infof("shutting down kubeip agent") + + // release the static public IP address on exit + if cfg.ReleaseOnExit { + log.Infof("releasing static public IP address") + if releaseErr := releaseIP(assigner, n); releaseErr != nil { //nolint:contextcheck + return releaseErr } + log.Infof("static public IP address released") } + return nil +} + +func releaseIP(assigner address.Assigner, n *types.Node) error { + releaseCtx, releaseCancel := context.WithTimeout(context.Background(), unassignTimeout) + defer releaseCancel() + + if err := assigner.Unassign(releaseCtx, n.Instance, n.Zone); err != nil { + return errors.Wrap(err, "failed to release static public IP address") + } + + return nil } func runCmd(c *cli.Context) error { @@ -216,7 +208,8 @@ func runCmd(c *cli.Context) error { cfg := config.NewConfig(c) if err := run(ctx, log, cfg); err != nil { - log.Fatalf("eks-lens agent failed: %v", err) + log.WithError(err).Error("error running kubeip agent") + return err } return nil