From 0b548618cc1a817de13eee806999afef9f825b51 Mon Sep 17 00:00:00 2001
From: Mahmoud Gaballah
Date: Tue, 1 Aug 2023 14:19:07 +0200
Subject: [PATCH] refactor EC2NodePoolBackend and decommission karpenter nodes
 based on provisioner tag

Signed-off-by: Mahmoud Gaballah
---
 pkg/updatestrategy/aws_ec2.go | 66 +++++++++++++++++++++++++----------
 provisioner/clusterpy.go      | 16 +++------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/pkg/updatestrategy/aws_ec2.go b/pkg/updatestrategy/aws_ec2.go
index c26231cc..7a401074 100644
--- a/pkg/updatestrategy/aws_ec2.go
+++ b/pkg/updatestrategy/aws_ec2.go
@@ -14,6 +14,8 @@ import (
 	"github.com/zalando-incubator/cluster-lifecycle-manager/pkg/util"
 )
 
+const karpenterProvisionerTag = "karpenter.sh/provisioner-name"
+
 type InstanceConfig struct {
 	UserData string
 	ImageID  string
@@ -85,7 +87,7 @@ func NewEC2NodePoolBackend(clusterID string, sess *session.Session, opts ...Opti
 // userData,ImageID and tags and 'outdated' for nodes with an outdated
 // configuration.
 func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*NodePool, error) {
-	instances, err := n.getInstances(nodePool)
+	instances, err := n.getInstances(n.filterWithNodePool(nodePool))
 	if err != nil {
 		return nil, fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 	}
@@ -127,24 +129,28 @@ func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*
 	}, nil
 }
 
-// getInstances lists all running instances of the node pool.
-func (n *EC2NodePoolBackend) getInstances(nodePool *api.NodePool) ([]*ec2.Instance, error) {
-	params := &ec2.DescribeInstancesInput{
-		Filters: []*ec2.Filter{
-			{
-				Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
-				Values: []*string{
-					aws.String(resourceLifecycleOwned),
-				},
+func (n *EC2NodePoolBackend) filterWithNodePool(nodePool *api.NodePool) []*ec2.Filter {
+	return []*ec2.Filter{
+		{
+			Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+			Values: []*string{
+				aws.String(resourceLifecycleOwned),
 			},
-			{
-				Name: aws.String("tag:" + nodePoolTag),
-				Values: []*string{
-					aws.String(nodePool.Name),
-				},
+		},
+		{
+			Name: aws.String("tag:" + nodePoolTag),
+			Values: []*string{
+				aws.String(nodePool.Name),
 			},
 		},
 	}
+}
+
+// getInstances lists all running EC2 instances matching the given filters.
+func (n *EC2NodePoolBackend) getInstances(filters []*ec2.Filter) ([]*ec2.Instance, error) {
+	params := &ec2.DescribeInstancesInput{
+		Filters: filters,
+	}
 
 	instances := make([]*ec2.Instance, 0)
 	err := n.ec2Client.DescribeInstancesPagesWithContext(context.TODO(), params, func(output *ec2.DescribeInstancesOutput, lastPage bool) bool {
@@ -200,8 +206,30 @@ func (n *EC2NodePoolBackend) Terminate(context.Context, *Node, bool) error {
 	return nil
 }
 
-func (n *EC2NodePoolBackend) Decommission(ctx context.Context, nodePool *api.NodePool) error {
-	instances, err := n.getInstances(nodePool)
+func (n *EC2NodePoolBackend) DecommissionNodePool(ctx context.Context, nodePool *api.NodePool) error {
+	filters := n.filterWithNodePool(nodePool)
+	return n.decommission(ctx, filters)
+}
+
+func (n *EC2NodePoolBackend) DecommissionKarpenterNodes(ctx context.Context) error {
+	return n.decommission(ctx, []*ec2.Filter{
+		{
+			Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+			Values: []*string{
+				aws.String(resourceLifecycleOwned),
+			},
+		},
+		{
+			Name: aws.String("tag-key"),
+			Values: []*string{
+				aws.String(karpenterProvisionerTag),
+			},
+		},
+	})
+}
+
+func (n *EC2NodePoolBackend) decommission(ctx context.Context, filters []*ec2.Filter) error {
+	instances, err := n.getInstances(filters)
 	if err != nil {
 		return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 	}
@@ -220,14 +248,14 @@
 	}
 	_, err = n.ec2Client.TerminateInstancesWithContext(ctx, params)
 	if err != nil {
-		return fmt.Errorf("failed to terminate EC2 instances of node pool '%s': %w", nodePool.Name, err)
+		return fmt.Errorf("failed to terminate EC2 instances matching the filters '%s': %w", filters, err)
 	}
 
 	// wait for all instances to be terminated
 	for {
 		select {
 		case <-time.After(15 * time.Second):
-			instances, err := n.getInstances(nodePool)
+			instances, err := n.getInstances(filters)
 			if err != nil {
 				return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 			}
diff --git a/provisioner/clusterpy.go b/provisioner/clusterpy.go
index 348c0a97..9ea50806 100644
--- a/provisioner/clusterpy.go
+++ b/provisioner/clusterpy.go
@@ -630,19 +630,13 @@ func (p *clusterpyProvisioner) Decommission(ctx context.Context, logger *log.Ent
 	if err != nil {
 		logger.Errorf("Unable to downscale the deployments, proceeding anyway: %s", err)
 	}
-	karpenterNodePoolBackend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
-	// decommission karpenter node-pools, since karpenter controller is decommissioned. we need to clean up ec2 resources
-	for _, nodePool := range cluster.NodePools {
-		if nodePool.Profile == karpenterNodePoolProfile {
-			logger.Infof("Decommissioning Node Pool: %s (profile: %s)", nodePool.Name, nodePool.Profile)
-			err := karpenterNodePoolBackend.Decommission(ctx, nodePool)
-			if err != nil {
-				return err
-			}
-		}
+	// decommission all Karpenter-provisioned EC2 instances, since the Karpenter controller itself is being decommissioned
+	ec2Backend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
+	err = ec2Backend.DecommissionKarpenterNodes(ctx)
+	if err != nil {
+		return err
 	}
-
 	// make E2E tests and deletions less flaky
 	// The problem is that we scale down kube-ingress-aws-controller deployment
 	// and just after that we delete CF stacks, but if the pod
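
Note on the filter-based decommissioning introduced above: DecommissionKarpenterNodes selects instances purely by tags (cluster ownership plus the presence of the karpenter.sh/provisioner-name key), independent of any node-pool definition. The standalone sketch below is illustrative only and not part of the patch; it lists the instances such a filter combination would match, assuming the conventional kubernetes.io/cluster/<cluster-id>=owned ownership tag (what clusterIDTagPrefix and resourceLifecycleOwned are taken to expand to here) and a hypothetical cluster ID.

// Illustrative sketch, not part of the patch: dry-run listing of the EC2
// instances the Karpenter decommissioning filters would target.
package main

import (
	"fmt"
	"log"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
)

const karpenterProvisionerTag = "karpenter.sh/provisioner-name"

func main() {
	clusterID := "example-cluster" // hypothetical cluster ID

	// Region and credentials are taken from the environment / shared config.
	sess := session.Must(session.NewSession())
	client := ec2.New(sess)

	params := &ec2.DescribeInstancesInput{
		Filters: []*ec2.Filter{
			{
				// Instances owned by the cluster (assumed expansion of
				// clusterIDTagPrefix + clusterID = resourceLifecycleOwned).
				Name:   aws.String("tag:kubernetes.io/cluster/" + clusterID),
				Values: []*string{aws.String("owned")},
			},
			{
				// Instances created by a Karpenter provisioner, regardless of the tag value.
				Name:   aws.String("tag-key"),
				Values: []*string{aws.String(karpenterProvisionerTag)},
			},
		},
	}

	err := client.DescribeInstancesPages(params, func(out *ec2.DescribeInstancesOutput, lastPage bool) bool {
		for _, res := range out.Reservations {
			for _, inst := range res.Instances {
				fmt.Printf("would decommission %s (state: %s)\n",
					aws.StringValue(inst.InstanceId), aws.StringValue(inst.State.Name))
			}
		}
		return true
	})
	if err != nil {
		log.Fatalf("failed to describe instances: %v", err)
	}
}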