From 0b548618cc1a817de13eee806999afef9f825b51 Mon Sep 17 00:00:00 2001
From: Mahmoud Gaballah
Date: Tue, 1 Aug 2023 14:19:07 +0200
Subject: [PATCH] refactor EC2NodePoolBackend and decommission karpenter nodes
 based on provisioner tag

Signed-off-by: Mahmoud Gaballah
---
 pkg/updatestrategy/aws_ec2.go | 66 +++++++++++++++++++++++++----------
 provisioner/clusterpy.go      | 16 +++------
 2 files changed, 52 insertions(+), 30 deletions(-)

diff --git a/pkg/updatestrategy/aws_ec2.go b/pkg/updatestrategy/aws_ec2.go
index c26231cc..7a401074 100644
--- a/pkg/updatestrategy/aws_ec2.go
+++ b/pkg/updatestrategy/aws_ec2.go
@@ -14,6 +14,8 @@ import (
 	"github.com/zalando-incubator/cluster-lifecycle-manager/pkg/util"
 )
 
+const karpenterProvisionerTag = "karpenter.sh/provisioner-name"
+
 type InstanceConfig struct {
 	UserData string
 	ImageID  string
@@ -85,7 +87,7 @@ func NewEC2NodePoolBackend(clusterID string, sess *session.Session, opts ...Opti
 // userData,ImageID and tags and 'outdated' for nodes with an outdated
 // configuration.
 func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*NodePool, error) {
-	instances, err := n.getInstances(nodePool)
+	instances, err := n.getInstances(n.filterWithNodePool(nodePool))
 	if err != nil {
 		return nil, fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 	}
@@ -127,24 +129,28 @@ func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*
 	}, nil
 }
 
-// getInstances lists all running instances of the node pool.
-func (n *EC2NodePoolBackend) getInstances(nodePool *api.NodePool) ([]*ec2.Instance, error) {
-	params := &ec2.DescribeInstancesInput{
-		Filters: []*ec2.Filter{
-			{
-				Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
-				Values: []*string{
-					aws.String(resourceLifecycleOwned),
-				},
+func (n *EC2NodePoolBackend) filterWithNodePool(nodePool *api.NodePool) []*ec2.Filter {
+	return []*ec2.Filter{
+		{
+			Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+			Values: []*string{
+				aws.String(resourceLifecycleOwned),
 			},
-			{
-				Name: aws.String("tag:" + nodePoolTag),
-				Values: []*string{
-					aws.String(nodePool.Name),
-				},
+		},
+		{
+			Name: aws.String("tag:" + nodePoolTag),
+			Values: []*string{
+				aws.String(nodePool.Name),
 			},
 		},
 	}
+}
+
+// getInstances lists all running EC2 instances matching the given filters.
+func (n *EC2NodePoolBackend) getInstances(filters []*ec2.Filter) ([]*ec2.Instance, error) {
+	params := &ec2.DescribeInstancesInput{
+		Filters: filters,
+	}
 
 	instances := make([]*ec2.Instance, 0)
 	err := n.ec2Client.DescribeInstancesPagesWithContext(context.TODO(), params, func(output *ec2.DescribeInstancesOutput, lastPage bool) bool {
@@ -200,8 +206,30 @@ func (n *EC2NodePoolBackend) Terminate(context.Context, *Node, bool) error {
 	return nil
 }
 
-func (n *EC2NodePoolBackend) Decommission(ctx context.Context, nodePool *api.NodePool) error {
-	instances, err := n.getInstances(nodePool)
+func (n *EC2NodePoolBackend) DecommissionNodePool(ctx context.Context, nodePool *api.NodePool) error {
+	filters := n.filterWithNodePool(nodePool)
+	return n.decommission(ctx, filters)
+}
+
+func (n *EC2NodePoolBackend) DecommissionKarpenterNodes(ctx context.Context) error {
+	return n.decommission(ctx, []*ec2.Filter{
+		{
+			Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+			Values: []*string{
+				aws.String(resourceLifecycleOwned),
+			},
+		},
+		{
+			Name: aws.String("tag-key"),
+			Values: []*string{
+				aws.String(karpenterProvisionerTag),
+			},
+		},
+	})
+}
+
+func (n *EC2NodePoolBackend) decommission(ctx context.Context, filters []*ec2.Filter) error {
+	instances, err := n.getInstances(filters)
 	if err != nil {
 		return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 	}
@@ -220,14 +248,14 @@
 	}
 	_, err = n.ec2Client.TerminateInstancesWithContext(ctx, params)
 	if err != nil {
-		return fmt.Errorf("failed to terminate EC2 instances of node pool '%s': %w", nodePool.Name, err)
+		return fmt.Errorf("failed to terminate EC2 instances matching the filters '%s': %w", filters, err)
 	}
 
 	// wait for all instances to be terminated
 	for {
 		select {
 		case <-time.After(15 * time.Second):
-			instances, err := n.getInstances(nodePool)
+			instances, err := n.getInstances(filters)
 			if err != nil {
 				return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
 			}
diff --git a/provisioner/clusterpy.go b/provisioner/clusterpy.go
index 348c0a97..9ea50806 100644
--- a/provisioner/clusterpy.go
+++ b/provisioner/clusterpy.go
@@ -630,19 +630,13 @@ func (p *clusterpyProvisioner) Decommission(ctx context.Context, logger *log.Ent
 	if err != nil {
 		logger.Errorf("Unable to downscale the deployments, proceeding anyway: %s", err)
 	}
-	karpenterNodePoolBackend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
-	// decommission karpenter node-pools, since karpenter controller is decommissioned. we need to clean up ec2 resources
-	for _, nodePool := range cluster.NodePools {
-		if nodePool.Profile == karpenterNodePoolProfile {
-			logger.Infof("Decommissioning Node Pool: %s (profile: %s)", nodePool.Name, nodePool.Profile)
-			err := karpenterNodePoolBackend.Decommission(ctx, nodePool)
-			if err != nil {
-				return err
-			}
-		}
+	// decommission all Karpenter-provisioned EC2 instances, since the Karpenter controller itself is being decommissioned
+	ec2Backend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
+	err = ec2Backend.DecommissionKarpenterNodes(ctx)
+	if err != nil {
+		return err
 	}
-
 	// make E2E tests and deletions less flaky
 	// The problem is that we scale down kube-ingress-aws-controller deployment
 	// and just after that we delete CF stacks, but if the pod
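
Note on the filter-based decommissioning introduced above: DecommissionKarpenterNodes selects instances purely by tags (cluster ownership plus the presence of the karpenter.sh/provisioner-name key), independent of any node-pool definition. The standalone sketch below is illustrative only and not part of the patch; it lists the instances such a filter combination would match, assuming the conventional kubernetes.io/cluster/<cluster-id>=owned ownership tag (what clusterIDTagPrefix and resourceLifecycleOwned are taken to expand to here) and a hypothetical cluster ID.

// Illustrative sketch, not part of the patch: dry-run listing of the EC2
// instances the Karpenter decommissioning filters would target.
package main

import (
	"fmt"
	"log"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/session"
	"github.com/aws/aws-sdk-go/service/ec2"
)

const karpenterProvisionerTag = "karpenter.sh/provisioner-name"

func main() {
	clusterID := "example-cluster" // hypothetical cluster ID

	// Region and credentials are taken from the environment / shared config.
	sess := session.Must(session.NewSession())
	client := ec2.New(sess)

	params := &ec2.DescribeInstancesInput{
		Filters: []*ec2.Filter{
			{
				// Instances owned by the cluster (assumed expansion of
				// clusterIDTagPrefix + clusterID = resourceLifecycleOwned).
				Name:   aws.String("tag:kubernetes.io/cluster/" + clusterID),
				Values: []*string{aws.String("owned")},
			},
			{
				// Instances created by a Karpenter provisioner, regardless of the tag value.
				Name:   aws.String("tag-key"),
				Values: []*string{aws.String(karpenterProvisionerTag)},
			},
		},
	}

	err := client.DescribeInstancesPages(params, func(out *ec2.DescribeInstancesOutput, lastPage bool) bool {
		for _, res := range out.Reservations {
			for _, inst := range res.Instances {
				fmt.Printf("would decommission %s (state: %s)\n",
					aws.StringValue(inst.InstanceId), aws.StringValue(inst.State.Name))
			}
		}
		return true
	})
	if err != nil {
		log.Fatalf("failed to describe instances: %v", err)
	}
}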