refactor EC2NodePoolBackend and decommission karpenter nodes based on provisioner tag

Signed-off-by: Mahmoud Gaballah <[email protected]>
myaser committed Aug 1, 2023
1 parent c3a85cb commit cde87e8
Showing 3 changed files with 53 additions and 31 deletions.
2 changes: 1 addition & 1 deletion cmd/clm/main.go
@@ -26,7 +26,7 @@ import (

var (
provisionCmd = kingpin.Command("provision", "Provision a cluster.")
decommissionCmd = kingpin.Command("decommission", "Decommission a cluster.")
decommissionCmd = kingpin.Command("decommission", "decommission a cluster.")
controllerCmd = kingpin.Command("controller", "Run controller loop.")
version = "unknown"
)
66 changes: 47 additions & 19 deletions pkg/updatestrategy/aws_ec2.go
@@ -14,6 +14,8 @@ import (
"github.com/zalando-incubator/cluster-lifecycle-manager/pkg/util"
)

+ const karpenterProvisionerTag = "karpenter.sh/provisioner-name"
+
type InstanceConfig struct {
UserData string
ImageID string
@@ -85,7 +87,7 @@ func NewEC2NodePoolBackend(clusterID string, sess *session.Session, opts ...Opti
// userData,ImageID and tags and 'outdated' for nodes with an outdated
// configuration.
func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*NodePool, error) {
- instances, err := n.getInstances(nodePool)
+ instances, err := n.getInstances(n.filterWithNodePool(nodePool))
if err != nil {
return nil, fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
}
@@ -127,24 +129,28 @@ func (n *EC2NodePoolBackend) Get(ctx context.Context, nodePool *api.NodePool) (*
}, nil
}

- // getInstances lists all running instances of the node pool.
- func (n *EC2NodePoolBackend) getInstances(nodePool *api.NodePool) ([]*ec2.Instance, error) {
- params := &ec2.DescribeInstancesInput{
- Filters: []*ec2.Filter{
- {
- Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
- Values: []*string{
- aws.String(resourceLifecycleOwned),
- },
- },
- {
- Name: aws.String("tag:" + nodePoolTag),
- Values: []*string{
- aws.String(nodePool.Name),
- },
- },
- },
- }
+ func (n *EC2NodePoolBackend) filterWithNodePool(nodePool *api.NodePool) []*ec2.Filter {
+ return []*ec2.Filter{
+ {
+ Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+ Values: []*string{
+ aws.String(resourceLifecycleOwned),
+ },
+ },
+ {
+ Name: aws.String("tag:" + nodePoolTag),
+ Values: []*string{
+ aws.String(nodePool.Name),
+ },
+ },
+ }
+ }
+
+ // getInstances lists all running instances of the node pool.
+ func (n *EC2NodePoolBackend) getInstances(filters []*ec2.Filter) ([]*ec2.Instance, error) {
+ params := &ec2.DescribeInstancesInput{
+ Filters: filters,
+ }

instances := make([]*ec2.Instance, 0)
err := n.ec2Client.DescribeInstancesPagesWithContext(context.TODO(), params, func(output *ec2.DescribeInstancesOutput, lastPage bool) bool {
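
Note (illustration, not part of the commit): a minimal sketch of how the refactored pieces in the hunk above compose. filterWithNodePool builds the node-pool-scoped filter set and getInstances now accepts any filter set, so callers can query either by node pool or by arbitrary tags through the same helper. The wrapper method below is hypothetical and assumes it lives in the same package as aws_ec2.go with fmt imported; only getInstances, filterWithNodePool and karpenterProvisionerTag come from the diff.

    // listInstancesExample is a hypothetical helper, not part of the repository.
    func (n *EC2NodePoolBackend) listInstancesExample(nodePool *api.NodePool) error {
        // node-pool scoped: cluster ownership tag plus node pool name tag
        poolInstances, err := n.getInstances(n.filterWithNodePool(nodePool))
        if err != nil {
            return err
        }

        // tag scoped: any EC2 filter set can be passed directly, here
        // "instances that carry the karpenter provisioner tag"
        karpenterInstances, err := n.getInstances([]*ec2.Filter{
            {
                Name:   aws.String("tag-key"),
                Values: []*string{aws.String(karpenterProvisionerTag)},
            },
        })
        if err != nil {
            return err
        }

        fmt.Printf("node pool instances: %d, karpenter instances: %d\n", len(poolInstances), len(karpenterInstances))
        return nil
    }
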
@@ -200,8 +206,30 @@ func (n *EC2NodePoolBackend) Terminate(context.Context, *Node, bool) error {
return nil
}

- func (n *EC2NodePoolBackend) Decommission(ctx context.Context, nodePool *api.NodePool) error {
- instances, err := n.getInstances(nodePool)
+ func (n *EC2NodePoolBackend) DecommissionNodePool(ctx context.Context, nodePool *api.NodePool) error {
+ filters := n.filterWithNodePool(nodePool)
+ return n.decommission(ctx, filters)
+ }
+
+ func (n *EC2NodePoolBackend) DecommissionKarpenterNodes(ctx context.Context) error {
+ return n.decommission(ctx, []*ec2.Filter{
+ {
+ Name: aws.String("tag:" + clusterIDTagPrefix + n.clusterID),
+ Values: []*string{
+ aws.String(resourceLifecycleOwned),
+ },
+ },
+ {
+ Name: aws.String("tag-key"),
+ Values: []*string{
+ aws.String(karpenterProvisionerTag),
+ },
+ },
+ })
+ }
+
+ func (n *EC2NodePoolBackend) decommission(ctx context.Context, filters []*ec2.Filter) error {
+ instances, err := n.getInstances(filters)
if err != nil {
return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
}
@@ -220,14 +248,14 @@ func (n *EC2NodePoolBackend) Decommission(ctx context.Context, nodePool *api.Nod
}
_, err = n.ec2Client.TerminateInstancesWithContext(ctx, params)
if err != nil {
return fmt.Errorf("failed to terminate EC2 instances of node pool '%s': %w", nodePool.Name, err)
return fmt.Errorf("failed to terminate EC2 instances of the filters '%s': %w", filters, err)
}

// wait for all instances to be terminated
for {
select {
case <-time.After(15 * time.Second):
- instances, err := n.getInstances(nodePool)
+ instances, err := n.getInstances(filters)
if err != nil {
return fmt.Errorf("failed to list EC2 instances of the node pool: %w", err)
}
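
For reference (illustration, not part of the commit): a self-contained sketch of the EC2 filter semantics that DecommissionKarpenterNodes relies on. A "tag:<key>" filter matches a specific tag value, while a "tag-key" filter matches the mere presence of a tag, so instances launched by any karpenter provisioner are selected regardless of the provisioner's name. The cluster ownership tag below ("kubernetes.io/cluster/example" = "owned") is an assumed stand-in for clusterIDTagPrefix + clusterID and resourceLifecycleOwned; the program itself is hypothetical.

    // Hypothetical, standalone example (not from the repository).
    package main

    import (
        "context"
        "fmt"
        "log"

        "github.com/aws/aws-sdk-go/aws"
        "github.com/aws/aws-sdk-go/aws/session"
        "github.com/aws/aws-sdk-go/service/ec2"
    )

    func main() {
        sess := session.Must(session.NewSession())
        client := ec2.New(sess)

        filters := []*ec2.Filter{
            {
                // value filter: the tag must exist and its value must be "owned";
                // the tag name is an assumed example of a cluster ownership tag
                Name:   aws.String("tag:kubernetes.io/cluster/example"),
                Values: []*string{aws.String("owned")},
            },
            {
                // existence filter: the tag must exist, any value matches
                Name:   aws.String("tag-key"),
                Values: []*string{aws.String("karpenter.sh/provisioner-name")},
            },
        }

        count := 0
        err := client.DescribeInstancesPagesWithContext(context.TODO(),
            &ec2.DescribeInstancesInput{Filters: filters},
            func(out *ec2.DescribeInstancesOutput, lastPage bool) bool {
                for _, reservation := range out.Reservations {
                    count += len(reservation.Instances)
                }
                return true
            })
        if err != nil {
            log.Fatalf("describe instances: %v", err)
        }
        fmt.Printf("karpenter-provisioned instances owned by the cluster: %d\n", count)
    }
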
16 changes: 5 additions & 11 deletions provisioner/clusterpy.go
@@ -630,19 +630,13 @@ func (p *clusterpyProvisioner) Decommission(ctx context.Context, logger *log.Ent
if err != nil {
logger.Errorf("Unable to downscale the deployments, proceeding anyway: %s", err)
}
- karpenterNodePoolBackend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
- // decommission karpenter node-pools, since karpenter controller is decommissioned. we need to clean up ec2 resources
- for _, nodePool := range cluster.NodePools {
- if nodePool.Profile == karpenterNodePoolProfile {
- logger.Infof("Decommissioning Node Pool: %s (profile: %s)", nodePool.Name, nodePool.Profile)
-
- err := karpenterNodePoolBackend.Decommission(ctx, nodePool)
- if err != nil {
- return err
- }
- }
+ // decommission karpenter node-pools, since karpenter controller is decommissioned. we need to clean up ec2 resources
+ ec2Backend := updatestrategy.NewEC2NodePoolBackend(cluster.ID, awsAdapter.session)
+ err = ec2Backend.DecommissionKarpenterNodes(ctx)
+ if err != nil {
+ return err
}

// make E2E tests and deletions less flaky
// The problem is that we scale down kube-ingress-aws-controller deployment
// and just after that we delete CF stacks, but if the pod
