diff --git a/.github/actions/e2e/cleanup/action.yaml b/.github/actions/e2e/cleanup/action.yaml
index 84bc1ea8cb42..c62a85cc7e4c 100644
--- a/.github/actions/e2e/cleanup/action.yaml
+++ b/.github/actions/e2e/cleanup/action.yaml
@@ -30,6 +30,13 @@ runs:
     - uses: ./.github/actions/e2e/install-eksctl
       with:
        eksctl_version: v0.147.0
+    - name: delete-security-group
+      shell: bash
+      run: |
+        sgID=$(aws ec2 describe-security-groups \
+          --filters Name=tag:karpenter.sh/discovery,Values=drift-229136498 \
+          --query "SecurityGroups[*].{ID:GroupId}" | jq -r '.[].ID')
+        aws ec2 delete-security-group --group-id "${sgID}"
     - name: delete-cluster
       shell: bash
       run: |
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index de85f30407f5..cd2a020a5bc1 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -119,7 +119,7 @@ jobs:
       - name: run the ${{ inputs.suite }} test suite
         run: |
           aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }}
-          TEST_SUITE="${{ inputs.suite }}" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests
+          FOCUS="should deprovision nodes that have drifted due to securitygroup" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests
       - name: notify slack of success or failure
         uses: ./.github/actions/e2e/slack/notify
         if: (success() || failure()) && inputs.event_name != 'workflow_run' && inputs.event_name != 'conformance'
diff --git a/.github/workflows/sweeper.yaml b/.github/workflows/sweeper.yaml
index 0ad247a5ccbd..24745257198b 100644
--- a/.github/workflows/sweeper.yaml
+++ b/.github/workflows/sweeper.yaml
@@ -9,6 +9,10 @@ permissions:
 jobs:
   sweeper:
     if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch'
+    strategy:
+      fail-fast: false
+      matrix:
+        region: [us-east-2, us-west-2, eu-west-1]
     runs-on: ubuntu-latest
     steps:
       - uses: actions/checkout@v3
@@ -16,7 +20,7 @@ jobs:
         uses: aws-actions/configure-aws-credentials@v2
         with:
           role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
-          aws-region: ${{ vars.AWS_REGION }}
+          aws-region: ${{ matrix.region }}
       - uses: actions/setup-go@v4
         with:
           go-version-file: test/hack/cleanup/go.mod
diff --git a/test/hack/cleanup/go.mod b/test/hack/cleanup/go.mod
index 6913122a71d0..10e2a3b0e703 100644
--- a/test/hack/cleanup/go.mod
+++ b/test/hack/cleanup/go.mod
@@ -7,6 +7,7 @@ require (
 	github.com/aws/aws-sdk-go-v2/service/cloudformation v1.30.0
 	github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2
 	github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0
+	github.com/aws/aws-sdk-go-v2/service/iam v1.21.0
 	github.com/samber/lo v1.38.1
 	go.uber.org/zap v1.24.0
 )
diff --git a/test/hack/cleanup/go.sum b/test/hack/cleanup/go.sum
index 838f7547d5f3..c75c22fa2e42 100644
--- a/test/hack/cleanup/go.sum
+++ b/test/hack/cleanup/go.sum
@@ -18,6 +18,8 @@ github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2 h1:PWGu2JhCb/XJlJ7SSFJq7
 github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2/go.mod h1:2KOZkkzMDZCo/aLzPhys06mHNkiU74u85aMJA3PLRvg=
 github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0 h1:P4dyjm49F2kKws0FpouBC6fjVImACXKt752+CWa01lM=
 github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0/go.mod h1:tIctCeX9IbzsUTKHt53SVEcgyfxV2ElxJeEB+QUbc4M=
+github.com/aws/aws-sdk-go-v2/service/iam v1.21.0 h1:8hEpu60CWlrp7iEBUFRZhgPoX6+gadaGL1sD4LoRYS0=
+github.com/aws/aws-sdk-go-v2/service/iam v1.21.0/go.mod h1:aQZ8BI+reeaY7RI/QQp7TKCSUHOesTdrzzylp3CW85c=
 github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.28 h1:bkRyG4a929RCnpVSTvLM2j/T4ls015ZhhYApbmYs15s=
 github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.28/go.mod h1:jj7znCIg05jXlaGBlFMGP8+7UN3VtCkRBG2spnmRQkU=
 github.com/aws/aws-sdk-go-v2/service/sso v1.12.12 h1:nneMBM2p79PGWBQovYO/6Xnc2ryRMw3InnDJq1FHkSY=
diff --git a/test/hack/cleanup/main.go b/test/hack/cleanup/main.go
index 7890e34dda72..26517eef327e 100644
--- a/test/hack/cleanup/main.go
+++ b/test/hack/cleanup/main.go
@@ -16,6 +16,8 @@ package main
 
 import (
 	"context"
+	"fmt"
+	"strings"
 	"time"
 
 	"github.com/aws/aws-sdk-go-v2/config"
@@ -25,6 +27,7 @@ import (
 	cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
 	"github.com/aws/aws-sdk-go-v2/service/ec2"
 	ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
+	"github.com/aws/aws-sdk-go-v2/service/iam"
 	"github.com/samber/lo"
 	"go.uber.org/zap"
 )
@@ -35,6 +38,7 @@ const (
 
 	karpenterProvisionerNameTag = "karpenter.sh/provisioner-name"
 	karpenterLaunchTemplateTag  = "karpenter.k8s.aws/cluster"
+	karpenterSecurityGroupTag   = "karpenter.sh/discovery"
 	githubRunURLTag             = "github.com/run-url"
 )
 
@@ -51,9 +55,17 @@ func main() {
 	ec2Client := ec2.NewFromConfig(cfg)
 	cloudFormationClient := cloudformation.NewFromConfig(cfg)
 	cloudWatchClient := cloudwatch.NewFromConfig(cfg)
+	iamClient := iam.NewFromConfig(cfg)
 
-	// Terminate any old instances that were provisioned by Karpenter as part of testing
-	// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+	cleanupInstances(ctx, ec2Client, cloudWatchClient, expirationTime, logger)
+	cleanupStack(ctx, ec2Client, cloudWatchClient, cloudFormationClient, expirationTime, logger)
+	cleanupLaunchTemplates(ctx, ec2Client, cloudWatchClient, expirationTime, logger)
+	cleanupOIDCProvider(ctx, iamClient, cloudWatchClient, expirationTime, logger)
+}
+
+// Terminate any old instances that were provisioned by Karpenter as part of testing
+// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+func cleanupInstances(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
 	ids := getOldInstances(ctx, ec2Client, expirationTime)
 	logger.With("ids", ids, "count", len(ids)).Infof("discovered test instances to delete")
 	if len(ids) > 0 {
@@ -68,13 +80,37 @@ func main() {
 			}
 		}
 	}
+}
 
-	// Terminate any old stacks that were provisioned as part of testing
-	// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+// Terminate any old stacks that were provisioned as part of testing
+// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+func cleanupStack(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, cloudFormationClient *cloudformation.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
+	sgInAccount := lo.Must(ec2Client.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
+		Filters: []ec2types.Filter{
+			{
+				Name:   lo.ToPtr("group-name"),
+				Values: []string{"security-group-drift"},
+			},
+		},
+	}))
 	names := getOldStacks(ctx, cloudFormationClient, expirationTime)
 	logger.With("names", names, "count", len(names)).Infof("discovered test stacks to delete")
 	deleted := 0
 	for i := range names {
+		if strings.HasSuffix(names[i], "-cluster") && strings.Contains(names[i], "drift") {
+			stackName := strings.Split(names[i], "-")
+			sgName := fmt.Sprintf("drift-%s", stackName[2])
+			sg, _ := lo.Find(sgInAccount.SecurityGroups, func(sg ec2types.SecurityGroup) bool {
+				return *sg.Tags[0].Key == karpenterSecurityGroupTag && *sg.Tags[0].Value == sgName
+			})
+			if _, err := ec2Client.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{
+				GroupId: sg.GroupId,
+			}); err != nil {
+				logger.With("name", names[i]).Errorf("deleting test stack sg, %v", err)
+			} else {
+				logger.With("name", names[i]).Infof("deleted test stack sg")
+			}
+		}
 		if _, err := cloudFormationClient.DeleteStack(ctx, &cloudformation.DeleteStackInput{
 			StackName: lo.ToPtr(names[i]),
 		}); err != nil {
@@ -87,11 +123,14 @@ func main() {
 	if err := fireMetric(ctx, cloudWatchClient, "StacksDeleted", float64(deleted)); err != nil {
 		logger.With("name", "StacksDeleted").Errorf("firing metric, %v", err)
 	}
+}
 
-	// Terminate any old launch templates that were managed by Karpenter and were provisioned as part of testing
-	names = getOldLaunchTemplates(ctx, ec2Client, expirationTime)
+// Terminate any old launch templates that were managed by Karpenter and were provisioned as part of testing
+// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+func cleanupLaunchTemplates(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
+	names := getOldLaunchTemplates(ctx, ec2Client, expirationTime)
 	logger.With("names", names, "count", len(names)).Infof("discovered test launch templates to delete")
-	deleted = 0
+	deleted := 0
 	for i := range names {
 		if _, err := ec2Client.DeleteLaunchTemplate(ctx, &ec2.DeleteLaunchTemplateInput{
 			LaunchTemplateName: lo.ToPtr(names[i]),
@@ -107,6 +146,26 @@ func main() {
 	}
 }
 
+// Terminate any old OIDC providers that are left over from testing
+// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
+func cleanupOIDCProvider(ctx context.Context, iamClient *iam.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
+	arns := getOldOIDCProviders(ctx, iamClient, expirationTime)
+	deleted := 0
+	for i := range arns {
+		if _, err := iamClient.DeleteOpenIDConnectProvider(ctx, &iam.DeleteOpenIDConnectProviderInput{
+			OpenIDConnectProviderArn: lo.ToPtr(arns[i]),
+		}); err != nil {
+			logger.With("arn", arns[i]).Errorf("deleting test cluster oidc provider, %v", err)
+		} else {
+			logger.With("arn", arns[i]).Infof("deleted test cluster oidc provider")
+			deleted++
+		}
+	}
+	if err := fireMetric(ctx, cloudWatchClient, "OIDCDeleted", float64(deleted)); err != nil {
+		logger.With("name", "OIDCDeleted").Errorf("firing metric, %v", err)
+	}
+}
+
 func fireMetric(ctx context.Context, cloudWatchClient *cloudwatch.Client, name string, value float64) error {
 	_, err := cloudWatchClient.PutMetricData(ctx, &cloudwatch.PutMetricDataInput{
 		Namespace: lo.ToPtr(karpenterMetricNamespace),
@@ -208,3 +267,23 @@ func getOldLaunchTemplates(ctx context.Context, ec2Client *ec2.Client, expiratio
 	}
 	return names
 }
+
+func getOldOIDCProviders(ctx context.Context, iamClient *iam.Client, expirationTime time.Time) (names []string) {
+	testSuite := []string{"upgrade", "chaos", "consolidation", "drift", "integration", "interruption", "ipv6", "machine", "scale", "utilization"}
+	out := lo.Must(iamClient.ListOpenIDConnectProviders(ctx, &iam.ListOpenIDConnectProvidersInput{}))
+
+	for _, oicdArn := range out.OpenIDConnectProviderList {
+		oicd := lo.Must(iamClient.GetOpenIDConnectProvider(ctx, &iam.GetOpenIDConnectProviderInput{
+			OpenIDConnectProviderArn: oicdArn.Arn,
+		}))
+
+		for _, t := range oicd.Tags {
+			if lo.FromPtr(t.Key) == "alpha.eksctl.io/cluster-name" &&
+				lo.SomeBy(testSuite, func(s string) bool { return strings.HasPrefix(lo.FromPtr(t.Value), fmt.Sprintf("%s-", s)) }) &&
+				oicd.CreateDate.Before(expirationTime) {
+				names = append(names, lo.FromPtr(oicdArn.Arn))
+			}
+		}
+	}
+	return names
+}
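
For context on the new OIDC sweeper path, below is a minimal, standalone sketch of the suite-prefix matching rule that getOldOIDCProviders applies to the "alpha.eksctl.io/cluster-name" tag. The helper name isTestClusterName and the sample values are illustrative assumptions, not part of this diff.

package main

import (
	"fmt"
	"strings"

	"github.com/samber/lo"
)

// Suite prefixes mirrored from getOldOIDCProviders in the diff above.
var testSuites = []string{"upgrade", "chaos", "consolidation", "drift", "integration", "interruption", "ipv6", "machine", "scale", "utilization"}

// isTestClusterName (hypothetical helper) reports whether a cluster name looks like
// one created by an e2e suite run, e.g. "drift-229136498".
func isTestClusterName(name string) bool {
	return lo.SomeBy(testSuites, func(s string) bool {
		return strings.HasPrefix(name, fmt.Sprintf("%s-", s))
	})
}

func main() {
	fmt.Println(isTestClusterName("drift-229136498"))      // true
	fmt.Println(isTestClusterName("my-production-cluster")) // false
}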