Skip to content

Commit

Permalink
fix SG and OIDC leak
Browse files Browse the repository at this point in the history
  • Loading branch information
engedaam committed Jul 26, 2023
1 parent 4cecea2 commit e00681d
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 9 deletions.
7 changes: 7 additions & 0 deletions .github/actions/e2e/cleanup/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,13 @@ runs:
- uses: ./.github/actions/e2e/install-eksctl
with:
eksctl_version: v0.147.0
- name: delete-security-group
  shell: bash
  run: |
    # Look up the security group leaked by the drift test via its discovery
    # tag, then delete it if one exists.
    # NOTE(review): the cluster suffix "drift-229136498" is hard-coded here —
    # confirm it should not be derived from a workflow input instead.
    sgID=$(aws ec2 describe-security-groups \
      --filters Name=tag:karpenter.sh/discovery,Values=drift-229136498 \
      --query "SecurityGroups[*].{ID:GroupId}" | jq -r '.[].ID')
    # Only attempt deletion when a group was actually found; an empty
    # --group-id would make the AWS CLI fail the job.
    if [ -n "${sgID}" ]; then
      aws ec2 delete-security-group --group-id "${sgID}"
    fi
- name: delete-cluster
shell: bash
run: |
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/e2e.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ jobs:
- name: run the ${{ inputs.suite }} test suite
  run: |
    aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }}
    # Run the suite selected by the workflow input. Do not hard-code a FOCUS
    # filter here — doing so silently restricts every e2e run to a single test.
    TEST_SUITE="${{ inputs.suite }}" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests
- name: notify slack of success or failure
uses: ./.github/actions/e2e/slack/notify
if: (success() || failure()) && inputs.event_name != 'workflow_run' && inputs.event_name != 'conformance'
Expand Down
6 changes: 5 additions & 1 deletion .github/workflows/sweeper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,18 @@ permissions:
jobs:
sweeper:
if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
matrix:
region: [us-east-2, us-west-2, eu-west-1]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v3
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v2
with:
role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
aws-region: ${{ vars.AWS_REGION }}
aws-region: ${{ matrix.region }}
- uses: actions/setup-go@v4
with:
go-version-file: test/hack/cleanup/go.mod
Expand Down
1 change: 1 addition & 0 deletions test/hack/cleanup/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ require (
github.com/aws/aws-sdk-go-v2/service/cloudformation v1.30.0
github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2
github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0
github.com/aws/aws-sdk-go-v2/service/iam v1.21.0
github.com/samber/lo v1.38.1
go.uber.org/zap v1.24.0
)
Expand Down
2 changes: 2 additions & 0 deletions test/hack/cleanup/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2 h1:PWGu2JhCb/XJlJ7SSFJq7
github.com/aws/aws-sdk-go-v2/service/cloudwatch v1.26.2/go.mod h1:2KOZkkzMDZCo/aLzPhys06mHNkiU74u85aMJA3PLRvg=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0 h1:P4dyjm49F2kKws0FpouBC6fjVImACXKt752+CWa01lM=
github.com/aws/aws-sdk-go-v2/service/ec2 v1.102.0/go.mod h1:tIctCeX9IbzsUTKHt53SVEcgyfxV2ElxJeEB+QUbc4M=
github.com/aws/aws-sdk-go-v2/service/iam v1.21.0 h1:8hEpu60CWlrp7iEBUFRZhgPoX6+gadaGL1sD4LoRYS0=
github.com/aws/aws-sdk-go-v2/service/iam v1.21.0/go.mod h1:aQZ8BI+reeaY7RI/QQp7TKCSUHOesTdrzzylp3CW85c=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.28 h1:bkRyG4a929RCnpVSTvLM2j/T4ls015ZhhYApbmYs15s=
github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.9.28/go.mod h1:jj7znCIg05jXlaGBlFMGP8+7UN3VtCkRBG2spnmRQkU=
github.com/aws/aws-sdk-go-v2/service/sso v1.12.12 h1:nneMBM2p79PGWBQovYO/6Xnc2ryRMw3InnDJq1FHkSY=
Expand Down
93 changes: 86 additions & 7 deletions test/hack/cleanup/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ package main

import (
"context"
"fmt"
"strings"
"time"

"github.com/aws/aws-sdk-go-v2/config"
Expand All @@ -25,6 +27,7 @@ import (
cloudwatchtypes "github.com/aws/aws-sdk-go-v2/service/cloudwatch/types"
"github.com/aws/aws-sdk-go-v2/service/ec2"
ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types"
"github.com/aws/aws-sdk-go-v2/service/iam"
"github.com/samber/lo"
"go.uber.org/zap"
)
Expand All @@ -35,6 +38,7 @@ const (

karpenterProvisionerNameTag = "karpenter.sh/provisioner-name"
karpenterLaunchTemplateTag = "karpenter.k8s.aws/cluster"
karpenterSecurityGroupTag = "karpenter.sh/discovery"
githubRunURLTag = "github.com/run-url"
)

Expand All @@ -51,9 +55,17 @@ func main() {
ec2Client := ec2.NewFromConfig(cfg)
cloudFormationClient := cloudformation.NewFromConfig(cfg)
cloudWatchClient := cloudwatch.NewFromConfig(cfg)
iamClient := iam.NewFromConfig(cfg)

// Terminate any old instances that were provisioned by Karpenter as part of testing
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
cleanupInstances(ctx, ec2Client, cloudWatchClient, expirationTime, logger)
cleanupStack(ctx, ec2Client, cloudWatchClient, cloudFormationClient, expirationTime, logger)
cleanupLaunchTemplates(ctx, ec2Client, cloudWatchClient, expirationTime, logger)
cleanupOIDCProvider(ctx, iamClient, cloudWatchClient, expirationTime, logger)
}

// Terminate any old instances that were provisioned by Karpenter as part of testing
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
func cleanupInstances(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
ids := getOldInstances(ctx, ec2Client, expirationTime)
logger.With("ids", ids, "count", len(ids)).Infof("discovered test instances to delete")
if len(ids) > 0 {
Expand All @@ -68,13 +80,37 @@ func main() {
}
}
}
}

// Terminate any old stacks that were provisioned as part of testing
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
// Terminate any old stacks that were provisioned as part of testing
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
func cleanupStack(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, cloudFormationClient *cloudformation.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
sgInAccount := lo.Must(ec2Client.DescribeSecurityGroups(ctx, &ec2.DescribeSecurityGroupsInput{
Filters: []ec2types.Filter{
{
Name: lo.ToPtr("group-name"),
Values: []string{"security-group-drift"},
},
},
}))
names := getOldStacks(ctx, cloudFormationClient, expirationTime)
logger.With("names", names, "count", len(names)).Infof("discovered test stacks to delete")
deleted := 0
for i := range names {
if strings.HasSuffix(names[i], "-cluster") && strings.Contains(names[i], "drift") {
stackName := strings.Split(names[i], "-")
sgName := fmt.Sprintf("drift-%s", stackName[2])
sg, _ := lo.Find(sgInAccount.SecurityGroups, func(sg ec2types.SecurityGroup) bool {
return *sg.Tags[0].Key == karpenterSecurityGroupTag && *sg.Tags[0].Value == sgName
})
if _, err := ec2Client.DeleteSecurityGroup(ctx, &ec2.DeleteSecurityGroupInput{
GroupId: sg.GroupId,
}); err != nil {
logger.With("name", names[i]).Errorf("deleting test stack sg, %v", err)
} else {
logger.With("name", names[i]).Infof("deleted test stack sg")
}
}
if _, err := cloudFormationClient.DeleteStack(ctx, &cloudformation.DeleteStackInput{
StackName: lo.ToPtr(names[i]),
}); err != nil {
Expand All @@ -87,11 +123,14 @@ func main() {
if err := fireMetric(ctx, cloudWatchClient, "StacksDeleted", float64(deleted)); err != nil {
logger.With("name", "StacksDeleted").Errorf("firing metric, %v", err)
}
}

// Terminate any old launch templates that were managed by Karpenter and were provisioned as part of testing
names = getOldLaunchTemplates(ctx, ec2Client, expirationTime)
// Terminate any old launch templates that were managed by Karpenter and were provisioned as part of testing
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
func cleanupLaunchTemplates(ctx context.Context, ec2Client *ec2.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
names := getOldLaunchTemplates(ctx, ec2Client, expirationTime)
logger.With("names", names, "count", len(names)).Infof("discovered test launch templates to delete")
deleted = 0
deleted := 0
for i := range names {
if _, err := ec2Client.DeleteLaunchTemplate(ctx, &ec2.DeleteLaunchTemplateInput{
LaunchTemplateName: lo.ToPtr(names[i]),
Expand All @@ -107,6 +146,26 @@ func main() {
}
}

// Terminate any old OIDC providers that remain from expired test clusters.
// We execute these in serial since we will most likely get rate limited if we try to delete these too aggressively
func cleanupOIDCProvider(ctx context.Context, iamClient *iam.Client, cloudWatchClient *cloudwatch.Client, expirationTime time.Time, logger *zap.SugaredLogger) {
	arns := getOldOIDCProviders(ctx, iamClient, expirationTime)
	// Log what was discovered, mirroring the other cleanup helpers in this file.
	logger.With("arns", arns, "count", len(arns)).Infof("discovered test oidc providers to delete")
	deleted := 0
	for i := range arns {
		if _, err := iamClient.DeleteOpenIDConnectProvider(ctx, &iam.DeleteOpenIDConnectProviderInput{
			OpenIDConnectProviderArn: lo.ToPtr(arns[i]),
		}); err != nil {
			logger.With("arn", arns[i]).Errorf("deleting test cluster oidc provider, %v", err)
		} else {
			logger.With("arn", arns[i]).Infof("deleted test cluster oidc provider")
			deleted++
		}
	}
	// Record how many providers were swept so the dashboard can track leaks.
	if err := fireMetric(ctx, cloudWatchClient, "OIDCDeleted", float64(deleted)); err != nil {
		logger.With("name", "OIDCDeleted").Errorf("firing metric, %v", err)
	}
}

func fireMetric(ctx context.Context, cloudWatchClient *cloudwatch.Client, name string, value float64) error {
_, err := cloudWatchClient.PutMetricData(ctx, &cloudwatch.PutMetricDataInput{
Namespace: lo.ToPtr(karpenterMetricNamespace),
Expand Down Expand Up @@ -208,3 +267,23 @@ func getOldLaunchTemplates(ctx context.Context, ec2Client *ec2.Client, expiratio
}
return names
}

// getOldOIDCProviders returns the ARNs of OIDC providers created for test
// clusters — those tagged by eksctl with a cluster name matching one of the
// known test-suite prefixes — that are older than expirationTime.
func getOldOIDCProviders(ctx context.Context, iamClient *iam.Client, expirationTime time.Time) (arns []string) {
	testSuites := []string{"upgrade", "chaos", "consolidation", "drift", "integration", "interruption", "ipv6", "machine", "scale", "utilization"}
	out := lo.Must(iamClient.ListOpenIDConnectProviders(ctx, &iam.ListOpenIDConnectProvidersInput{}))

	for _, providerARN := range out.OpenIDConnectProviderList {
		provider := lo.Must(iamClient.GetOpenIDConnectProvider(ctx, &iam.GetOpenIDConnectProviderInput{
			OpenIDConnectProviderArn: providerARN.Arn,
		}))
		// CreateDate is a *time.Time in the SDK response; guard against nil
		// before dereferencing to avoid a panic on an incomplete response.
		if provider.CreateDate == nil || !provider.CreateDate.Before(expirationTime) {
			continue
		}
		for _, t := range provider.Tags {
			if lo.FromPtr(t.Key) == "alpha.eksctl.io/cluster-name" &&
				lo.SomeBy(testSuites, func(s string) bool { return strings.HasPrefix(lo.FromPtr(t.Value), fmt.Sprintf("%s-", s)) }) {
				arns = append(arns, lo.FromPtr(providerARN.Arn))
				// Tag keys are unique per resource, so stop after the match.
				break
			}
		}
	}
	return arns
}

0 comments on commit e00681d

Please sign in to comment.