Skip to content

Commit

Permalink
Add Resource count to Timestream
Browse files Browse the repository at this point in the history
  • Loading branch information
engedaam committed Nov 30, 2023
1 parent 504c71e commit c70e911
Show file tree
Hide file tree
Showing 20 changed files with 289 additions and 19 deletions.
6 changes: 3 additions & 3 deletions .github/actions/e2e/cleanup/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,12 +33,12 @@ runs:
eksctl delete cluster --name ${{ inputs.cluster_name }} --timeout 60m --wait || true
- uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe # v4
with:
go-version-file: test/hack/cleanup/go.mod
cache-dependency-path: test/hack/cleanup/go.sum
go-version-file: test/hack/resource-management/go.mod
cache-dependency-path: test/hack/resource-management/go.sum
check-latest: true
cache: false
- name: "Run cleanup script"
run: |
go run main.go ${{ inputs.cluster_name }}
working-directory: ./test/hack/cleanup
working-directory: ./test/hack/resource-management/clean-resources
shell: bash
30 changes: 30 additions & 0 deletions .github/workflows/resource-count.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
name: ResourceCount
on:
schedule:
- cron: '3 */1 * * *' # every hour
workflow_dispatch:
permissions:
id-token: write # aws-actions/configure-aws-credentials@v4.0.1
jobs:
counter:
if: vars.ACCOUNT_ID != '' || github.event_name == 'workflow_dispatch'
strategy:
fail-fast: false
matrix:
region: [us-east-2, us-west-2, eu-west-1]
runs-on: ubuntu-latest
steps:
- uses: actions/checkout@v4
- name: configure aws credentials
uses: aws-actions/configure-aws-credentials@v4.0.1
with:
role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
aws-region: ${{ matrix.region }}
- uses: actions/setup-go@v4
with:
go-version-file: test/hack/resource-management/go.mod
check-latest: true
cache-dependency-path: "test/hack/resource-management/go.sum"
- run: go run main.go
working-directory: ./test/hack/resource-management/count-resources
name: "Run resource count script"
6 changes: 3 additions & 3 deletions .github/workflows/sweeper.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@ jobs:
aws-region: ${{ matrix.region }}
- uses: actions/setup-go@93397bea11091df50f3d7e59dc26a7711a8bcfbe # v4
with:
go-version-file: test/hack/cleanup/go.mod
go-version-file: test/hack/resource-management/go.mod
check-latest: true
cache-dependency-path: "test/hack/cleanup/go.sum"
cache-dependency-path: "test/hack/resource-management/go.sum"
- run: go run main.go
working-directory: ./test/hack/cleanup
working-directory: ./test/hack/resource-management/clean-resources
name: "Run cleanup script"
4 changes: 2 additions & 2 deletions test/cloudformation/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ aws cloudformation deploy \
aws cloudformation deploy \
--stack-name GithubActionsTimestream \
--template-file timestream_cloudformation.yaml \
--parameter-overrides "DatabaseName=karpenterTesting" "TableName=scaleTestDurations" "SweeperTableName=sweeperCleanedResources"
--parameter-overrides "DatabaseName=karpenterTesting" "TableName=scaleTestDurations" "SweeperTableName=sweeperCleanedResources" "ResourceCountTableName=resourceCount"
```

### [Optional] Deploying ManagedGrafana and its Policy
Expand All @@ -30,7 +30,7 @@ aws cloudformation deploy \
```console
aws cloudformation deploy --stack-name GithubActionsIAM \
--template-file iam_cloudformation.yaml \
--parameter-overrides "DatabaseName=karpenterTesting" "TableName=scaleTestDurations" "SweeperTableName=sweeperCleanedResources" "Repository=<repository>" Branches="*" "PrometheusWorkspaceID=<workspace-id>" Regions="us-east-2,us-west-2,..." \
--parameter-overrides "DatabaseName=karpenterTesting" "TableName=scaleTestDurations" "SweeperTableName=sweeperCleanedResources" "ResourceCountTableName=resourceCount" "Repository=<repository>" Branches="*" "PrometheusWorkspaceID=<workspace-id>" Regions="us-east-2,us-west-2,..." \
--capabilities CAPABILITY_NAMED_IAM
```

Expand Down
4 changes: 4 additions & 0 deletions test/cloudformation/iam_cloudformation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@ Parameters:
SweeperTableName:
Type: String
Description: "Timestream table to forward leaked resources to"
ResourceCountTableName:
Type: String
Description: "Timestream table to write resource counts to"
Resources:
GithubOIDCProvider:
Type: AWS::IAM::OIDCProvider
Expand Down Expand Up @@ -138,6 +141,7 @@ Resources:
Resource:
- !Sub "arn:${AWS::Partition}:timestream:${AWS::Region}:${AWS::AccountId}:database/${DatabaseName}/table/${TableName}"
- !Sub "arn:${AWS::Partition}:timestream:${AWS::Region}:${AWS::AccountId}:database/${DatabaseName}/table/${SweeperTableName}"
- !Sub "arn:${AWS::Partition}:timestream:${AWS::Region}:${AWS::AccountId}:database/${DatabaseName}/table/${ResourceCountTableName}"
- Effect: Allow
Action: timestream:DescribeEndpoints
Resource: "*"
Expand Down
14 changes: 14 additions & 0 deletions test/cloudformation/timestream_cloudformation.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ Parameters:
SweeperTableName:
Type: String
Description: "Timestream table to forward leaked resources to"
ResourceCountTableName:
Type: String
Description: "Timestream table to write resource counts to"
BackupDisasterRecoveryRegion:
Type: String
Default: "us-east-1"
Expand Down Expand Up @@ -39,6 +42,16 @@ Resources:
RetentionProperties:
MemoryStoreRetentionPeriodInHours: "2160" # Three months of memory store retention
MagneticStoreRetentionPeriodInDays: "1825" # 5 years of magnetic store retention
ResourceCountTimestreamTable:
Type: "AWS::Timestream::Table"
Properties:
DatabaseName: !Ref "TimestreamDatabase"
TableName: !Sub "${ResourceCountTableName}"
MagneticStoreWriteProperties:
EnableMagneticStoreWrites: true
RetentionProperties:
MemoryStoreRetentionPeriodInHours: "2160" # Three months of memory store retention
MagneticStoreRetentionPeriodInDays: "1825" # 5 years of magnetic store retention
TimestreamBackupPlan:
Type: "AWS::Backup::BackupPlan"
Properties:
Expand All @@ -65,6 +78,7 @@ Resources:
Resources:
- !GetAtt "TimestreamTable.Arn"
- !GetAtt "SweeperTimestreamTable.Arn"
- !GetAtt "ResourceCountTimestreamTable.Arn"
BackupPlanId: !Ref "TimestreamBackupPlan"
Outputs:
ScaleTimestreamTable:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,12 @@ import (
"github.com/samber/lo"
"go.uber.org/zap"

"github.com/aws/karpenter/test/hack/cleanup/metrics"
"github.com/aws/karpenter/test/hack/cleanup/resourcetypes"
"github.com/aws/karpenter/test/hack/resource-management/pkg/metrics"
"github.com/aws/karpenter/test/hack/resource-management/pkg/resourcetypes"
)

const expirationTTL = time.Hour * 12
const sweeperCleanedResourcesTableName = "sweeperCleanedResources"

func main() {
var clusterName string
Expand Down Expand Up @@ -86,7 +87,7 @@ func main() {
if err != nil {
resourceLogger.Errorf("%v", err)
}
if err = metricsClient.FireMetric(ctx, fmt.Sprintf("%sDeleted", resourceTypes[i].String()), float64(len(cleaned)), cfg.Region); err != nil {
if err = metricsClient.FireMetric(ctx, sweeperCleanedResourcesTableName, fmt.Sprintf("%sDeleted", resourceTypes[i].String()), float64(len(cleaned)), cfg.Region); err != nil {
resourceLogger.Errorf("%v", err)
}
resourceLogger.With("ids", cleaned, "count", len(cleaned)).Infof("deleted resourceTypes")
Expand Down
67 changes: 67 additions & 0 deletions test/hack/resource-management/count-resources/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,67 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package main

import (
"context"

"github.com/aws/aws-sdk-go-v2/config"
"github.com/aws/aws-sdk-go-v2/service/cloudformation"
"github.com/aws/aws-sdk-go-v2/service/ec2"
"github.com/aws/aws-sdk-go-v2/service/iam"
"github.com/samber/lo"
"go.uber.org/zap"

"github.com/aws/karpenter/test/hack/resource-management/pkg/metrics"
"github.com/aws/karpenter/test/hack/resource-management/pkg/resourcetypes"
)

// resourceCountTableName is the table name handed to the metrics client when
// per-resource-type counts are fired from main.
const resourceCountTableName = "resourceCount"

// main counts every tracked AWS resource type using the default AWS
// configuration and fires one metric per resource type into the
// resourceCount table. Failures are logged per resource type and do not stop
// the remaining types from being processed.
func main() {
	ctx := context.Background()
	cfg := lo.Must(config.LoadDefaultConfig(ctx))

	logger := lo.Must(zap.NewProduction()).Sugar()

	ec2Client := ec2.NewFromConfig(cfg)
	cloudFormationClient := cloudformation.NewFromConfig(cfg)
	iamClient := iam.NewFromConfig(cfg)
	metricsClient := metrics.Client(metrics.NewTimeStream(cfg))

	resourceTypes := []resourcetypes.Type{
		resourcetypes.NewInstance(ec2Client),
		resourcetypes.NewVPCEndpoint(ec2Client),
		resourcetypes.NewENI(ec2Client),
		resourcetypes.NewSecurityGroup(ec2Client),
		resourcetypes.NewLaunchTemplate(ec2Client),
		resourcetypes.NewOIDC(iamClient),
		resourcetypes.NewInstanceProfile(iamClient),
		resourcetypes.NewStack(cloudFormationClient),
	}

	for i := range resourceTypes {
		resourceLogger := logger.With("type", resourceTypes[i].String())
		resourceCount, err := resourceTypes[i].GetCount(ctx)
		if err != nil {
			resourceLogger.Errorf("%v", err)
			// The count is zero or partial when the lookup failed; publishing
			// it would record a bogus data point, so skip this resource type.
			continue
		}

		if err = metricsClient.FireMetric(ctx, resourceCountTableName, resourceTypes[i].String(), float64(resourceCount), cfg.Region); err != nil {
			resourceLogger.Errorf("%v", err)
		}
		resourceLogger.With("count", resourceCount).Infof("counted resourceTypes")
	}
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
module github.com/aws/karpenter/test/hack/cleanup
module github.com/aws/karpenter/test/hack/resource-management

go 1.21

Expand Down
File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,12 @@ import (
)

const (
karpenterMetricRegion = "us-east-2"
karpenterMetricDatabase = "karpenterTesting"
karpenterMetricTableName = "sweeperCleanedResources"
karpenterMetricRegion = "us-east-2"
karpenterMetricDatabase = "karpenterTesting"
)

type Client interface {
FireMetric(context.Context, string, float64, string) error
FireMetric(context.Context, string, string, float64, string) error
}

type TimeStream struct {
Expand All @@ -42,10 +41,10 @@ func NewTimeStream(cfg aws.Config) *TimeStream {
return &TimeStream{timestreamClient: timestreamwrite.NewFromConfig(cfg, WithRegion(karpenterMetricRegion))}
}

func (t *TimeStream) FireMetric(ctx context.Context, name string, value float64, region string) error {
func (t *TimeStream) FireMetric(ctx context.Context, tableName string, name string, value float64, region string) error {
_, err := t.timestreamClient.WriteRecords(ctx, &timestreamwrite.WriteRecordsInput{
DatabaseName: aws.String(karpenterMetricDatabase),
TableName: aws.String(karpenterMetricTableName),
TableName: aws.String(tableName),
Records: []timestreamtypes.Record{
{
MeasureName: aws.String(name),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,26 @@ func (e *ENI) GetExpired(ctx context.Context, expirationTime time.Time) (ids []s
return ids, err
}

// GetCount returns the number of network interfaces reported by
// DescribeNetworkInterfaces, following NextToken until no further page is
// advertised. If a call fails, the count accumulated so far is returned
// together with the error.
func (e *ENI) GetCount(ctx context.Context) (count int, err error) {
	var token *string
	// Do-while shape: always request the first page, then continue only while
	// the previous response carried a continuation token.
	for more := true; more; more = token != nil {
		page, describeErr := e.ec2Client.DescribeNetworkInterfaces(ctx, &ec2.DescribeNetworkInterfacesInput{
			NextToken: token,
		})
		if describeErr != nil {
			return count, describeErr
		}
		count += len(page.NetworkInterfaces)
		token = page.NextToken
	}
	return count, nil
}

func (e *ENI) Get(ctx context.Context, clusterName string) (ids []string, err error) {
var nextToken *string
for {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,49 @@ func (i *Instance) GetExpired(ctx context.Context, expirationTime time.Time) (id
return ids, err
}

// GetCount returns the number of instances whose state is running, pending,
// shutting-down, stopped or stopping, summed across every reservation on
// every page of DescribeInstances. If a call fails, the count accumulated so
// far is returned together with the error.
func (i *Instance) GetCount(ctx context.Context) (count int, err error) {
	// States included in the count; terminated is not in this list.
	countedStates := []string{
		string(ec2types.InstanceStateNameRunning),
		string(ec2types.InstanceStateNamePending),
		string(ec2types.InstanceStateNameShuttingDown),
		string(ec2types.InstanceStateNameStopped),
		string(ec2types.InstanceStateNameStopping),
	}

	var token *string
	// Do-while shape: always request the first page, then continue only while
	// the previous response carried a continuation token.
	for more := true; more; more = token != nil {
		page, describeErr := i.ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
			Filters: []ec2types.Filter{
				{
					Name:   lo.ToPtr("instance-state-name"),
					Values: countedStates,
				},
			},
			NextToken: token,
		})
		if describeErr != nil {
			return count, describeErr
		}

		for idx := range page.Reservations {
			count += len(page.Reservations[idx].Instances)
		}
		token = page.NextToken
	}
	return count, nil
}

func (i *Instance) Get(ctx context.Context, clusterName string) (ids []string, err error) {
var nextToken *string

for {
out, err := i.ec2Client.DescribeInstances(ctx, &ec2.DescribeInstancesInput{
Filters: []ec2types.Filter{
{
Name: lo.ToPtr("Instance-state-name"),
Name: lo.ToPtr("instance-state-name"),
Values: []string{string(ec2types.InstanceStateNameRunning)},
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ func (ip *InstanceProfile) GetExpired(ctx context.Context, expirationTime time.T
return names, multierr.Combine(errs...)
}

// GetCount returns the total number of IAM instance profiles visible to the
// client. ListInstanceProfiles returns results in pages, so every page is
// walked and summed rather than counting only the first response. On error
// the count is reported as zero alongside the error.
func (ip *InstanceProfile) GetCount(ctx context.Context) (count int, err error) {
	paginator := iam.NewListInstanceProfilesPaginator(ip.iamClient, &iam.ListInstanceProfilesInput{})
	for paginator.HasMorePages() {
		page, pageErr := paginator.NextPage(ctx)
		if pageErr != nil {
			return 0, pageErr
		}
		count += len(page.InstanceProfiles)
	}
	return count, nil
}

func (ip *InstanceProfile) Get(ctx context.Context, clusterName string) (names []string, err error) {
out, err := ip.iamClient.ListInstanceProfiles(ctx, &iam.ListInstanceProfilesInput{})
if err != nil {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,26 @@ func (lt *LaunchTemplate) GetExpired(ctx context.Context, expirationTime time.Ti
return names, err
}

// GetCount returns the number of launch templates reported by
// DescribeLaunchTemplates, following NextToken until no further page is
// advertised. If a call fails, the count accumulated so far is returned
// together with the error.
func (lt *LaunchTemplate) GetCount(ctx context.Context) (count int, err error) {
	var token *string
	// Do-while shape: always request the first page, then continue only while
	// the previous response carried a continuation token.
	for more := true; more; more = token != nil {
		page, describeErr := lt.ec2Client.DescribeLaunchTemplates(ctx, &ec2.DescribeLaunchTemplatesInput{
			NextToken: token,
		})
		if describeErr != nil {
			return count, describeErr
		}
		count += len(page.LaunchTemplates)
		token = page.NextToken
	}
	return count, nil
}

func (lt *LaunchTemplate) Get(ctx context.Context, clusterName string) (names []string, err error) {
var nextToken *string
for {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,15 @@ func (o *OIDC) GetExpired(ctx context.Context, expirationTime time.Time) (names
return names, multierr.Combine(errs...)
}

// GetCount returns the number of entries in the list produced by a single
// ListOpenIDConnectProviders call. On failure the zero count and the error
// are returned.
func (o *OIDC) GetCount(ctx context.Context) (count int, err error) {
	providers, listErr := o.iamClient.ListOpenIDConnectProviders(ctx, &iam.ListOpenIDConnectProvidersInput{})
	if listErr != nil {
		return count, listErr
	}

	count = len(providers.OpenIDConnectProviderList)
	return count, nil
}

// Get performs no lookup: it ignores ctx and clusterName and always returns
// a nil slice with a nil error.
func (o *OIDC) Get(ctx context.Context, clusterName string) (names []string, err error) {
	return nil, nil
}
Expand Down
Loading

0 comments on commit c70e911

Please sign in to comment.