From cd2c4f3c652a7e1850fa9ae22ef3bfa6e5390b40 Mon Sep 17 00:00:00 2001
From: Amanuel Engeda
Date: Fri, 7 Jul 2023 09:58:01 -0700
Subject: [PATCH 1/2] Soak to collect data

---
 .github/workflows/e2e-soak-trigger.yaml |  13 ++
 test/suites/soak/suite_test.go          | 153 ++++++++++++++++++++++++
 2 files changed, 166 insertions(+)
 create mode 100644 .github/workflows/e2e-soak-trigger.yaml
 create mode 100644 test/suites/soak/suite_test.go

diff --git a/.github/workflows/e2e-soak-trigger.yaml b/.github/workflows/e2e-soak-trigger.yaml
new file mode 100644
index 000000000000..a73883a38154
--- /dev/null
+++ b/.github/workflows/e2e-soak-trigger.yaml
@@ -0,0 +1,13 @@
+name: E2ESoakTrigger
+on:
+  workflow_dispatch:
+jobs:
+  scale:
+    if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch'
+    uses: ./.github/workflows/e2e.yaml
+    with:
+      suite: Soak
+      event_name: ${{ github.event_name }}
+      region: "us-west-2"
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
\ No newline at end of file
diff --git a/test/suites/soak/suite_test.go b/test/suites/soak/suite_test.go
new file mode 100644
index 000000000000..28cd304eaa4d
--- /dev/null
+++ b/test/suites/soak/suite_test.go
@@ -0,0 +1,153 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package soak_test
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
+	"github.com/aws/karpenter-core/pkg/test"
+	nodeutils "github.com/aws/karpenter-core/pkg/utils/node"
+	"github.com/aws/karpenter/pkg/apis/settings"
+	"github.com/aws/karpenter/pkg/apis/v1alpha1"
+	awstest "github.com/aws/karpenter/pkg/test"
+	"github.com/aws/karpenter/test/pkg/debug"
+	"github.com/aws/karpenter/test/pkg/environment/aws"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/samber/lo"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/informers"
+	"k8s.io/client-go/tools/cache"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	awssdk "github.com/aws/aws-sdk-go/aws"
+)
+
+var env *aws.Environment
+
+func TestSoak(t *testing.T) {
+	RegisterFailHandler(Fail)
+	BeforeSuite(func() {
+		env = aws.NewEnvironment(t)
+		SetDefaultEventuallyTimeout(time.Hour)
+	})
+	RunSpecs(t, "Soak")
+}
+
+var _ = BeforeEach(func() { env.BeforeEach() })
+var _ = AfterEach(func() { env.Cleanup() })
+var _ = AfterEach(func() { env.AfterEach() })
+
+var _ = Describe("Soak", func() {
+	It("should scale a deployment up and down to collect soak data", Label(debug.NoWatch), Label(debug.NoEvents), func() {
+		ctx, cancel := context.WithCancel(env.Context)
+		defer cancel()
+
+		provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{
+			SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName},
+			SubnetSelector:        map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName},
+		}})
+		provisioner := test.Provisioner(test.ProvisionerOptions{
+			Requirements: []v1.NodeSelectorRequirement{
+				{
+					Key:      v1alpha5.LabelCapacityType,
+					Operator: v1.NodeSelectorOpIn,
+					Values:   []string{v1alpha5.CapacityTypeSpot},
+				},
+			},
+			Consolidation: &v1alpha5.Consolidation{
+				Enabled: lo.ToPtr(true),
+			},
+			ProviderRef: &v1alpha5.MachineTemplateRef{Name: provider.Name},
+		})
+		numPods := 0
+		dep := test.Deployment(test.DeploymentOptions{
+			Replicas: int32(numPods),
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{"app": "my-app"},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr[int64](0),
+			},
+		})
+
+		dep.Spec.Template.Spec.Affinity = &v1.Affinity{
+			PodAntiAffinity: &v1.PodAntiAffinity{
+				RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{
+					{
+						LabelSelector: dep.Spec.Selector,
+						TopologyKey:   v1.LabelHostname,
+					},
+				},
+			},
+		}
+
+		// Create the deployment with zero replicas; the soak loop below scales it up and down
+		env.ExpectCreated(provider, provisioner, dep)
+		startNodeCountMonitor(ctx, env.Client)
+		time.Sleep(time.Second * 10)
+		// Expect that we never get over a high number of nodes
+		Consistently(func(g Gomega) {
+			dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(20) + 1))
+			env.ExpectUpdated(dep)
+			time.Sleep(time.Minute * 1)
+			dep.Spec.Replicas = awssdk.Int32(0)
+			env.ExpectUpdated(dep)
+			time.Sleep(time.Second * 30)
+		}, time.Hour*12).Should(Succeed())
+	})
+})
+
+func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) {
+	createdNodes := atomic.Int64{}
+	deletedNodes := atomic.Int64{}
+
+	factory := informers.NewSharedInformerFactoryWithOptions(env.KubeClient, time.Second*30,
+		informers.WithTweakListOptions(func(l *metav1.ListOptions) { l.LabelSelector = v1alpha5.ProvisionerNameLabelKey }))
+	nodeInformer := factory.Core().V1().Nodes().Informer()
+	nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(_ interface{}) {
+			createdNodes.Add(1)
+		},
+		DeleteFunc: func(_ interface{}) {
+			deletedNodes.Add(1)
+		},
+	})
+	factory.Start(ctx.Done())
+	go func() {
+		for {
+			list := &v1.NodeList{}
+			if err := kubeClient.List(ctx, list, client.HasLabels{test.DiscoveryLabel}); err == nil {
+				readyCount := lo.CountBy(list.Items, func(n v1.Node) bool {
+					return nodeutils.GetCondition(&n, v1.NodeReady).Status == v1.ConditionTrue
+				})
+				fmt.Printf("[NODE COUNT] CURRENT: %d | READY: %d | CREATED: %d | DELETED: %d\n",
+					len(list.Items), readyCount, createdNodes.Load(), deletedNodes.Load())
+			}
+			select {
+			case <-ctx.Done():
+				return
+			case <-time.After(time.Second * 5):
+			}
+		}
+	}()
+}
From 1fc461820c98490acfe296c3645d5156b843b983 Mon Sep 17 00:00:00 2001
From: Amanuel Engeda
Date: Fri, 7 Jul 2023 12:41:56 -0700
Subject: [PATCH 2/2] Pin chart version and schedule soak runs

---
 .../actions/e2e/install-karpenter/action.yaml |  4 +--
 .github/workflows/e2e-soak-trigger.yaml       | 29 +++++++++++++------
 test/suites/soak/suite_test.go                |  7 +++--
 3 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml
index a80d39ac291d..fd9d0f76227b 100644
--- a/.github/actions/e2e/install-karpenter/action.yaml
+++ b/.github/actions/e2e/install-karpenter/action.yaml
@@ -39,7 +39,7 @@ runs:
       aws eks update-kubeconfig --name ${{ inputs.cluster_name }}
       helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
         -n karpenter \
-        --version v0-$(git rev-parse HEAD) \
+        --version v0.29.0 \
         --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::${{ inputs.account_id }}:role/karpenter-irsa-${{ inputs.cluster_name }}" \
         --set settings.aws.clusterName=${{ inputs.cluster_name }} \
         --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${{ inputs.cluster_name }} \
@@ -56,5 +56,5 @@ runs:
      run: |
        helm diff upgrade --namespace karpenter \
          karpenter oci://public.ecr.aws/karpenter/karpenter \
-         --version v0-$(git rev-parse HEAD) \
+         --version v0.29.0 \
         --reuse-values --three-way-merge --detailed-exitcode
\ No newline at end of file
diff --git a/.github/workflows/e2e-soak-trigger.yaml b/.github/workflows/e2e-soak-trigger.yaml
index a73883a38154..c412f8a2a2df 100644
--- a/.github/workflows/e2e-soak-trigger.yaml
+++ b/.github/workflows/e2e-soak-trigger.yaml
@@ -1,13 +1,24 @@
 name: E2ESoakTrigger
 on:
+  schedule:
+    - cron: '0 */3 * * *'
   workflow_dispatch:
+permissions:
+  id-token: write # This is required for requesting the JWT
+  contents: read # This is required for actions/checkout
+  statuses: write
 jobs:
-  scale:
-    if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch'
-    uses: ./.github/workflows/e2e.yaml
-    with:
-      suite: Soak
-      event_name: ${{ github.event_name }}
-      region: "us-west-2"
-    secrets:
-      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
\ No newline at end of file
+  soak:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
+          aws-region: us-west-2
+          role-duration-seconds: 21600
+      - name: run the Soak test suite
+        run: |
+          aws eks update-kubeconfig --name Soak-testing
+          TEST_SUITE="Soak" make e2etests
\ No newline at end of file
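
Note on the suite hunks below: they shorten the soak window from twelve hours to two and add cleanup, but the Consistently body still never uses its Gomega parameter g, so the window can only fail on a panic; it acts as a churn driver while startNodeCountMonitor prints the node counts this series sets out to collect. If the "never get over a high number of nodes" comment (removed below) were meant as an enforced bound, the body could assert one through g. A minimal sketch, reusing only identifiers already in the suite; the 25-node ceiling is an assumed value, not taken from the patch:

	Consistently(func(g Gomega) {
		dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(20) + 1))
		env.ExpectUpdated(dep)
		time.Sleep(time.Minute * 1)

		// With required hostname anti-affinity every replica gets its own node,
		// so a count well above the 20-replica ceiling suggests leaked capacity.
		list := &v1.NodeList{}
		g.Expect(env.Client.List(ctx, list, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
		g.Expect(len(list.Items)).To(BeNumerically("<=", 25))

		dep.Spec.Replicas = awssdk.Int32(0)
		env.ExpectUpdated(dep)
		time.Sleep(time.Second * 30)
	}, time.Hour*2).Should(Succeed())

A failed g assertion would end the Consistently call immediately, cutting a broken soak run short instead of burning the remaining window.
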
diff --git a/test/suites/soak/suite_test.go b/test/suites/soak/suite_test.go
index 28cd304eaa4d..c97d276d2894 100644
--- a/test/suites/soak/suite_test.go
+++ b/test/suites/soak/suite_test.go
@@ -67,6 +67,9 @@ var _ = Describe("Soak", func() {
 			SubnetSelector:        map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName},
 		}})
 		provisioner := test.Provisioner(test.ProvisionerOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "soak-test-provisioner",
+			},
 			Requirements: []v1.NodeSelectorRequirement{
 				{
 					Key:      v1alpha5.LabelCapacityType,
@@ -106,7 +109,6 @@ var _ = Describe("Soak", func() {
 		env.ExpectCreated(provider, provisioner, dep)
 		startNodeCountMonitor(ctx, env.Client)
 		time.Sleep(time.Second * 10)
-		// Expect that we never get over a high number of nodes
 		Consistently(func(g Gomega) {
 			dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(20) + 1))
 			env.ExpectUpdated(dep)
@@ -114,7 +116,8 @@ var _ = Describe("Soak", func() {
 			dep.Spec.Replicas = awssdk.Int32(0)
 			env.ExpectUpdated(dep)
 			time.Sleep(time.Second * 30)
-		}, time.Hour*12).Should(Succeed())
+		}, time.Hour*2).Should(Succeed())
+		env.ExpectDeleted(provisioner, provider, dep)
 	})
 })
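
Note on the cleanup added above: env.ExpectDeleted(provisioner, provider, dep) only runs once the two-hour Consistently window returns, so a failure or interruption mid-soak leaves teardown to the suite's AfterEach hooks. A sketch of an alternative, assuming nothing beyond what the suite already imports: register the teardown up front with Ginkgo's DeferCleanup, which runs however the spec exits:

	env.ExpectCreated(provider, provisioner, dep)
	DeferCleanup(func() {
		// Runs when the spec ends, whether the soak window passed, failed, or was aborted.
		env.ExpectDeleted(provisioner, provider, dep)
	})

Since the workflow now reruns the suite every three hours against the long-lived Soak-testing cluster, prompt teardown matters more than in the one-shot dispatch setup: anything leaked by an aborted run is still there when the next run starts.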