diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml
index aaafc680545a..1c28eae5f986 100644
--- a/.github/actions/e2e/install-karpenter/action.yaml
+++ b/.github/actions/e2e/install-karpenter/action.yaml
@@ -39,7 +39,7 @@ runs:
         aws eks update-kubeconfig --name "${{ inputs.cluster_name }}"
         helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
           -n karpenter \
-          --version "v0-$(git rev-parse HEAD)" \
+          --version v0.29.2 \
           --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::${{ inputs.account_id }}:role/karpenter-irsa-${{ inputs.cluster_name }}" \
           --set settings.aws.clusterName="${{ inputs.cluster_name }}" \
           --set settings.aws.defaultInstanceProfile="KarpenterNodeInstanceProfile-${{ inputs.cluster_name }}" \
@@ -64,5 +64,5 @@ runs:
       run: |
         helm diff upgrade --namespace karpenter \
           karpenter oci://public.ecr.aws/karpenter/karpenter \
-          --version v0-$(git rev-parse HEAD) \
+          --version v0.29.2 \
           --reuse-values --three-way-merge --detailed-exitcode
\ No newline at end of file
diff --git a/.github/actions/e2e/install-prometheus/values.yaml b/.github/actions/e2e/install-prometheus/values.yaml
index 0b7fa6985574..b559e748be38 100644
--- a/.github/actions/e2e/install-prometheus/values.yaml
+++ b/.github/actions/e2e/install-prometheus/values.yaml
@@ -26,30 +26,14 @@ alertmanager:
   tolerations:
     - key: CriticalAddonsOnly
       operator: Exists
-kubelet:
-  serviceMonitor:
-    additionalLabels:
-      scrape: enabled
-prometheus:
-  prometheusSpec:
-    tolerations:
-      - key: CriticalAddonsOnly
-        operator: Exists
-    resources:
-      requests:
-        cpu: 1
-        memory: 5Gi
-      limits:
-        cpu: 1
-        memory: 5Gi
-    serviceMonitorSelector:
-      matchLabels:
-        scrape: enabled
-    serviceMonitorNamespaceSelector:
-      matchLabels:
-        scrape: enabled
-    remoteWrite:
-      - queueConfig:
-          maxSamplesPerSend: 1000
-          maxShards: 200
-          capacity: 2500
\ No newline at end of file
+extraScrapeConfigs: |
+  - job_name: karpenter
+    kubernetes_sd_configs:
+      - role: endpoints
+        namespaces:
+          names:
+            - karpenter
+    relabel_configs:
+      - source_labels: [__meta_kubernetes_endpoint_port_name]
+        regex: http-metrics
+        action: keep
diff --git a/.github/workflows/e2e-soak-trigger.yaml b/.github/workflows/e2e-soak-trigger.yaml
new file mode 100644
index 000000000000..19c15ad41017
--- /dev/null
+++ b/.github/workflows/e2e-soak-trigger.yaml
@@ -0,0 +1,13 @@
+name: E2ESoakTrigger
+on:
+  schedule:
+    - cron: '0 */1 * * *'
+  workflow_dispatch:
+jobs:
+  soak:
+    # if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch'
+    uses: ./.github/workflows/e2e-soak.yaml
+    with:
+      event_name: ${{ github.event_name }}
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
\ No newline at end of file
diff --git a/.github/workflows/e2e-soak.yaml b/.github/workflows/e2e-soak.yaml
new file mode 100644
index 000000000000..365fd602a570
--- /dev/null
+++ b/.github/workflows/e2e-soak.yaml
@@ -0,0 +1,132 @@
+name: E2ESoak
+on:
+  workflow_dispatch:
+    inputs:
+      git_ref:
+        type: string
+      region:
+        type: choice
+        options:
+          - "us-east-2"
+          - "us-west-2"
+        default: "us-east-2"
+      k8s_version:
+        type: choice
+        options:
+          - "1.23"
+          - "1.24"
+          - "1.25"
+          - "1.26"
+          - "1.27"
+        default: "1.27"
+      enable_metrics:
+        type: boolean
+        default: false
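+  # workflow_call mirrors the dispatch inputs above so that e2e-soak-trigger.yaml can reuse this workflow from its hourly cron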
+  workflow_call:
+    inputs:
+      git_ref:
+        type: string
+      region:
+        type: string
+        default: "us-east-2"
+      event_name:
+        type: string
+        required: true
+      k8s_version:
+        type: string
+        default: "1.27"
+      enable_metrics:
+        type: boolean
+        default: false
+    secrets:
+      SLACK_WEBHOOK_URL:
+        required: true
+permissions:
+  id-token: write # This is required for requesting the JWT
+  contents: read # This is required for actions/checkout
+  statuses: write
+jobs:
+  run-suite:
+    name: suite-Soak
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v3
+        with:
+          ref: ${{ inputs.git_ref }}
+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v2
+        with:
+          role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
+          aws-region: ${{ inputs.region }}
+          role-duration-seconds: 21600
+      - uses: ./.github/actions/e2e/install-eksctl
+        with:
+          eksctl_version: v0.147.0
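+      # Soak runs against a long-lived cluster: reuse one from a previous run when it exists, and only create and bootstrap a fresh cluster when none is found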
+      - name: find preexisting cluster
+        run: |
+          export PREEXISTING=$(eksctl get cluster -o json | jq -r '.[].Name' | grep soak)
+          echo "Found existing cluster name \"$PREEXISTING\""
+          echo PREEXISTING=$PREEXISTING >> $GITHUB_ENV
+      - name: generate cluster name
+        if: env.PREEXISTING == ''
+        run: |
+          CLUSTER_NAME=$(echo Soak-$RANDOM$RANDOM | awk '{print tolower($0)}')
+          echo "Using cluster name \"$CLUSTER_NAME\""
+          echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_ENV
+      - name: create eks cluster '${{ env.CLUSTER_NAME }}'
+        if: env.PREEXISTING == ''
+        uses: ./.github/actions/e2e/create-cluster
+        with:
+          account_id: ${{ vars.ACCOUNT_ID }}
+          role: ${{ vars.ROLE_NAME }}
+          region: ${{ inputs.region }}
+          cluster_name: ${{ env.CLUSTER_NAME }}
+          k8s_version: ${{ inputs.k8s_version }}
+          ip_family: 'IPv4'
+          git_ref: ${{ inputs.git_ref }}
+      - name: install prometheus
+        if: env.PREEXISTING == ''
+        uses: ./.github/actions/e2e/install-prometheus
+        with:
+          account_id: ${{ vars.ACCOUNT_ID }}
+          role: ${{ vars.ROLE_NAME }}
+          region: ${{ vars.PROMETHEUS_REGION }}
+          cluster_name: ${{ env.CLUSTER_NAME }}
+          workspace_id: ${{ vars.WORKSPACE_ID }}
+          git_ref: ${{ inputs.git_ref }}
+      - name: install karpenter
+        if: env.PREEXISTING == ''
+        uses: ./.github/actions/e2e/install-karpenter
+        with:
+          account_id: ${{ vars.ACCOUNT_ID }}
+          role: ${{ vars.ROLE_NAME }}
+          region: ${{ inputs.region }}
+          cluster_name: ${{ env.CLUSTER_NAME }}
+          git_ref: ${{ inputs.git_ref }}
+      - name: run the Soak test suite on the preexisting cluster
+        if: env.PREEXISTING != ''
+        run: |
+          aws eks update-kubeconfig --name ${{ env.PREEXISTING }}
+          TEST_SUITE="Soak" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests
+      - name: run the Soak test suite on the new cluster
+        if: env.PREEXISTING == ''
+        run: |
+          aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }}
+          TEST_SUITE="Soak" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests
+      - name: notify slack of success or failure
+        uses: ./.github/actions/e2e/slack/notify
+        if: (success() || failure()) && inputs.event_name != 'workflow_run' && inputs.event_name != 'conformance'
+        with:
+          url: ${{ secrets.SLACK_WEBHOOK_URL }}
+          suite: Soak
+          k8s_version: ${{ inputs.k8s_version }}
+          event_name: ${{ inputs.event_name }}
+          git_ref: ${{ inputs.git_ref }}
+      - name: dump logs on failure
+        uses: ./.github/actions/e2e/dump-logs
+        if: failure() || cancelled()
+        with:
+          account_id: ${{ vars.ACCOUNT_ID }}
+          role: ${{ vars.ROLE_NAME }}
+          region: ${{ inputs.region }}
+          cluster_name: ${{ env.CLUSTER_NAME }}
\ No newline at end of file
diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml
index 0bd54a431515..3a2fd4678bf2 100644
--- a/charts/karpenter/templates/deployment.yaml
+++ b/charts/karpenter/templates/deployment.yaml
@@ -66,6 +66,8 @@ spec:
           image: {{ include "karpenter.controller.image" . }}
           imagePullPolicy: {{ .Values.imagePullPolicy }}
           env:
+            - name: ENABLE_PROFILING
+              value: "true"
             - name: KUBERNETES_MIN_VERSION
               value: "1.19.0-0"
             - name: KARPENTER_SERVICE
diff --git a/test/cloudformation/iam_cloudformation.yaml b/test/cloudformation/iam_cloudformation.yaml
index 99c47daabc24..3ff672c4ac90 100644
--- a/test/cloudformation/iam_cloudformation.yaml
+++ b/test/cloudformation/iam_cloudformation.yaml
@@ -134,6 +134,7 @@ Resources:
             Resource: "*"
           - Effect: Allow
             Action:
+              - eks:ListClusters
               - eks:CreateCluster
              - eks:CreateAddon
               - eks:CreateNodegroup
diff --git a/test/pkg/environment/common/setup.go b/test/pkg/environment/common/setup.go
index 8cf21c6b5d64..da85856b035b 100644
--- a/test/pkg/environment/common/setup.go
+++ b/test/pkg/environment/common/setup.go
@@ -76,6 +76,7 @@ func (env *Environment) ExpectCleanCluster() {
 	var nodes v1.NodeList
 	Expect(env.Client.List(env.Context, &nodes)).To(Succeed())
 	for _, node := range nodes.Items {
+		fmt.Println(node.Name)
 		if len(node.Spec.Taints) == 0 && !node.Spec.Unschedulable {
 			Fail(fmt.Sprintf("expected system pool node %s to be tainted", node.Name))
 		}
diff --git a/test/suites/soak/suite_test.go b/test/suites/soak/suite_test.go
new file mode 100644
index 000000000000..073d313615fe
--- /dev/null
+++ b/test/suites/soak/suite_test.go
@@ -0,0 +1,161 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package soak_test
+
+import (
+	"context"
+	"fmt"
+	"math/rand"
+	"os"
+	"sync/atomic"
+	"testing"
+	"time"
+
+	"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
+	"github.com/aws/karpenter-core/pkg/test"
+	nodeutils "github.com/aws/karpenter-core/pkg/utils/node"
+	"github.com/aws/karpenter/pkg/apis/settings"
+	"github.com/aws/karpenter/pkg/apis/v1alpha1"
+	awstest "github.com/aws/karpenter/pkg/test"
+	"github.com/aws/karpenter/test/pkg/debug"
+	"github.com/aws/karpenter/test/pkg/environment/aws"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/samber/lo"
+	v1 "k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/client-go/informers"
+	"k8s.io/client-go/tools/cache"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	awssdk "github.com/aws/aws-sdk-go/aws"
+)
+
+var env *aws.Environment
+
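+// TestSoak registers the Ginkgo failure handler, builds the AWS test
+// environment, and raises the default Eventually timeout to an hour, since
+// soak-scale node churn runs far longer than the ordinary E2E suites.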
"github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" + "k8s.io/client-go/tools/cache" + "sigs.k8s.io/controller-runtime/pkg/client" + + awssdk "github.com/aws/aws-sdk-go/aws" +) + +var env *aws.Environment + +func TestSoak(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + SetDefaultEventuallyTimeout(time.Hour) + }) + RunSpecs(t, "Soak") +} + +var _ = BeforeEach(func() { env.BeforeEach() }) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Soak", func() { + It("should ", Label(debug.NoWatch), Label(debug.NoEvents), func() { + ctx, cancel := context.WithCancel(env.Context) + defer cancel() + + content, err := os.ReadFile("testdata/user.sh") + Expect(err).NotTo(HaveOccurred()) + provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ + SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName}, + SubnetSelector: map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName}, + }, + UserData: awssdk.String(string(content)), + }) + provisioner := test.Provisioner(test.ProvisionerOptions{ + ObjectMeta: metav1.ObjectMeta{ + Name: "sock-test-provisioner", + }, + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1alpha5.LabelCapacityType, + Operator: v1.NodeSelectorOpIn, + Values: []string{v1alpha5.CapacityTypeSpot}, + }, + }, + Consolidation: &v1alpha5.Consolidation{ + Enabled: lo.ToPtr(true), + }, + ProviderRef: &v1alpha5.MachineTemplateRef{Name: provider.Name}, + }) + numPods := 0 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: lo.ToPtr[int64](0), + }, + }) + + dep.Spec.Template.Spec.Affinity = &v1.Affinity{ + PodAntiAffinity: &v1.PodAntiAffinity{ + RequiredDuringSchedulingIgnoredDuringExecution: []v1.PodAffinityTerm{ + { + LabelSelector: dep.Spec.Selector, + TopologyKey: v1.LabelHostname, + }, + }, + }, + } + + // Create a deployment with a single pod + env.ExpectCreated(provider, provisioner, dep) + startNodeCountMonitor(ctx, env.Client) + time.Sleep(time.Second * 10) + + Consistently(func(g Gomega) { + dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(100) + 1)) + env.ExpectCreatedOrUpdated(dep) + time.Sleep(time.Minute * 5) + dep.Spec.Replicas = awssdk.Int32(0) + env.ExpectCreatedOrUpdated(dep) + time.Sleep(time.Second * 30) + }, time.Hour*2).Should(Succeed()) + env.ExpectDeleted(provisioner, provider, dep) + }) +}) + +func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) { + createdNodes := atomic.Int64{} + deletedNodes := atomic.Int64{} + + factory := informers.NewSharedInformerFactoryWithOptions(env.KubeClient, time.Second*30, + informers.WithTweakListOptions(func(l *metav1.ListOptions) { l.LabelSelector = v1alpha5.ProvisionerNameLabelKey })) + nodeInformer := factory.Core().V1().Nodes().Informer() + nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(_ interface{}) { + createdNodes.Add(1) + }, + DeleteFunc: func(_ interface{}) { + deletedNodes.Add(1) + }, + }) + factory.Start(ctx.Done()) + go func() { + for { + list := &v1.NodeList{} + if err := kubeClient.List(ctx, list, client.HasLabels{test.DiscoveryLabel}); err == nil { + readyCount := 
+func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) {
+	createdNodes := atomic.Int64{}
+	deletedNodes := atomic.Int64{}
+
+	factory := informers.NewSharedInformerFactoryWithOptions(env.KubeClient, time.Second*30,
+		informers.WithTweakListOptions(func(l *metav1.ListOptions) { l.LabelSelector = v1alpha5.ProvisionerNameLabelKey }))
+	nodeInformer := factory.Core().V1().Nodes().Informer()
+	nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
+		AddFunc: func(_ interface{}) {
+			createdNodes.Add(1)
+		},
+		DeleteFunc: func(_ interface{}) {
+			deletedNodes.Add(1)
+		},
+	})
+	factory.Start(ctx.Done())
+	go func() {
+		for {
+			list := &v1.NodeList{}
+			if err := kubeClient.List(ctx, list, client.HasLabels{test.DiscoveryLabel}); err == nil {
+				readyCount := lo.CountBy(list.Items, func(n v1.Node) bool {
+					return nodeutils.GetCondition(&n, v1.NodeReady).Status == v1.ConditionTrue
+				})
+				fmt.Printf("[NODE COUNT] CURRENT: %d | READY: %d | CREATED: %d | DELETED: %d\n", len(list.Items), readyCount, createdNodes.Load(), deletedNodes.Load())
+			}
+			select {
+			case <-ctx.Done():
+				return
+			case <-time.After(time.Second * 5):
+			}
+		}
+	}()
+}
diff --git a/test/suites/soak/testdata/user.sh b/test/suites/soak/testdata/user.sh
new file mode 100644
index 000000000000..5af30bdf1ef5
--- /dev/null
+++ b/test/suites/soak/testdata/user.sh
@@ -0,0 +1,28 @@
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+--BOUNDARY
+Content-Type: text/x-shellscript; charset="us-ascii"
+
+#!/bin/bash
+mkdir -p /etc/systemd/logind.conf.d
+cat << EOF > /etc/systemd/logind.conf.d/50-max-delay.conf
+[Login]
+InhibitDelayMaxSec=360
+EOF
+
+systemctl restart systemd-logind
+
+sed -i '/"apiVersion*/a \ \ "shutdownGracePeriod": "3m",' /etc/kubernetes/kubelet/kubelet-config.json
+sed -i '/"shutdownGracePeriod*/a \ \ "shutdownGracePeriodCriticalPods": "2m",' /etc/kubernetes/kubelet/kubelet-config.json
+
+--BOUNDARY
+Content-Type: text/x-shellscript; charset="us-ascii"
+
+#!/bin/bash
+exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
+
+
+echo $(jq '.containerLogMaxFiles=3|.containerLogMaxSize="100Mi"' /etc/kubernetes/kubelet/kubelet-config.json) > /etc/kubernetes/kubelet/kubelet-config.json
+
+--BOUNDARY--
\ No newline at end of file