diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml index 180f64947065..1c28eae5f986 100644 --- a/.github/actions/e2e/install-karpenter/action.yaml +++ b/.github/actions/e2e/install-karpenter/action.yaml @@ -39,7 +39,7 @@ runs: aws eks update-kubeconfig --name "${{ inputs.cluster_name }}" helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \ -n karpenter \ - --version v0.29.0 \ + --version v0.29.2 \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::${{ inputs.account_id }}:role/karpenter-irsa-${{ inputs.cluster_name }}" \ --set settings.aws.clusterName="${{ inputs.cluster_name }}" \ --set settings.aws.defaultInstanceProfile="KarpenterNodeInstanceProfile-${{ inputs.cluster_name }}" \ @@ -64,5 +64,5 @@ runs: run: | helm diff upgrade --namespace karpenter \ karpenter oci://public.ecr.aws/karpenter/karpenter \ - --version v0.29.0 \ + --version v0.29.2 \ --reuse-values --three-way-merge --detailed-exitcode \ No newline at end of file diff --git a/.github/actions/e2e/install-prometheus/values.yaml b/.github/actions/e2e/install-prometheus/values.yaml index 0b7fa6985574..4108d6b5b0c7 100644 --- a/.github/actions/e2e/install-prometheus/values.yaml +++ b/.github/actions/e2e/install-prometheus/values.yaml @@ -26,30 +26,19 @@ alertmanager: tolerations: - key: CriticalAddonsOnly operator: Exists -kubelet: - serviceMonitor: - additionalLabels: - scrape: enabled prometheus: prometheusSpec: tolerations: - key: CriticalAddonsOnly operator: Exists - resources: - requests: - cpu: 1 - memory: 5Gi - limits: - cpu: 1 - memory: 5Gi - serviceMonitorSelector: - matchLabels: - scrape: enabled - serviceMonitorNamespaceSelector: - matchLabels: - scrape: enabled - remoteWrite: - - queueConfig: - maxSamplesPerSend: 1000 - maxShards: 200 - capacity: 2500 \ No newline at end of file +extraScrapeConfigs: | + - job_name: karpenter + kubernetes_sd_configs: + - role: endpoints + 
namespaces: + names: + - karpenter + relabel_configs: + - source_labels: [__meta_kubernetes_endpoint_port_name] + regex: http-metrics + action: keep diff --git a/.github/workflows/e2e-soak-trigger.yaml b/.github/workflows/e2e-soak-trigger.yaml index c412f8a2a2df..19c15ad41017 100644 --- a/.github/workflows/e2e-soak-trigger.yaml +++ b/.github/workflows/e2e-soak-trigger.yaml @@ -1,24 +1,13 @@ name: E2ESoakTrigger on: schedule: - - cron: '0 */3 * * *' + - cron: '0 */1 * * *' workflow_dispatch: -permissions: - id-token: write # This is required for requesting the JWT - contents: read # This is required for actions/checkout - statuses: write -jobs: +jobs: soak: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v3 - - name: configure aws credentials - uses: aws-actions/configure-aws-credentials@v2 - with: - role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }} - aws-region: us-west-2 - role-duration-seconds: 21600 - - name: run the Soak test suite - run: | - aws eks update-kubeconfig --name Soak-testing - TEST_SUITE="Soak" make e2etests \ No newline at end of file + # if: github.repository == 'aws/karpenter' || github.event_name == 'workflow_dispatch' + uses: ./.github/workflows/e2e-soak.yaml + with: + event_name: ${{ github.event_name }} + secrets: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} \ No newline at end of file diff --git a/.github/workflows/e2e-soak.yaml b/.github/workflows/e2e-soak.yaml new file mode 100644 index 000000000000..528b0cc6efcb --- /dev/null +++ b/.github/workflows/e2e-soak.yaml @@ -0,0 +1,132 @@ +name: E2ESoak +on: + workflow_dispatch: + inputs: + git_ref: + type: string + region: + type: choice + options: + - "us-east-2" + - "us-west-2" + default: "us-east-2" + k8s_version: + type: choice + options: + - "1.23" + - "1.24" + - "1.25" + - "1.26" + - "1.27" + default: "1.27" + enable_metrics: + type: boolean + default: false + workflow_call: + inputs: + git_ref: + type: string + region: + type: string + 
default: "us-east-2" + event_name: + type: string + required: true + k8s_version: + type: string + default: "1.27" + enable_metrics: + type: boolean + default: false + secrets: + SLACK_WEBHOOK_URL: + required: true +permissions: + id-token: write # This is required for requesting the JWT + contents: read # This is required for actions/checkout + statuses: write +jobs: + run-suite: + name: suite-Soak + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v3 + with: + ref: ${{ inputs.git_ref }} + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v2 + with: + role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }} + aws-region: ${{ inputs.region }} + role-duration-seconds: 21600 + - uses: ./.github/actions/e2e/install-eksctl + with: + eksctl_version: v0.147.0 + - name: find preexisting cluster + run: | + export PREEXISTING=$(eksctl get cluster -o json | jq -r '.[].Name' | grep soak) + echo "Found existing cluster name \"$PREEXISTING\"" + echo PREEXISTING=$PREEXISTING >> $GITHUB_ENV + - name: generate cluster name + if: env.PREEXISTING == '' + run: | + CLUSTER_NAME=$(echo Soak-$RANDOM$RANDOM | awk '{print tolower($0)}') + echo "Using cluster name \"$CLUSTER_NAME\"" + echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_ENV + - name: create eks cluster '${{ env.CLUSTER_NAME }}' + if: env.PREEXISTING == '' + uses: ./.github/actions/e2e/create-cluster + with: + account_id: ${{ vars.ACCOUNT_ID }} + role: ${{ vars.ROLE_NAME }} + region: ${{ inputs.region }} + cluster_name: ${{ env.CLUSTER_NAME }} + k8s_version: ${{ inputs.k8s_version }} + ip_family: 'IPv4' + git_ref: ${{ inputs.git_ref }} + - name: install prometheus + if: env.PREEXISTING == '' + uses: ./.github/actions/e2e/install-prometheus + with: + account_id: ${{ vars.ACCOUNT_ID }} + role: ${{ vars.ROLE_NAME }} + region: ${{ vars.PROMETHEUS_REGION }} + cluster_name: ${{ env.CLUSTER_NAME }} + workspace_id: ${{ vars.WORKSPACE_ID }} + git_ref: ${{ inputs.git_ref }} + - name: 
install karpenter + if: env.PREEXISTING == '' + uses: ./.github/actions/e2e/install-karpenter + with: + account_id: ${{ vars.ACCOUNT_ID }} + role: ${{ vars.ROLE_NAME }} + region: ${{ inputs.region }} + cluster_name: ${{ env.CLUSTER_NAME }} + git_ref: ${{ inputs.git_ref }} + - name: run the Soak test suite on the preexisting cluster + if: env.PREEXISTING != '' + run: | + aws eks update-kubeconfig --name ${{ env.PREEXISTING }} + TEST_SUITE="Soak" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests + - name: run the Soak test suite on the new cluster + if: env.PREEXISTING == '' + run: | + aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} + TEST_SUITE="Soak" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" make e2etests + - name: notify slack of success or failure + uses: ./.github/actions/e2e/slack/notify + if: (success() || failure()) && inputs.event_name != 'workflow_run' && inputs.event_name != 'conformance' + with: + url: ${{ secrets.SLACK_WEBHOOK_URL }} + suite: Soak + k8s_version: ${{ inputs.k8s_version }} + event_name: ${{ inputs.event_name }} + git_ref: ${{ inputs.git_ref }} + - name: dump logs on failure + uses: ./.github/actions/e2e/dump-logs + if: failure() || cancelled() + with: + account_id: ${{ vars.ACCOUNT_ID }} + role: ${{ vars.ROLE_NAME }} + region: ${{ inputs.region }} + cluster_name: ${{ env.CLUSTER_NAME }} \ No newline at end of file diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index d982cc122062..de26dfaa267f 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -66,6 +66,8 @@ spec: image: {{ include "karpenter.controller.image" . 
}} imagePullPolicy: {{ .Values.imagePullPolicy }} env: + - name: ENABLE_PROFILING + value: "true" - name: KUBERNETES_MIN_VERSION value: "1.19.0-0" - name: KARPENTER_SERVICE diff --git a/test/cloudformation/iam_cloudformation.yaml b/test/cloudformation/iam_cloudformation.yaml index f3bebd59fd0e..da001579058d 100644 --- a/test/cloudformation/iam_cloudformation.yaml +++ b/test/cloudformation/iam_cloudformation.yaml @@ -126,6 +126,7 @@ Resources: Resource: "*" - Effect: Allow Action: + - eks:ListClusters - eks:CreateCluster - eks:CreateAddon - eks:CreateNodegroup diff --git a/test/pkg/environment/common/setup.go b/test/pkg/environment/common/setup.go index 8cf21c6b5d64..da85856b035b 100644 --- a/test/pkg/environment/common/setup.go +++ b/test/pkg/environment/common/setup.go @@ -76,6 +76,7 @@ func (env *Environment) ExpectCleanCluster() { var nodes v1.NodeList Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) for _, node := range nodes.Items { + fmt.Println(node.Name) if len(node.Spec.Taints) == 0 && !node.Spec.Unschedulable { Fail(fmt.Sprintf("expected system pool node %s to be tainted", node.Name)) } diff --git a/test/suites/soak/suite_test.go b/test/suites/soak/suite_test.go index c97d276d2894..63ec4599d2f0 100644 --- a/test/suites/soak/suite_test.go +++ b/test/suites/soak/suite_test.go @@ -62,10 +62,14 @@ var _ = Describe("Soak", func() { ctx, cancel := context.WithCancel(env.Context) defer cancel() + // content, err := os.ReadFile("testdata/user.sh") + // Expect(err).NotTo(HaveOccurred()) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": settings.FromContext(env.Context).ClusterName}, - }}) + }, + // UserData: awssdk.String(string(content)), + }) provisioner := test.Provisioner(test.ProvisionerOptions{ ObjectMeta: metav1.ObjectMeta{ Name: 
"sock-test-provisioner", @@ -110,11 +114,11 @@ var _ = Describe("Soak", func() { time.Sleep(time.Second * 10) Consistently(func(g Gomega) { - dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(20) + 1)) - env.ExpectUpdated(dep) - time.Sleep(time.Minute * 1) + dep.Spec.Replicas = awssdk.Int32(int32(rand.Intn(100) + 1)) + env.ExpectCreatedOrUpdated(dep) + time.Sleep(time.Minute * 5) dep.Spec.Replicas = awssdk.Int32(0) - env.ExpectUpdated(dep) + env.ExpectCreatedOrUpdated(dep) time.Sleep(time.Second * 30) }, time.Hour*2).Should(Succeed()) env.ExpectDeleted(provisioner, provider, dep) diff --git a/test/suites/soak/testdata/user.sh b/test/suites/soak/testdata/user.sh new file mode 100644 index 000000000000..5af30bdf1ef5 --- /dev/null +++ b/test/suites/soak/testdata/user.sh @@ -0,0 +1,28 @@ +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="BOUNDARY" + +--BOUNDARY +Content-Type: text/x-shellscript; charset="us-ascii" + +#!/bin/bash +mkdir -p /etc/systemd/logind.conf.d +cat << EOF > /etc/systemd/logind.conf.d/50-max-delay.conf +[Login] +InhibitDelayMaxSec=360 +EOF + +systemctl restart systemd-logind + +sed -i '/"apiVersion*/a \ \ "shutdownGracePeriod": "3m",' /etc/kubernetes/kubelet/kubelet-config.json +sed -i '/"shutdownGracePeriod*/a \ \ "shutdownGracePeriodCriticalPods": "2m",' /etc/kubernetes/kubelet/kubelet-config.json + +--BOUNDARY +Content-Type: text/x-shellscript; charset="us-ascii" + +#!/bin/bash +exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 + + +echo $(jq '.containerLogMaxFiles=3|.containerLogMaxSize="100Mi"' /etc/kubernetes/kubelet/kubelet-config.json) > /etc/kubernetes/kubelet/kubelet-config.json + +--BOUNDARY-- \ No newline at end of file