diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml
index dec40174c4e3..4fde8855eba8 100644
--- a/.github/actions/e2e/install-karpenter/action.yaml
+++ b/.github/actions/e2e/install-karpenter/action.yaml
@@ -36,18 +36,13 @@ runs:
       kubectl create ns karpenter || true
       kubectl label ns karpenter scrape=enabled --overwrite=true
       kubectl label ns karpenter pod-security.kubernetes.io/enforce=restricted --overwrite=true
-  - name: login to ecr via docker
-    uses: docker/login-action@v3
-    with:
-      registry: ${{ inputs.ecr_account_id }}.dkr.ecr.${{ inputs.ecr_region }}.amazonaws.com
-      logout: true
   - name: install-karpenter
     shell: bash
     run: |
       aws eks update-kubeconfig --name "${{ inputs.cluster_name }}"
-      helm upgrade --install karpenter oci://${{ inputs.ecr_account_id }}.dkr.ecr.${{ inputs.ecr_region }}.amazonaws.com/karpenter/snapshot/karpenter \
+      helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
         -n karpenter \
-        --version "v0-$(git rev-parse HEAD)" \
+        --version "v0.31.1" \
         --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::${{ inputs.account_id }}:role/karpenter-irsa-${{ inputs.cluster_name }}" \
         --set settings.clusterName="${{ inputs.cluster_name }}" \
         --set settings.aws.defaultInstanceProfile="KarpenterNodeInstanceProfile-${{ inputs.cluster_name }}" \
diff --git a/.github/workflows/e2e-sock-trigger.yaml b/.github/workflows/e2e-sock-trigger.yaml
new file mode 100644
index 000000000000..86d341432087
--- /dev/null
+++ b/.github/workflows/e2e-sock-trigger.yaml
@@ -0,0 +1,67 @@
+name: E2ESoakTrigger # runs the Integration suite against a long-lived "soak" cluster, creating it only if none exists
+on:
+  schedule:
+    - cron: '0 */3 * * *' # every three hours
+  workflow_run:
+    workflows: [ApprovalComment]
+    types: [completed]
+  workflow_dispatch:
+    inputs:
+      region:
+        required: true
+        default: 'us-east-1'
+        type: choice
+        options:
+          - "us-east-1"
+          - "us-west-2"
+      cleanup: # NOTE(review): declared but not referenced by any job in this workflow
+        required: true
+        default: true
+        type: boolean
+permissions:
+  id-token: write # aws-actions/configure-aws-credentials@v4.0.1
+  statuses: write # ./.github/actions/commit-status/start
+jobs:
+  reslove_cluster: # decides whether a soak cluster must be created or an existing one can be reused
+    runs-on: ubuntu-latest
+    outputs:
+      CREATE_CLUSTER: ${{ steps.create_cluster.outputs.CREATE_CLUSTER }} # the string 'true' or 'false'
+      PREEXISTING: ${{ steps.create_cluster.outputs.PREEXISTING }} # name of the reused cluster; unset when one must be created
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.git_ref }} # NOTE(review): no git_ref input is declared in this workflow - confirm the default ref is intended
+      - name: configure aws credentials
+        uses: aws-actions/configure-aws-credentials@v4.0.1
+        with:
+          role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }}
+          aws-region: ${{ inputs.region || 'us-east-1' }}
+          role-duration-seconds: 21600
+      - uses: ./.github/actions/e2e/install-eksctl
+        with:
+          version: v0.163.0
+      - id: create_cluster
+        run: |
+          PREEXISTING=$(eksctl get cluster -o json | jq -r '.[].Name' | grep soak | head -n 1 || true) # -r drops the JSON quotes; head keeps the output value single-line
+          echo "Found existing cluster name \"$PREEXISTING\""
+          if [[ -n "$PREEXISTING" ]]; then
+            echo "Reusing the existing soak cluster; cluster creation will be skipped"
+            echo "PREEXISTING=$PREEXISTING" >> "$GITHUB_OUTPUT"
+            echo "CREATE_CLUSTER=false" >> "$GITHUB_OUTPUT"
+          else
+            echo "No soak cluster found; a new one will be created"
+            echo "CREATE_CLUSTER=true" >> "$GITHUB_OUTPUT"
+          fi
+  sock: # runs the suite through the reusable e2e workflow
+    needs: [reslove_cluster]
+    uses: ./.github/workflows/e2e.yaml
+    with:
+      suite: Integration
+      region: ${{ inputs.region || 'us-east-1' }}
+      workflow_trigger: "soak"
+      create_cluster: ${{ needs.reslove_cluster.outputs.CREATE_CLUSTER }}
+      preexisiting_cluster: ${{ needs.reslove_cluster.outputs.PREEXISTING || '' }}
+      # Keep the cluster after the run; reslove_cluster looks for an existing soak cluster on the next trigger
+      cleanup: false
+    secrets:
+      SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }}
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index d2dee5362735..68684cd70ba0 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -73,6 +73,10 @@ on:
         required: true
       workflow_trigger:
         type: string
+      preexisiting_cluster: # name of a cluster to reuse; used as the fallback when no cluster name is generated
+        type: string
+      create_cluster: # the string 'false' skips cluster creation and setup; any other value (including unset) creates one
+        type: string
     secrets:
       SLACK_WEBHOOK_URL:
         required: true
@@ -100,15 +104,23 @@ jobs:
           aws-region: ${{ inputs.region }}
           role-duration-seconds: 21600
       - name: add jitter on cluster creation
+        if: inputs.create_cluster != 'false' # string input: a bare 'false' would be truthy, and an omitted input ('') must still create a cluster
         run: |
           # Creating jitter so that we can stagger cluster creation to avoid throttling
           sleep $(( $RANDOM % 300 + 1 ))
       - id: generate-cluster-name
+        name: generate-cluster-name
+        if: inputs.create_cluster != 'false'
         run: |
-          CLUSTER_NAME=$(echo ${{ inputs.suite }}-$RANDOM$RANDOM | awk '{print tolower($0)}' | tr / -)
+          if [[ "${{ inputs.workflow_trigger }}" == 'soak' ]]; then # quoted so an empty workflow_trigger is not a bash syntax error
+            CLUSTER_NAME=$(echo ${{ inputs.workflow_trigger }}-$RANDOM$RANDOM | awk '{print tolower($0)}' | tr / -)
+          else
+            CLUSTER_NAME=$(echo ${{ inputs.suite }}-$RANDOM$RANDOM | awk '{print tolower($0)}' | tr / -)
+          fi
           echo "Using cluster name \"$CLUSTER_NAME\""
           echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_OUTPUT
-      - name: create eks cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}'
+      - name: create eks cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME || inputs.preexisiting_cluster }}'
+        if: inputs.create_cluster != 'false'
         uses: ./.github/actions/e2e/create-cluster
         with:
           account_id: ${{ vars.ACCOUNT_ID }}
@@ -120,6 +132,7 @@ jobs:
           ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4
           git_ref: ${{ inputs.git_ref }}
       - name: install prometheus
+        if: inputs.create_cluster != 'false'
         uses: ./.github/actions/e2e/install-prometheus
         with:
           account_id: ${{ vars.ACCOUNT_ID }}
@@ -129,6 +142,7 @@ jobs:
           workspace_id: ${{ vars.WORKSPACE_ID }}
           git_ref: ${{ inputs.git_ref }}
       - name: install karpenter
+        if: inputs.create_cluster != 'false'
         uses: ./.github/actions/e2e/install-karpenter
         with:
           account_id: ${{ vars.ACCOUNT_ID }}
@@ -140,10 +154,10 @@ jobs:
           git_ref: ${{ inputs.git_ref }}
       - name: run the ${{ inputs.suite }} test suite
         run: |
-          aws eks update-kubeconfig --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}
+          aws eks update-kubeconfig --name "${{ steps.generate-cluster-name.outputs.CLUSTER_NAME || inputs.preexisiting_cluster }}"
           TEST_SUITE="${{ inputs.suite }}" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" \
-            CLUSTER_NAME="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}" CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" \
-            INTERRUPTION_QUEUE="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}" make e2etests
+            CLUSTER_NAME="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME || inputs.preexisiting_cluster }}" CLUSTER_ENDPOINT="$(aws eks describe-cluster --name "${{ steps.generate-cluster-name.outputs.CLUSTER_NAME || inputs.preexisiting_cluster }}" --query "cluster.endpoint" --output text)" \
+            INTERRUPTION_QUEUE="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME || inputs.preexisiting_cluster }}" make e2etests
       - name: notify slack of success or failure
         uses: ./.github/actions/e2e/slack/notify
         if: (success() || failure()) && github.event_name != 'workflow_run' && inputs.workflow_trigger != 'versionCompatibility'
diff --git a/test/cloudformation/iam_cloudformation.yaml b/test/cloudformation/iam_cloudformation.yaml
index 5d47f3f6cc13..3f73ff365c0e 100644
--- a/test/cloudformation/iam_cloudformation.yaml
+++ b/test/cloudformation/iam_cloudformation.yaml
@@ -249,6 +249,21 @@ Resources:
           - Effect: Allow
             Action: timestream:DescribeEndpoints
             Resource: "*"
+  GithubActionsPolicySecond: # extra managed policy attached to GithubActionsRole; grants region-scoped EKS cluster listing
+    Type: AWS::IAM::ManagedPolicy
+    Properties:
+      ManagedPolicyName: GithubActionsPolicySecond
+      PolicyDocument:
+        Version: '2012-10-17'
+        Statement:
+          - Effect: Allow
+            Action:
+              - eks:ListClusters
+            Resource: "*" # eks:ListClusters has no resource-level permissions, so a cluster ARN pattern never matches it
+            Condition:
+              StringEquals:
+                aws:RequestedRegion:
+                  Ref: Regions
   # GithubActionsPermissionsBoundary includes all permissions needed for all designated roles provisioned by the GithubActions
   # CI task. This includes the cluster ServiceRoles that are generated by EKSCTL and all roles generated with IRSA to interface from the
   # cluster into AWS services through IAM.
@@ -366,6 +381,7 @@ Resources:
       RoleName: GithubActionsRole
       ManagedPolicyArns:
         - !Ref GithubActionsPolicy
+        - !Ref GithubActionsPolicySecond
         - !Ref GithubActionsPermissionsBoundary
       MaxSessionDuration: 21600 # 6 hours is the max session for GHA
       AssumeRolePolicyDocument: