From dc925db08824444666049702859d7b2483b87f9d Mon Sep 17 00:00:00 2001 From: jigisha620 Date: Mon, 1 Apr 2024 16:07:32 -0700 Subject: [PATCH] Changes to run e2e for private cluster --- .github/actions/e2e/cleanup/action.yaml | 4 + .github/actions/e2e/install-helm/action.yaml | 7 +- .../actions/e2e/install-karpenter/action.yaml | 44 +---- .../e2e/install-prometheus/action.yaml | 28 +-- .../e2e/run-tests-private-cluster/action.yaml | 159 ++++++++++++++++++ .github/actions/e2e/setup-cluster/action.yaml | 44 ++--- .github/actions/e2e/slack/notify/action.yaml | 5 + .github/dependabot.yaml | 8 + .../e2e-private-cluster-trigger.yaml | 18 ++ .github/workflows/e2e.yaml | 38 ++++- .../hack/e2e_scripts/clean_private_cluster.sh | 35 ++++ .../e2e_scripts/configure_private_cluster.sh | 83 +++++++++ test/hack/e2e_scripts/diff_karpenter.sh | 9 + test/hack/e2e_scripts/install_helm.sh | 4 + test/hack/e2e_scripts/install_karpenter.sh | 43 +++++ test/hack/e2e_scripts/install_prometheus.sh | 31 ++++ .../noderole_bootstrap_permission.sh | 7 + test/hack/resource/clean/main.go | 1 + test/hack/resource/count/main.go | 1 + .../resourcetypes/vpc_peering_connection.go | 133 +++++++++++++++ test/pkg/environment/aws/environment.go | 18 +- .../integration/extended_resources_test.go | 5 + .../integration/instance_profile_test.go | 5 + test/suites/integration/scheduling_test.go | 2 +- test/suites/integration/tags_test.go | 5 + 25 files changed, 638 insertions(+), 99 deletions(-) create mode 100644 .github/actions/e2e/run-tests-private-cluster/action.yaml create mode 100644 .github/workflows/e2e-private-cluster-trigger.yaml create mode 100755 test/hack/e2e_scripts/clean_private_cluster.sh create mode 100755 test/hack/e2e_scripts/configure_private_cluster.sh create mode 100755 test/hack/e2e_scripts/diff_karpenter.sh create mode 100755 test/hack/e2e_scripts/install_helm.sh create mode 100755 test/hack/e2e_scripts/install_karpenter.sh create mode 100755 test/hack/e2e_scripts/install_prometheus.sh create mode 100755 test/hack/e2e_scripts/noderole_bootstrap_permission.sh create mode 100644 test/hack/resource/pkg/resourcetypes/vpc_peering_connection.go diff --git a/.github/actions/e2e/cleanup/action.yaml b/.github/actions/e2e/cleanup/action.yaml index d352537b4c39..7237012066c6 100644 --- a/.github/actions/e2e/cleanup/action.yaml +++ b/.github/actions/e2e/cleanup/action.yaml @@ -18,6 +18,9 @@ inputs: eksctl_version: description: "Version of eksctl to install" default: v0.169.0 + private_cluster: + description: "Whether the cluster that has to be deleted is private or not. 
Valid values are 'true' or 'false'" + default: 'false' runs: using: "composite" steps: @@ -28,6 +31,7 @@ runs: with: version: ${{ inputs.eksctl_version }} - name: delete-cluster + if: ${{ inputs.private_cluster == 'false' }} shell: bash env: CLUSTER_NAME: ${{ inputs.cluster_name }} diff --git a/.github/actions/e2e/install-helm/action.yaml b/.github/actions/e2e/install-helm/action.yaml index 43c807b1b9ac..bd141ae413ab 100644 --- a/.github/actions/e2e/install-helm/action.yaml +++ b/.github/actions/e2e/install-helm/action.yaml @@ -10,12 +10,9 @@ runs: - name: install helm shell: bash env: - VERSION: ${{ inputs.version }} + HELM_VERSION: ${{ inputs.version }} run: | - TEMPDIR=$(mktemp -d) - curl -fsSL -o "${TEMPDIR}/get_helm.sh" https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 - chmod 700 "${TEMPDIR}/get_helm.sh" - "${TEMPDIR}/get_helm.sh" --version "$VERSION" + ./test/hack/e2e_scripts/install_helm.sh - name: install helm-diff shell: bash run: | diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml index e8615a8026bf..137f6237bb26 100644 --- a/.github/actions/e2e/install-karpenter/action.yaml +++ b/.github/actions/e2e/install-karpenter/action.yaml @@ -24,6 +24,9 @@ inputs: default: "1.29" git_ref: description: "The git commit, tag, or branch to check out. Requires a corresponding Karpenter snapshot release" + private_cluster: + description: "Whether the cluster is private or not. Valid values are 'true' or 'false'" + default: 'false' runs: using: "composite" steps: @@ -53,48 +56,13 @@ runs: ACCOUNT_ID: ${{ inputs.account_id }} CLUSTER_NAME: ${{ inputs.cluster_name }} K8S_VERSION: ${{ inputs.k8s_version }} + PRIVATE_CLUSTER: ${{ inputs.private_cluster }} run: | - aws eks update-kubeconfig --name "$CLUSTER_NAME" - - # Parse minor version to determine whether to enable the webhooks - K8S_VERSION_MINOR="${K8S_VERSION#*.}" - WEBHOOK_ENABLED=false - if (( K8S_VERSION_MINOR < 25 )); then - WEBHOOK_ENABLED=true - fi - - # Remove service account annotation when dropping support for 1.23 - helm upgrade --install karpenter "oci://$ECR_ACCOUNT_ID.dkr.ecr.$ECR_REGION.amazonaws.com/karpenter/snapshot/karpenter" \ - -n kube-system \ - --version "0-$(git rev-parse HEAD)" \ - --set logLevel=debug \ - --set webhook.enabled=${WEBHOOK_ENABLED} \ - --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::$ACCOUNT_ID:role/karpenter-irsa-$CLUSTER_NAME" \ - --set settings.clusterName="$CLUSTER_NAME" \ - --set settings.interruptionQueue="$CLUSTER_NAME" \ - --set settings.featureGates.spotToSpotConsolidation=true \ - --set controller.resources.requests.cpu=3 \ - --set controller.resources.requests.memory=3Gi \ - --set controller.resources.limits.cpu=3 \ - --set controller.resources.limits.memory=3Gi \ - --set serviceMonitor.enabled=true \ - --set serviceMonitor.additionalLabels.scrape=enabled \ - --set "serviceMonitor.endpointConfig.relabelings[0].targetLabel=clusterName" \ - --set "serviceMonitor.endpointConfig.relabelings[0].replacement=$CLUSTER_NAME" \ - --set "serviceMonitor.endpointConfig.relabelings[1].targetLabel=gitRef" \ - --set "serviceMonitor.endpointConfig.relabelings[1].replacement=$(git rev-parse HEAD)" \ - --set "serviceMonitor.endpointConfig.relabelings[2].targetLabel=mostRecentTag" \ - --set "serviceMonitor.endpointConfig.relabelings[2].replacement=$(git describe --abbrev=0 --tags)" \ - --set "serviceMonitor.endpointConfig.relabelings[3].targetLabel=commitsAfterTag" \ - --set 
"serviceMonitor.endpointConfig.relabelings[3].replacement=\"$(git describe --tags | cut -d '-' -f 2)\"" \ - --wait + ./test/hack/e2e_scripts/install_karpenter.sh - name: diff-karpenter shell: bash env: ECR_ACCOUNT_ID: ${{ inputs.ecr_account_id }} ECR_REGION: ${{ inputs.ecr_region }} run: | - helm diff upgrade --namespace kube-system \ - karpenter oci://$ECR_ACCOUNT_ID.dkr.ecr.$ECR_REGION.amazonaws.com/karpenter/snapshot/karpenter \ - --version 0-$(git rev-parse HEAD) \ - --reuse-values --three-way-merge --detailed-exitcode + ./test/hack/e2e_scripts/diff_karpenter.sh diff --git a/.github/actions/e2e/install-prometheus/action.yaml b/.github/actions/e2e/install-prometheus/action.yaml index ef44de3c47d1..f80dd138c8a6 100644 --- a/.github/actions/e2e/install-prometheus/action.yaml +++ b/.github/actions/e2e/install-prometheus/action.yaml @@ -10,6 +10,9 @@ inputs: region: description: "Region to access AWS" required: true + prometheus_region: + description: "Prometheus region" + required: true cluster_name: description: 'Name of the cluster to be launched by eksctl' required: true @@ -18,6 +21,9 @@ inputs: required: true git_ref: description: "The git commit, tag, or branch to check out. Requires a corresponding Karpenter snapshot release" + private_cluster: + description: "Whether the cluster is private or not. Valid values are 'true' or 'false'" + default: 'false' runs: using: "composite" steps: @@ -39,27 +45,11 @@ runs: - name: install prometheus shell: bash env: + PROMETHEUS_REGION: ${{ inputs.prometheus_region }} REGION: ${{ inputs.region }} WORKSPACE_ID: ${{ inputs.workspace_id }} ACCOUNT_ID: ${{ inputs.account_id }} CLUSTER_NAME: ${{ inputs.cluster_name }} + PRIVATE_CLUSTER: ${{ inputs.private_cluster }} run: | - # Remove service account annotation when dropping support for 1.23 - helm upgrade --install prometheus prometheus-community/kube-prometheus-stack \ - -n prometheus \ - -f ./.github/actions/e2e/install-prometheus/values.yaml \ - --set prometheus.prometheusSpec.remoteWrite[0].url=https://aps-workspaces.$REGION.amazonaws.com/workspaces/$WORKSPACE_ID/api/v1/remote_write \ - --set prometheus.prometheusSpec.remoteWrite[0].sigv4.region=$REGION \ - --set prometheus.serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::$ACCOUNT_ID:role/prometheus-irsa-$CLUSTER_NAME" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[0].targetLabel=metrics_path" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[0].action=replace" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[0].sourceLabels[0]=__metrics_path__" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[1].targetLabel=clusterName" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[1].replacement=$CLUSTER_NAME" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[2].targetLabel=gitRef" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[2].replacement=$(git rev-parse HEAD)" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[3].targetLabel=mostRecentTag" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[3].replacement=$(git describe --abbrev=0 --tags)" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[4].targetLabel=commitsAfterTag" \ - --set "kubelet.serviceMonitor.cAdvisorRelabelings[4].replacement=\"$(git describe --tags | cut -d '-' -f 2)\"" \ - --wait + ./test/hack/e2e_scripts/install_prometheus.sh \ No newline at end of file diff --git a/.github/actions/e2e/run-tests-private-cluster/action.yaml b/.github/actions/e2e/run-tests-private-cluster/action.yaml new file mode 100644 index 
000000000000..6ae4604cfec3 --- /dev/null +++ b/.github/actions/e2e/run-tests-private-cluster/action.yaml @@ -0,0 +1,159 @@ +name: RunTestsPrivateCluster +description: 'Installs Karpenter, Prometheus, runs tests on private cluster and performs clean up' +inputs: + account_id: + description: "Account ID to access AWS" + required: true + suite: + type: string + required: true + ecr_account_id: + description: "Account ID to access ECR Repository" + required: true + prometheus_workspace_id: + description: "Workspace ID for the Prometheus workspace" + required: true + metrics_region: + description: "Metrics region" + required: true + node_role: + description: "Private cluster node role" + required: true + region: + description: "Region to access AWS" + required: true + ecr_region: + description: "Region to access ECR Repository" + required: true + prometheus_region: + description: Region to access Prometheus + required: true + cluster_name: + description: 'Name of the cluster to be launched by eksctl' + required: true + k8s_version: + description: 'Version of Kubernetes to use for the launched cluster' + default: "1.29" + private_cluster: + description: "Whether to create a private cluster which does not add access to the public internet. Valid values are 'true' or 'false'" + default: 'false' + enable_metrics: + description: "Whether to enable metrics for the cluster" + default: 'false' + codebuild_sg: + description: "Codebuild security group to run private cluster tests" + required: true + codebuild_vpc: + description: "Codebuild VPC to run private cluster tests" + required: true + cleanup: + description: "Whether to cleanup resources on failure" + default: 'false' +runs: + using: "composite" + steps: + - name: login to ecr via docker + uses: docker/login-action@343f7c4344506bcbf9b4de18042ae17996df046d # v3.0.0 + with: + registry: ${{ inputs.account_id }}.dkr.ecr.${{ inputs.region }}.amazonaws.com + logout: true + - name: configure private cluster + if: ${{ inputs.private_cluster }} + shell: bash + env: + REGION: ${{ inputs.region }} + CLUSTER_NAME: ${{ inputs.cluster_name }} + ACCOUNT_ID: ${{ inputs.account_id }} + REPOSITORY: ${{ github.repository }} + RUN_ID: ${{ github.run_id }} + CODEBUILD_SG: ${{ inputs.codebuild_sg }} + CODEBUILD_VPC: ${{ inputs.codebuild_vpc }} + run: | + ./test/hack/e2e_scripts/configure_private_cluster.sh + - name: run private cluster tests on codebuild + env: + SUITE: ${{ inputs.suite }} + CLUSTER_NAME: ${{ inputs.cluster_name }} + INTERRUPTION_QUEUE: ${{ inputs.cluster_name }} + REGION: ${{ inputs.region }} + HELM_VERSION: v3.12.3 # Pinned to this version since v3.13.0 has issues with anonymous pulls: https://github.com/helm/helm/issues/12423 + PROMETHEUS_REGION: ${{ inputs.prometheus_region }} + WORKSPACE_ID: ${{ inputs.prometheus_workspace_id }} + ACCOUNT_ID: ${{ inputs.account_id }} + K8S_VERSION: ${{ inputs.k8s_version }} + ECR_ACCOUNT_ID: ${{ inputs.ecr_account_id }} + ECR_REGION: ${{ inputs.ecr_region }} + PRIVATE_CLUSTER: ${{ inputs.private_cluster }} + ENABLE_METRICS: ${{ inputs.enable_metrics }} + METRICS_REGION: ${{ inputs.metrics_region }} + VPC_PEERING_CONNECTION_ID: ${{ env.VPC_PEERING_CONNECTION_ID }} + NODE_ROLE: ${{ env.NODE_ROLE }} + SG_CB: ${{ inputs.codebuild_sg }} + VPC_CB: ${{ inputs.codebuild_vpc }} + CLUSTER_VPC_ID: ${{ env.CLUSTER_VPC_ID }} + EKS_CLUSTER_SG: ${{ env.EKS_CLUSTER_SG }} + CLEANUP: ${{ inputs.cleanup }} + uses: aws-actions/aws-codebuild-run-build@bafa4d8b0d8802b5adf3a54861f530792d2e4f24 #v1.0.15 + with: + project-name: 
E2EPrivateClusterCodeBuildProject-us-east-1 + buildspec-override: | + version: 0.2 + phases: + install: + commands: + # Make sure goenv is up to date + - cd $HOME/.goenv && git pull --ff-only && cd - + # Install Go 1.22 + - goenv install 1.22 && goenv global 1.22 + build: + commands: + - aws eks update-kubeconfig --name $CLUSTER_NAME + - ./test/hack/e2e_scripts/noderole_bootstrap_permission.sh + - ./test/hack/e2e_scripts/install_helm.sh + - helm plugin install https://github.com/databus23/helm-diff || true + - aws ecr get-login-password --region $REGION | docker login --username AWS --password-stdin $ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com + - helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + - helm pull prometheus-community/kube-prometheus-stack + - kubectl create ns prometheus || true + - kubectl label ns prometheus scrape=enabled --overwrite=true + - ./test/hack/e2e_scripts/install_prometheus.sh + - kubectl label ns kube-system scrape=enabled --overwrite=true + - kubectl label ns kube-system pod-security.kubernetes.io/warn=restricted --overwrite=true + - ./test/hack/e2e_scripts/install_karpenter.sh + - ./test/hack/e2e_scripts/diff_karpenter.sh + - kubectl delete nodepool --all + - kubectl delete ec2nodeclass --all + - kubectl delete deployment --all + - PRIVATE_CLUSTER=$CLUSTER_NAME TEST_SUITE=$SUITE ENABLE_METRICS=$ENABLE_METRICS METRICS_REGION=$METRICS_REGION GIT_REF="$(git rev-parse HEAD)" CLUSTER_NAME=$CLUSTER_NAME CLUSTER_ENDPOINT="$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint" --output text)" INTERRUPTION_QUEUE=$CLUSTER_NAME make e2etests + post_build: + commands: + # Describe karpenter pods + - kubectl describe pods -n kube-system -l app.kubernetes.io/name=karpenter + # Describe nodes + - kubectl describe nodes + - | + if [ "${CLEANUP}" = true ]; then + ./test/hack/e2e_scripts/clean_private_cluster.sh + fi + env-vars-for-codebuild: | + SUITE, + CLUSTER_NAME, + INTERRUPTION_QUEUE, + REGION, + HELM_VERSION, + PROMETHEUS_REGION, + WORKSPACE_ID, + ACCOUNT_ID, + K8S_VERSION, + ECR_ACCOUNT_ID, + ECR_REGION, + PRIVATE_CLUSTER, + ENABLE_METRICS, + METRICS_REGION, + VPC_PEERING_CONNECTION_ID, + NODE_ROLE, + SG_CB, + VPC_CB, + CLUSTER_VPC_ID, + EKS_CLUSTER_SG, + CLEANUP \ No newline at end of file diff --git a/.github/actions/e2e/setup-cluster/action.yaml b/.github/actions/e2e/setup-cluster/action.yaml index 1546aaf5872c..c829662294f0 100644 --- a/.github/actions/e2e/setup-cluster/action.yaml +++ b/.github/actions/e2e/setup-cluster/action.yaml @@ -45,6 +45,8 @@ inputs: cleanup: description: "Whether to cleanup resources on failure" default: 'false' + codebuild_role: + description: "Codebuild Role that must be given an access entry in case of private cluster" runs: using: "composite" steps: @@ -88,6 +90,7 @@ runs: GIT_REF: ${{ inputs.git_ref }} ENABLE_LOCAL_ZONES: ${{ inputs.enable_local_zones }} CLEANUP: ${{ inputs.cleanup }} + CODEBUILD_ROLE: ${{ inputs.codebuild_role }} run: | if [[ "$GIT_REF" == '' ]]; then GIT_REF=$(git rev-parse HEAD) @@ -95,7 +98,7 @@ runs: # Disable Pod Identity for Karpenter on K8s 1.23. 
Pod Identity is not supported on K8s 1.23 # https://docs.aws.amazon.com/eks/latest/userguide/pod-identities.html#pod-id-considerations - if [[ "$K8S_VERSION" == '1.23' ]]; then + if [[ "$K8S_VERSION" == '1.23' ]] || [[ "$PRIVATE_CLUSTER" == 'true' ]]; then KARPENTER_IAM=""" - metadata: name: karpenter @@ -195,6 +198,11 @@ runs: if [[ $PRIVATE_CLUSTER == 'true' ]]; then yq -i '.privateCluster.enabled=true' clusterconfig.yaml yq -i '.managedNodeGroups[0].privateNetworking=true' clusterconfig.yaml + yq -i '.accessConfig.authenticationMode="API_AND_CONFIG_MAP"' clusterconfig.yaml + CODEBUILD_ROLE_ARN="arn:aws:iam::$ACCOUNT_ID:role/$CODEBUILD_ROLE" + yq -i ".accessConfig.accessEntries[0].principalARN=\"$CODEBUILD_ROLE_ARN\"" clusterconfig.yaml + yq -i '.accessConfig.accessEntries[0].accessPolicies[0].policyARN="arn:aws:eks::aws:cluster-access-policy/AmazonEKSClusterAdminPolicy"' clusterconfig.yaml + yq -i '.accessConfig.accessEntries[0].accessPolicies[0].accessScope.type="cluster"' clusterconfig.yaml fi # Disable rollback of the CloudFormation on Create if we aren't cleaning up the run @@ -203,25 +211,6 @@ runs: else eksctl ${cmd} cluster -f clusterconfig.yaml fi - - # Add the SQS and SSM VPC endpoints if we are creating a private cluster - # We need to grab all of the VPC details for the cluster in order to add the endpoint - if [[ $PRIVATE_CLUSTER == 'true' ]]; then - VPC_CONFIG=$(aws eks describe-cluster --name "$CLUSTER_NAME" --query "cluster.resourcesVpcConfig") - VPC_ID=$(echo $VPC_CONFIG | jq .vpcId -r) - SUBNET_IDS=($(echo $VPC_CONFIG | jq '.subnetIds | join(" ")' -r)) - SECURITY_GROUP_IDS=($(echo $VPC_CONFIG | jq '.securityGroupIds | join(" ")' -r)) - - for SERVICE in "com.amazonaws.$REGION.ssm" "com.amazonaws.$REGION.sqs"; do - aws ec2 create-vpc-endpoint \ - --vpc-id "${VPC_ID}" \ - --vpc-endpoint-type Interface \ - --service-name "${SERVICE}" \ - --subnet-ids ${SUBNET_IDS[@]} \ - --security-group-ids ${SECURITY_GROUP_IDS[@]} \ - --tags "testing/type=e2e" "testing/cluster=$CLUSTER_NAME" "github.com/run-url=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }}" "karpenter.sh/discovery=$CLUSTER_NAME" - done - fi - name: tag oidc provider of the cluster if: always() shell: bash @@ -238,18 +227,13 @@ runs: aws iam tag-open-id-connect-provider --open-id-connect-provider-arn $arn \ --tags Key=test/git_ref,Value=$GIT_REF Key=testing/type,Value=e2e Key=testing/cluster,Value=$CLUSTER_NAME Key=github.com/run-url,Value=https://github.com/${{ github.repository }}/actions/runs/${{ github.run_id }} - name: give KarpenterNodeRole permission to bootstrap + if: ${{ inputs.private_cluster == 'false' }} shell: bash env: ACCOUNT_ID: ${{ inputs.account_id }} CLUSTER_NAME: ${{ inputs.cluster_name }} run: | - eksctl create iamidentitymapping \ - --username system:node:{{EC2PrivateDNSName}} \ - --cluster "$CLUSTER_NAME" \ - --arn "arn:aws:iam::$ACCOUNT_ID:role/KarpenterNodeRole-$CLUSTER_NAME" \ - --group system:bootstrappers \ - --group system:nodes \ - --group eks:kube-proxy-windows + ./test/hack/e2e_scripts/noderole_bootstrap_permission.sh - name: cloudformation describe stack events shell: bash if: failure() @@ -262,15 +246,18 @@ runs: aws cloudformation describe-stack-events --stack-name $stack_name done - name: install prometheus + if: ${{ inputs.private_cluster == 'false' }} uses: ./.github/actions/e2e/install-prometheus with: account_id: ${{ inputs.account_id }} role: ${{ inputs.role }} - region: ${{ inputs.prometheus_region }} + prometheus_region: ${{ inputs.prometheus_region 
}} + region: ${{ inputs.region }} cluster_name: ${{ inputs.cluster_name }} workspace_id: ${{ inputs.prometheus_workspace_id }} git_ref: ${{ inputs.git_ref }} - name: install karpenter + if: ${{ inputs.private_cluster == 'false' }} uses: ./.github/actions/e2e/install-karpenter with: account_id: ${{ inputs.account_id }} @@ -281,3 +268,4 @@ runs: cluster_name: ${{ inputs.cluster_name }} k8s_version: ${{ inputs.k8s_version }} git_ref: ${{ inputs.git_ref }} + private_cluster: ${{ inputs.private_cluster }} diff --git a/.github/actions/e2e/slack/notify/action.yaml b/.github/actions/e2e/slack/notify/action.yaml index b28c3b62e29e..a756c62ec062 100644 --- a/.github/actions/e2e/slack/notify/action.yaml +++ b/.github/actions/e2e/slack/notify/action.yaml @@ -12,6 +12,8 @@ inputs: required: true git_ref: description: "The git commit, tag, or branch to check out. Requires a corresponding Karpenter snapshot release" + workflow_trigger: + description: "Workflow trigger for this run" runs: using: "composite" steps: @@ -23,6 +25,9 @@ runs: env: SUITE: ${{ inputs.suite }} run: | + if [[ ${{ inputs.workflow_trigger == 'private_cluster' || inputs.workflow_trigger == 'soak'}} ]]; then + SUITE=${{ inputs.workflow_trigger }} + fi if [[ ${{ github.event_name }} == "schedule" && inputs.suite != "soak" ]]; then RUN_NAME="$SUITE-periodic" elif [[ ${{ github.event_name }} == "schedule" ]]; then diff --git a/.github/dependabot.yaml b/.github/dependabot.yaml index 0380b07dfb24..025117413f37 100644 --- a/.github/dependabot.yaml +++ b/.github/dependabot.yaml @@ -122,6 +122,14 @@ updates: action-deps: patterns: - '*' + - package-ecosystem: github-actions + directory: .github/actions/e2e/run-tests-private-cluster + schedule: + interval: weekly + groups: + action-deps: + patterns: + - '*' - package-ecosystem: github-actions directory: .github/actions/e2e/setup-cluster schedule: diff --git a/.github/workflows/e2e-private-cluster-trigger.yaml b/.github/workflows/e2e-private-cluster-trigger.yaml new file mode 100644 index 000000000000..84cc71e03fa6 --- /dev/null +++ b/.github/workflows/e2e-private-cluster-trigger.yaml @@ -0,0 +1,18 @@ +name: E2EPrivateClusterTrigger +on: + schedule: + - cron: '7 0 * * 4' +jobs: + private-cluster-trigger: + permissions: + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 + statuses: write # ./.github/actions/commit-status/start + uses: ./.github/workflows/e2e.yaml + with: + suite: Integration + region: us-east-1 + workflow_trigger: "private_cluster" + cleanup: true + codebuild_region: US_EAST_1 + secrets: + SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 98974eba1a1f..dc120754be4a 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -47,6 +47,8 @@ on: enable_metrics: type: boolean default: false + codebuild_region: + type: string workflow_call: inputs: git_ref: @@ -68,6 +70,8 @@ on: required: true workflow_trigger: type: string + codebuild_region: + type: string cluster_name: type: string description: If cluster_name is empty, a new cluster will be created. 
Otherwise, tests will run on an existing cluster @@ -130,7 +134,7 @@ jobs: k8s_version: ${{ inputs.k8s_version }} eksctl_version: v0.169.0 ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 - private_cluster: ${{ inputs.suite == 'PrivateCluster' }} + private_cluster: ${{ inputs.workflow_trigger == 'private_cluster' }} git_ref: ${{ inputs.git_ref }} ecr_account_id: ${{ vars.SNAPSHOT_ACCOUNT_ID }} ecr_region: ${{ vars.SNAPSHOT_REGION }} @@ -138,15 +142,33 @@ jobs: prometheus_region: ${{ vars.PROMETHEUS_REGION }} enable_local_zones: ${{ inputs.suite == 'LocalZone' }} cleanup: ${{ inputs.cleanup }} + codebuild_role: ${{ vars[format('{0}_CODEBUILD_ROLE', inputs.codebuild_region)] }} + - name: run tests for private cluster + if: ${{ inputs.workflow_trigger == 'private_cluster' }} + uses: ./.github/actions/e2e/run-tests-private-cluster + with: + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + suite: ${{ inputs.suite }} + prometheus_region: ${{ vars.PROMETHEUS_REGION }} + prometheus_workspace_id: ${{ vars.WORKSPACE_ID }} + region: ${{ inputs.region }} + account_id: ${{ vars.CI_ACCOUNT_ID }} + k8s_version: ${{ inputs.k8s_version }} + ecr_account_id: ${{ vars.SNAPSHOT_ACCOUNT_ID }} + ecr_region: ${{ vars.SNAPSHOT_REGION }} + private_cluster: ${{ inputs.workflow_trigger == 'private_cluster' }} + enable_metrics: ${{ inputs.enable_metrics }} + metrics_region: ${{ vars.TIMESTREAM_REGION }} + node_role: ${{ env.NODE_ROLE }} + cleanup: ${{ inputs.cleanup }} + codebuild_sg: ${{ vars[format('{0}_CODEBUILD_SG', inputs.codebuild_region)] }} + codebuild_vpc: ${{ vars[format('{0}_CODEBUILD_VPC', inputs.codebuild_region)] }} - name: run the ${{ inputs.suite }} test suite + if: ${{ inputs.workflow_trigger != 'private_cluster' }} env: SUITE: ${{ inputs.suite }} ENABLE_METRICS: ${{ inputs.enable_metrics }} run: | - # If we are performing the PrivateCluster test suite, then we should just run the 'Integration' test suite - if [[ $SUITE == 'PrivateCluster' ]]; then - SUITE="Integration" - fi aws eks update-kubeconfig --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} # Clean up the cluster before running all tests kubectl delete nodepool --all @@ -162,11 +184,12 @@ jobs: with: cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} url: ${{ inputs.workflow_trigger == 'soak' && secrets.SLACK_WEBHOOK_SOAK_URL || secrets.SLACK_WEBHOOK_URL }} - suite: ${{ inputs.workflow_trigger == 'soak' && 'soak' || inputs.suite }} + suite: ${{ inputs.suite }} git_ref: ${{ inputs.git_ref }} + workflow_trigger: ${{ inputs.workflow_trigger }} - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs - if: failure() || cancelled() + if: (failure() || cancelled()) && inputs.workflow_trigger != 'private_cluster' with: account_id: ${{ vars.CI_ACCOUNT_ID }} role: ${{ vars.CI_ROLE_NAME }} @@ -182,6 +205,7 @@ jobs: cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.git_ref }} eksctl_version: v0.169.0 + private_cluster: ${{ inputs.workflow_trigger == 'private_cluster' }} - if: always() && github.event_name == 'workflow_run' uses: ./.github/actions/commit-status/end with: diff --git a/test/hack/e2e_scripts/clean_private_cluster.sh b/test/hack/e2e_scripts/clean_private_cluster.sh new file mode 100755 index 000000000000..d749b66dad67 --- /dev/null +++ b/test/hack/e2e_scripts/clean_private_cluster.sh @@ -0,0 +1,35 @@ +#!/usr/bin/env bash + +# Delete instance profile +aws iam 
remove-role-from-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" --role-name "KarpenterNodeRole-${CLUSTER_NAME}" +aws iam delete-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" + +# Delete private registry policy for pull through cache +aws iam delete-role-policy --role-name "${NODE_ROLE}" --policy-name "PullThroughCachePolicy" + +# Delete cluster +eksctl delete cluster --name "${CLUSTER_NAME}" --force + +#Delete manually created VPC endpoints +endpoints=$(aws ec2 describe-vpc-endpoints --filters Name=vpc-id,Values="${CLUSTER_VPC_ID}" Name=tag:testing/cluster,Values="${CLUSTER_NAME}" --query "VpcEndpoints") +echo "$endpoints" | jq '.[].VpcEndpointId' -r | +while read -r endpointID; +do + aws ec2 delete-vpc-endpoints --vpc-endpoint-ids "$endpointID" + sleep 1 +done + +#Remove codebuild security group ingress from cluster security group +aws ec2 revoke-security-group-ingress --group-id "${EKS_CLUSTER_SG}" --protocol all --source-group "${SG_CB}" + +# Delete route table entry for cluster +subnet_config=$(aws ec2 describe-subnets --filters Name=vpc-id,Values="${VPC_CB}" Name=tag:aws-cdk:subnet-type,Values=Private --query "Subnets") +echo "$subnet_config" | jq '.[].SubnetId' -r | +while read -r subnet; +do + ROUTE_TABLE_ID=$((aws ec2 describe-route-tables --filters Name=vpc-id,Values="${VPC_CB}" Name=association.subnet-id,Values="$subnet" --query "RouteTables[0].RouteTableId") | jq -r) + aws ec2 delete-route --route-table-id "$ROUTE_TABLE_ID" --destination-cidr-block 192.168.0.0/16 +done + +# Delete VPC peering connection +aws ec2 delete-vpc-peering-connection --vpc-peering-connection-id "${VPC_PEERING_CONNECTION_ID}" \ No newline at end of file diff --git a/test/hack/e2e_scripts/configure_private_cluster.sh b/test/hack/e2e_scripts/configure_private_cluster.sh new file mode 100755 index 000000000000..a6babed1d227 --- /dev/null +++ b/test/hack/e2e_scripts/configure_private_cluster.sh @@ -0,0 +1,83 @@ +# Add the SQS and SSM VPC endpoints if we are creating a private cluster +# We need to grab all of the VPC details for the cluster in order to add the endpoint +# Add inbound rules for codeBuild security group, create temporary access entry +VPC_CONFIG=$(aws eks describe-cluster --name "$CLUSTER_NAME" --query "cluster.resourcesVpcConfig") +VPC_ID=$(echo "$VPC_CONFIG" | jq .vpcId -r) +echo CLUSTER_VPC_ID="$VPC_ID" >> "$GITHUB_ENV" +SUBNET_IDS=($(echo "$VPC_CONFIG" | jq '.subnetIds | join(" ")' -r)) +SHARED_NODE_SG=$((aws ec2 describe-security-groups --filters Name=tag:aws:cloudformation:stack-name,Values=eksctl-"$CLUSTER_NAME"-cluster Name=tag:aws:cloudformation:logical-id,Values=ClusterSharedNodeSecurityGroup --query "SecurityGroups[0]") | jq .GroupId -r) +eks_cluster_sg=$((aws ec2 describe-security-groups --filters Name=tag:aws:eks:cluster-name,Values="$CLUSTER_NAME" --query "SecurityGroups[0]") | jq .GroupId -r) +echo EKS_CLUSTER_SG="$eks_cluster_sg" >> "$GITHUB_ENV" + +for SERVICE in "com.amazonaws.$REGION.ssm" "com.amazonaws.$REGION.eks" "com.amazonaws.$REGION.sqs"; do + aws ec2 create-vpc-endpoint \ + --vpc-id "${VPC_ID}" \ + --vpc-endpoint-type Interface \ + --service-name "${SERVICE}" \ + --subnet-ids "${SUBNET_IDS[@]}" \ + --security-group-ids "${eks_cluster_sg}" \ + --tag-specifications 
"ResourceType=vpc-endpoint,Tags=[{Key=testing/type,Value=e2e},{Key=testing/cluster,Value=$CLUSTER_NAME},{Key=github.com/run-url,Value=https://github.com/$REPOSITORY/actions/runs/$RUN_ID},{Key=karpenter.sh/discovery,Value=$CLUSTER_NAME}]" +done + +# VPC peering request from codebuild +aws ec2 create-vpc-peering-connection --vpc-id "${CODEBUILD_VPC}" --peer-vpc-id "${VPC_ID}" --tag-specifications "ResourceType=vpc-peering-connection,Tags=[{Key=testing/type,Value=e2e},{Key=testing/cluster,Value=$CLUSTER_NAME},{Key=github.com/run-url,Value=https://github.com/$REPOSITORY/actions/runs/$RUN_ID},{Key=karpenter.sh/discovery,Value=$CLUSTER_NAME}]" +vpc_peering_connection_id=$((aws ec2 describe-vpc-peering-connections --filters Name=accepter-vpc-info.vpc-id,Values="${VPC_ID}" --query "VpcPeeringConnections[0]") | jq .VpcPeeringConnectionId -r) +aws ec2 accept-vpc-peering-connection --vpc-peering-connection-id "${vpc_peering_connection_id}" +echo VPC_PEERING_CONNECTION_ID="$vpc_peering_connection_id" >> "$GITHUB_ENV" + +# Modify route table for codebuild vpc +subnet_config=$(aws ec2 describe-subnets --filters Name=vpc-id,Values="${CODEBUILD_VPC}" Name=tag:aws-cdk:subnet-type,Values=Private --query "Subnets") +echo "$subnet_config" | jq '.[].SubnetId' -r | +while read -r subnet; +do + ROUTE_TABLE_ID=$((aws ec2 describe-route-tables --filters Name=vpc-id,Values="${CODEBUILD_VPC}" Name=association.subnet-id,Values="$subnet" --query "RouteTables[0].RouteTableId") | jq -r) + aws ec2 create-route --route-table-id "$ROUTE_TABLE_ID" --destination-cidr-block 192.168.0.0/16 --vpc-peering-connection-id "$vpc_peering_connection_id" +done + + +# Modify route table for cluster vpc +CLUSTER_ROUTE_TABLE=$(aws ec2 describe-route-tables --filters Name=vpc-id,Values="${VPC_ID}" Name=association.main,Values=false --query "RouteTables") +echo "$CLUSTER_ROUTE_TABLE" | jq '.[].RouteTableId' -r | +while read -r routeTableId; +do + aws ec2 create-route --route-table-id $routeTableId --destination-cidr-block 10.0.0.0/16 --vpc-peering-connection-id "$vpc_peering_connection_id" +done + +aws ec2 authorize-security-group-ingress --group-id "${SHARED_NODE_SG}" --protocol all --source-group "${CODEBUILD_SG}" +aws ec2 authorize-security-group-ingress --group-id "${eks_cluster_sg}" --protocol all --source-group "${CODEBUILD_SG}" + +# There is currently no VPC private endpoint for the IAM API. Therefore, we need to +# provision and manage an instance profile manually. 
+aws iam create-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" --tags Key=testing/cluster,Value="$CLUSTER_NAME" +aws iam add-role-to-instance-profile --instance-profile-name "KarpenterNodeInstanceProfile-${CLUSTER_NAME}" --role-name "KarpenterNodeRole-${CLUSTER_NAME}" + +#Create private registry policy for pull through cache +MANAGED_NG=$(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query nodegroups --output text) +node_role=$(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" --nodegroup-name "${MANAGED_NG}" --query nodegroup.nodeRole --output text | cut -d '/' -f 2) +echo NODE_ROLE="$node_role" >> "$GITHUB_ENV" +cat <> policy.json +{ + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "PullThroughCache", + "Effect": "Allow", + "Action": [ + "ecr:BatchImportUpstreamImage", + "ecr:CreateRepository" + ], + "Resource": [ + "arn:aws:ecr:$REGION:$ACCOUNT_ID:repository/ecr-public/*", + "arn:aws:ecr:$REGION:$ACCOUNT_ID:repository/k8s/*", + "arn:aws:ecr:$REGION:$ACCOUNT_ID:repository/quay/*" + ] + } + ] +} +EOF +aws iam put-role-policy --role-name "${node_role}" --policy-name "PullThroughCachePolicy" --policy-document file://policy.json + +# Use pull through cache to pull images that are needed for the tests to run as it requires a route to the internet for the first time +docker pull "$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/k8s/pause:3.6" +docker pull "$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/ecr-public/eks-distro/kubernetes/pause:3.2" +docker pull "$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/ecr-public/docker/library/alpine:latest" \ No newline at end of file diff --git a/test/hack/e2e_scripts/diff_karpenter.sh b/test/hack/e2e_scripts/diff_karpenter.sh new file mode 100755 index 000000000000..42d6e5274a0f --- /dev/null +++ b/test/hack/e2e_scripts/diff_karpenter.sh @@ -0,0 +1,9 @@ +CHART="oci://$ECR_ACCOUNT_ID.dkr.ecr.$ECR_REGION.amazonaws.com/karpenter/snapshot/karpenter" +if (( "$PRIVATE_CLUSTER" == 'true' )); then + CHART="oci://$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/karpenter/snapshot/karpenter" +fi + +helm diff upgrade --namespace kube-system \ +karpenter "${CHART}" \ +--version 0-$(git rev-parse HEAD) \ +--reuse-values --three-way-merge --detailed-exitcode diff --git a/test/hack/e2e_scripts/install_helm.sh b/test/hack/e2e_scripts/install_helm.sh new file mode 100755 index 000000000000..afa71b811479 --- /dev/null +++ b/test/hack/e2e_scripts/install_helm.sh @@ -0,0 +1,4 @@ +TEMPDIR=$(mktemp -d) +curl -fsSL -o "${TEMPDIR}/get_helm.sh" https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 "${TEMPDIR}/get_helm.sh" +"${TEMPDIR}/get_helm.sh" --version "$HELM_VERSION" diff --git a/test/hack/e2e_scripts/install_karpenter.sh b/test/hack/e2e_scripts/install_karpenter.sh new file mode 100755 index 000000000000..dbe1aba20d71 --- /dev/null +++ b/test/hack/e2e_scripts/install_karpenter.sh @@ -0,0 +1,43 @@ +aws eks update-kubeconfig --name "$CLUSTER_NAME" + +# Parse minor version to determine whether to enable the webhooks +K8S_VERSION_MINOR="${K8S_VERSION#*.}" +WEBHOOK_ENABLED=false +if (( K8S_VERSION_MINOR < 25 )); then + WEBHOOK_ENABLED=true +fi + +CHART="oci://$ECR_ACCOUNT_ID.dkr.ecr.$ECR_REGION.amazonaws.com/karpenter/snapshot/karpenter" +ADDITIONAL_FLAGS="" +if (( "$PRIVATE_CLUSTER" == 'true' )); then + CHART="oci://$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/karpenter/snapshot/karpenter" + ADDITIONAL_FLAGS="--set 
.Values.controller.image.repository=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/karpenter/snapshot/controller --set .Values.controller.image.digest=\"\"" +fi + +# Remove service account annotation when dropping support for 1.23 +helm upgrade --install karpenter "${CHART}" \ + -n kube-system \ + --version "0-$(git rev-parse HEAD)" \ + --set logLevel=debug \ + --set webhook.enabled=${WEBHOOK_ENABLED} \ + --set settings.isolatedVPC=${PRIVATE_CLUSTER} \ + --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::$ACCOUNT_ID:role/karpenter-irsa-$CLUSTER_NAME" \ + $ADDITIONAL_FLAGS \ + --set settings.clusterName="$CLUSTER_NAME" \ + --set settings.interruptionQueue="$CLUSTER_NAME" \ + --set settings.featureGates.spotToSpotConsolidation=true \ + --set controller.resources.requests.cpu=3 \ + --set controller.resources.requests.memory=3Gi \ + --set controller.resources.limits.cpu=3 \ + --set controller.resources.limits.memory=3Gi \ + --set serviceMonitor.enabled=true \ + --set serviceMonitor.additionalLabels.scrape=enabled \ + --set "serviceMonitor.endpointConfig.relabelings[0].targetLabel=clusterName" \ + --set "serviceMonitor.endpointConfig.relabelings[0].replacement=$CLUSTER_NAME" \ + --set "serviceMonitor.endpointConfig.relabelings[1].targetLabel=gitRef" \ + --set "serviceMonitor.endpointConfig.relabelings[1].replacement=$(git rev-parse HEAD)" \ + --set "serviceMonitor.endpointConfig.relabelings[2].targetLabel=mostRecentTag" \ + --set "serviceMonitor.endpointConfig.relabelings[2].replacement=$(git describe --abbrev=0 --tags)" \ + --set "serviceMonitor.endpointConfig.relabelings[3].targetLabel=commitsAfterTag" \ + --set "serviceMonitor.endpointConfig.relabelings[3].replacement=\"$(git describe --tags | cut -d '-' -f 2)\"" \ + --wait diff --git a/test/hack/e2e_scripts/install_prometheus.sh b/test/hack/e2e_scripts/install_prometheus.sh new file mode 100755 index 000000000000..d05e463dd223 --- /dev/null +++ b/test/hack/e2e_scripts/install_prometheus.sh @@ -0,0 +1,31 @@ +# Remove service account annotation when dropping support for 1.23 + +CHART=prometheus-community/kube-prometheus-stack +VALUES=./.github/actions/e2e/install-prometheus/values.yaml +ENABLED=true +ADDITIONAL_FLAGS="" +if (( "$PRIVATE_CLUSTER" == 'true' )); then + CHART=$(find . 
-name kube-prometheus-stack*) + ENABLED=false + ADDITIONAL_FLAGS="--set prometheusOperator.admissionWebhooks.patch.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/k8s --set prometheusOperator.prometheusConfigReloader.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/quay --set prometheusOperator.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/quay --set kube-state-metrics.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/k8s --set alertmanager.alertmanagerSpec.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/quay --set prometheus.prometheusSpec.image.registry=$ACCOUNT_ID.dkr.ecr.$REGION.amazonaws.com/quay --set grafana.enabled=$ENABLED" +fi + +helm upgrade --install prometheus "${CHART}" \ +-n prometheus \ +-f ${VALUES} \ +--set prometheus.prometheusSpec.remoteWrite[0].url=https://aps-workspaces.$PROMETHEUS_REGION.amazonaws.com/workspaces/$WORKSPACE_ID/api/v1/remote_write \ +--set prometheus.prometheusSpec.remoteWrite[0].sigv4.region=$PROMETHEUS_REGION \ +--set prometheus.serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::$ACCOUNT_ID:role/prometheus-irsa-$CLUSTER_NAME" \ +$ADDITIONAL_FLAGS \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[0].targetLabel=metrics_path" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[0].action=replace" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[0].sourceLabels[0]=__metrics_path__" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[1].targetLabel=clusterName" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[1].replacement=$CLUSTER_NAME" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[2].targetLabel=gitRef" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[2].replacement=$(git rev-parse HEAD)" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[3].targetLabel=mostRecentTag" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[3].replacement=$(git describe --abbrev=0 --tags)" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[4].targetLabel=commitsAfterTag" \ +--set "kubelet.serviceMonitor.cAdvisorRelabelings[4].replacement=\"$(git describe --tags | cut -d '-' -f 2)\"" \ +--wait diff --git a/test/hack/e2e_scripts/noderole_bootstrap_permission.sh b/test/hack/e2e_scripts/noderole_bootstrap_permission.sh new file mode 100755 index 000000000000..1fa0d9de5be9 --- /dev/null +++ b/test/hack/e2e_scripts/noderole_bootstrap_permission.sh @@ -0,0 +1,7 @@ +eksctl create iamidentitymapping \ +--username system:node:{{EC2PrivateDNSName}} \ +--cluster "$CLUSTER_NAME" \ +--arn "arn:aws:iam::$ACCOUNT_ID:role/KarpenterNodeRole-$CLUSTER_NAME" \ +--group system:bootstrappers \ +--group system:nodes \ +--group eks:kube-proxy-windows diff --git a/test/hack/resource/clean/main.go b/test/hack/resource/clean/main.go index fdd028020d1c..fc44495a1984 100644 --- a/test/hack/resource/clean/main.go +++ b/test/hack/resource/clean/main.go @@ -75,6 +75,7 @@ func main() { resourcetypes.NewOIDC(iamClient), resourcetypes.NewInstanceProfile(iamClient), resourcetypes.NewStack(cloudFormationClient), + resourcetypes.NewVPCPeeringConnection(ec2Client), } for i := range resourceTypes { diff --git a/test/hack/resource/count/main.go b/test/hack/resource/count/main.go index 2696fcc5e76b..e80c271759f2 100644 --- a/test/hack/resource/count/main.go +++ b/test/hack/resource/count/main.go @@ -50,6 +50,7 @@ func main() { resourcetypes.NewOIDC(iamClient), resourcetypes.NewInstanceProfile(iamClient), resourcetypes.NewStack(cloudFormationClient), + resourcetypes.NewVPCPeeringConnection(ec2Client), } for i := range 
resourceTypes { diff --git a/test/hack/resource/pkg/resourcetypes/vpc_peering_connection.go b/test/hack/resource/pkg/resourcetypes/vpc_peering_connection.go new file mode 100644 index 000000000000..b7c9648c609c --- /dev/null +++ b/test/hack/resource/pkg/resourcetypes/vpc_peering_connection.go @@ -0,0 +1,133 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcetypes + +import ( + "context" + "time" + + "github.com/aws/aws-sdk-go-v2/service/ec2" + ec2types "github.com/aws/aws-sdk-go-v2/service/ec2/types" + "github.com/samber/lo" + "golang.org/x/exp/slices" +) + +type VPCPeeringConnection struct { + ec2Client *ec2.Client +} + +func NewVPCPeeringConnection(ec2Client *ec2.Client) *VPCPeeringConnection { + return &VPCPeeringConnection{ec2Client: ec2Client} +} + +func (v *VPCPeeringConnection) String() string { + return "VPCPeeringConnection" +} + +func (v *VPCPeeringConnection) Global() bool { + return false +} + +func (v *VPCPeeringConnection) Get(ctx context.Context, clusterName string) (ids []string, err error) { + var nextToken *string + for { + out, err := v.ec2Client.DescribeVpcPeeringConnections(ctx, &ec2.DescribeVpcPeeringConnectionsInput{ + Filters: []ec2types.Filter{ + { + Name: lo.ToPtr("tag:" + karpenterTestingTag), + Values: []string{clusterName}, + }, + }, + NextToken: nextToken, + }) + if err != nil { + return ids, err + } + for _, connection := range out.VpcPeeringConnections { + ids = append(ids, lo.FromPtr(connection.VpcPeeringConnectionId)) + } + nextToken = out.NextToken + if nextToken == nil { + break + } + } + return ids, err +} + +func (v *VPCPeeringConnection) CountAll(ctx context.Context) (count int, err error) { + var nextToken *string + for { + out, err := v.ec2Client.DescribeVpcPeeringConnections(ctx, &ec2.DescribeVpcPeeringConnectionsInput{ + NextToken: nextToken, + }) + if err != nil { + return count, err + } + + count += len(out.VpcPeeringConnections) + + nextToken = out.NextToken + if nextToken == nil { + break + } + } + return count, err +} + +func (v *VPCPeeringConnection) GetExpired(ctx context.Context, expirationTime time.Time, excludedClusters []string) (ids []string, err error) { + var nextToken *string + for { + out, err := v.ec2Client.DescribeVpcPeeringConnections(ctx, &ec2.DescribeVpcPeeringConnectionsInput{ + Filters: []ec2types.Filter{ + { + Name: lo.ToPtr("tag-key"), + Values: []string{karpenterTestingTag}, + }, + }, + NextToken: nextToken, + }) + if err != nil { + return ids, err + } + for _, connection := range out.VpcPeeringConnections { + clusterName, found := lo.Find(connection.Tags, func(tag ec2types.Tag) bool { + return *tag.Key == k8sClusterTag + }) + if found && slices.Contains(excludedClusters, lo.FromPtr(clusterName.Value)) { + continue + } + if connection.ExpirationTime.Before(expirationTime) { + ids = append(ids, lo.FromPtr(connection.VpcPeeringConnectionId)) + } + } + nextToken = out.NextToken + if nextToken == nil { + break + } + } + return ids, err +} + +// Cleanup any old VPC peering connections that were provisioned as part of testing 
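// A minimal usage sketch (assumption: the shared cleaner in test/hack/resource/clean drives each
// resource type the same way, and the 12-hour expiration window here is purely illustrative):
//
//	ids, err := v.GetExpired(ctx, time.Now().Add(-12*time.Hour), nil)
//	if err == nil {
//		cleaned, _ := v.Cleanup(ctx, ids)
//		log.Printf("deleted %d VPC peering connections", len(cleaned))
//	}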
+func (v *VPCPeeringConnection) Cleanup(ctx context.Context, ids []string) ([]string, error) { + for _, id := range ids { + if _, err := v.ec2Client.DeleteVpcPeeringConnection(ctx, &ec2.DeleteVpcPeeringConnectionInput{ + VpcPeeringConnectionId: lo.ToPtr(id), + }); err != nil { + return nil, err + } + } + return ids, nil +} diff --git a/test/pkg/environment/aws/environment.go b/test/pkg/environment/aws/environment.go index a64a7fde39c1..1b1dc4476ab5 100644 --- a/test/pkg/environment/aws/environment.go +++ b/test/pkg/environment/aws/environment.go @@ -19,6 +19,8 @@ import ( "os" "testing" + coretest "sigs.k8s.io/karpenter/pkg/test" + "github.com/aws/aws-sdk-go/aws" "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/endpoints" @@ -53,7 +55,9 @@ func init() { corev1beta1.NormalizedLabels = lo.Assign(corev1beta1.NormalizedLabels, map[string]string{"topology.ebs.csi.aws.com/zone": corev1.LabelTopologyZone}) } -const WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9" +var WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9" + +var EphemeralInitContainerImage = "alpine" // ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility // issues with versions of the VPR Resource Controller. @@ -77,6 +81,7 @@ type Environment struct { ClusterName string ClusterEndpoint string InterruptionQueue string + PrivateCluster bool } func NewEnvironment(t *testing.T) *Environment { @@ -106,6 +111,12 @@ func NewEnvironment(t *testing.T) *Environment { ClusterName: lo.Must(os.LookupEnv("CLUSTER_NAME")), ClusterEndpoint: lo.Must(os.LookupEnv("CLUSTER_ENDPOINT")), } + + if _, awsEnv.PrivateCluster = os.LookupEnv("PRIVATE_CLUSTER"); awsEnv.PrivateCluster { + WindowsDefaultImage = fmt.Sprintf("857221689048.dkr.ecr.%s.amazonaws.com/k8s/pause:3.6", awsEnv.Region) + EphemeralInitContainerImage = fmt.Sprintf("857221689048.dkr.ecr.%s.amazonaws.com/ecr-public/docker/library/alpine:latest", awsEnv.Region) + coretest.DefaultImage = fmt.Sprintf("857221689048.dkr.ecr.%s.amazonaws.com/ecr-public/eks-distro/kubernetes/pause:3.2", awsEnv.Region) + } // Initialize the provider only if the INTERRUPTION_QUEUE environment variable is defined if v, ok := os.LookupEnv("INTERRUPTION_QUEUE"); ok { awsEnv.SQSProvider = lo.Must(sqs.NewProvider(env.Context, servicesqs.New(session), v)) @@ -137,6 +148,11 @@ func (env *Environment) DefaultEC2NodeClass() *v1beta1.EC2NodeClass { Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, }, } + if env.PrivateCluster { + nodeClass.Spec.Role = "" + nodeClass.Spec.InstanceProfile = lo.ToPtr(fmt.Sprintf("KarpenterNodeInstanceProfile-%s", env.ClusterName)) + return nodeClass + } nodeClass.Spec.Role = fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName) return nodeClass } diff --git a/test/suites/integration/extended_resources_test.go b/test/suites/integration/extended_resources_test.go index 4491d77df62f..49c2e8d3c7ae 100644 --- a/test/suites/integration/extended_resources_test.go +++ b/test/suites/integration/extended_resources_test.go @@ -38,6 +38,11 @@ import ( ) var _ = Describe("Extended Resources", func() { + BeforeEach(func() { + if env.PrivateCluster { + Skip("skipping Extended Resources test for private cluster") + } + }) It("should provision nodes for a deployment that requests nvidia.com/gpu", func() { ExpectNvidiaDevicePluginCreated() // TODO: jmdeal@ remove AL2 pin once AL2023 accelerated AMIs are available diff --git a/test/suites/integration/instance_profile_test.go 
b/test/suites/integration/instance_profile_test.go index 1dd44f424080..4c2e7b860a89 100644 --- a/test/suites/integration/instance_profile_test.go +++ b/test/suites/integration/instance_profile_test.go @@ -30,6 +30,11 @@ import ( ) var _ = Describe("InstanceProfile Generation", func() { + BeforeEach(func() { + if env.PrivateCluster { + Skip("skipping InstanceProfile Generation test for private cluster") + } + }) It("should generate the InstanceProfile when setting the role", func() { pod := coretest.Pod() env.ExpectCreated(nodePool, nodeClass, pod) diff --git a/test/suites/integration/scheduling_test.go b/test/suites/integration/scheduling_test.go index d8b4d2cea177..9c9dba0f6a26 100644 --- a/test/suites/integration/scheduling_test.go +++ b/test/suites/integration/scheduling_test.go @@ -550,7 +550,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { func ephemeralInitContainer(requirements v1.ResourceRequirements) v1.Container { return v1.Container{ - Image: "alpine", + Image: aws.EphemeralInitContainerImage, Command: []string{"/bin/sh"}, Args: []string{"-c", "sleep 5"}, Resources: requirements, diff --git a/test/suites/integration/tags_test.go b/test/suites/integration/tags_test.go index b7302e8ebc4c..ab56b5abd312 100644 --- a/test/suites/integration/tags_test.go +++ b/test/suites/integration/tags_test.go @@ -15,6 +15,7 @@ limitations under the License. package integration_test import ( + "fmt" "time" "github.com/aws/aws-sdk-go/service/ec2" @@ -108,6 +109,10 @@ var _ = Describe("Tags", func() { nodeClass = test.EC2NodeClass(*nodeClass, v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{ Tags: map[string]string{"Name": "custom-name", "testing/cluster": env.ClusterName}, }}) + if env.PrivateCluster { + nodeClass.Spec.Role = "" + nodeClass.Spec.InstanceProfile = lo.ToPtr(fmt.Sprintf("KarpenterNodeInstanceProfile-%s", env.ClusterName)) + } nodePool = coretest.NodePool(*nodePool, corev1beta1.NodePool{ Spec: corev1beta1.NodePoolSpec{ Template: corev1beta1.NodeClaimTemplate{