From af57df44f0d7db668ee5a6435daa96bf278471a8 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 08:54:19 -0700 Subject: [PATCH 01/47] docs: Fix region being undefined while applying beta policy (#4865) --- designs/integration-testing.md | 2 +- .../en/preview/upgrading/upgrade-guide.md | 13 ++--- .../upgrading/v1beta1-controller-policy.json | 54 +++++++++---------- 3 files changed, 35 insertions(+), 34 deletions(-) diff --git a/designs/integration-testing.md b/designs/integration-testing.md index e151aae94fea..4621ec98f1e2 100644 --- a/designs/integration-testing.md +++ b/designs/integration-testing.md @@ -39,4 +39,4 @@ __(To be implemented)__ Contributing to the list of test suites in the testing f __(To be implemented)__ Periodic testing will be an important part of Karpenter’s testing history. Results and history will be visualized as a testgrid (https://testgrid.k8s.io/) where users can look at metrics and logs for each set of test runs. -__(To be implemented)__ Upgrade instructions between releases as detailed in the Upgrade Guide (https://karpenter.sh/preview/upgrade-guide/#how-do-we-break-incompatibility) will be tested as well. Additional tests will be included in the PR to create the release. As a result, releases will go through the same process as normal commits, and will ensure that upgrade instructions that introduce breaking changes are tested. +__(To be implemented)__ Upgrade instructions between releases as detailed in the Upgrade Guide (https://karpenter.sh/preview/upgrading/upgrade-guide/#how-do-we-break-incompatibility) will be tested as well. Additional tests will be included in the PR to create the release. As a result, releases will go through the same process as normal commits, and will ensure that upgrade instructions that introduce breaking changes are tested. diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index dcb24ee1f0e2..d6a39d2ae1d2 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -63,22 +63,22 @@ This procedure assumes you are running the Karpenter controller on cluster and w To upgrade your provisioner and AWSNodeTemplate YAML files to be compatible with v1beta1, you can either update them manually or use the [karpenter-convert](https://github.com/aws/karpenter/tree/main/tools/karpenter-convert) CLI tool. To install that tool: -``` +```bash go install github.com/aws/karpenter/tools/karpenter-convert/cmd/karpenter-convert@latest ``` Add `~/go/bin` to your $PATH, if you have not already done so. 1. Determine the current cluster version: Run the following to make sure that your Karpenter version is v0.31.x: - ``` + ```bash kubectl get pod -A | grep karpenter kubectl describe pod -n karpenter karpenter-xxxxxxxxxx-xxxxx | grep Image: | grep v0..... ``` Sample output: - ``` + ```bash Image: public.ecr.aws/karpenter/controller:v0.31.0@sha256:d29767fa9c5c0511a3812397c932f5735234f03a7a875575422b712d15e54a77 ``` - {{% alert title="Note" color="primary" %}} + {{% alert title="Warning" color="primary" %}} v0.31.2 introduces minor changes to Karpenter so that rollback from v0.32.0 is supported. If you are coming from some other patch version of minor version v0.31.x, note that v0.31.2 is the _only_ patch version that supports rollback. {{% /alert %}} @@ -100,8 +100,9 @@ Add `~/go/bin` to your $PATH, if you have not already done so. 
```bash TEMPOUT=$(mktemp) - curl -fsSL https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}website/content/en/preview/upgrade/v1beta1-controller-policy.json > ${TEMPOUT} + curl -fsSL https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}website/content/en/preview/upgrading/v1beta1-controller-policy.json > ${TEMPOUT} + REGION=${AWS_REGION:=$AWS_DEFAULT_REGION} POLICY_DOCUMENT=$(envsubst < ${TEMPOUT}) POLICY_NAME="KarpenterControllerPolicy-${CLUSTER_NAME}-v1beta1" ROLE_NAME="${CLUSTER_NAME}-karpenter" @@ -113,7 +114,7 @@ Add `~/go/bin` to your $PATH, if you have not already done so. 5. Apply the v0.32.0 Custom Resource Definitions (CRDs) in the crds directory of the Karpenter helm chart. Here are the ways you can do this: * As an independent helm chart [karpenter-crd](https://gallery.ecr.aws/karpenter/karpenter-crd) - [source](https://github.com/aws/karpenter/blob/main/charts/karpenter-crd) that can be used by Helm to manage the lifecycle of these CRDs. To upgrade or install `karpenter-crd` run: - ``` + ```bash helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version vx.y.z --namespace karpenter --create-namespace ``` diff --git a/website/content/en/preview/upgrading/v1beta1-controller-policy.json b/website/content/en/preview/upgrading/v1beta1-controller-policy.json index e6923be897d2..efe818b0708e 100644 --- a/website/content/en/preview/upgrading/v1beta1-controller-policy.json +++ b/website/content/en/preview/upgrading/v1beta1-controller-policy.json @@ -5,12 +5,12 @@ "Sid": "AllowScopedEC2InstanceActions", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}::image/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}::snapshot/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:spot-instances-request/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:security-group/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:subnet/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${REGION}::image/*", + "arn:${AWS_PARTITION}:ec2:${REGION}::snapshot/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:spot-instances-request/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:security-group/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:subnet/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" ], "Action": [ "ec2:RunInstances", @@ -21,11 +21,11 @@ "Sid": "AllowScopedEC2InstanceActionsWithTags", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:fleet/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:volume/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:network-interface/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${REGION}:*:fleet/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:volume/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:network-interface/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" ], "Action": [ "ec2:RunInstances", @@ -45,11 +45,11 @@ "Sid": "AllowScopedResourceCreationTagging", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:fleet/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:volume/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:network-interface/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${REGION}:*:fleet/*", + 
"arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:volume/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:network-interface/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" ], "Action": "ec2:CreateTags", "Condition": { @@ -69,7 +69,7 @@ { "Sid": "AllowScopedResourceTagging", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", + "Resource": "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", "Action": "ec2:CreateTags", "Condition": { "StringEquals": { @@ -90,8 +90,8 @@ "Sid": "AllowScopedDeletion", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" ], "Action": [ "ec2:TerminateInstances", @@ -123,14 +123,14 @@ ], "Condition": { "StringEquals": { - "aws:RequestedRegion": "${AWS_REGION}" + "aws:RequestedRegion": "${REGION}" } } }, { "Sid": "AllowSSMReadActions", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:ssm:${AWS_REGION}::parameter/aws/service/*", + "Resource": "arn:${AWS_PARTITION}:ssm:${REGION}::parameter/aws/service/*", "Action": "ssm:GetParameter" }, { @@ -142,7 +142,7 @@ { "Sid": "AllowInterruptionQueueActions", "Effect": "Allow", - "Resource": "arn:aws:sqs:${AWS_REGION}:${AWS_ACCOUNT_ID}:${CLUSTER_NAME}", + "Resource": "arn:aws:sqs:${REGION}:${AWS_ACCOUNT_ID}:${CLUSTER_NAME}", "Action": [ "sqs:DeleteMessage", "sqs:GetQueueAttributes", @@ -169,7 +169,7 @@ "Condition": { "StringEquals": { "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" + "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" } } }, @@ -181,9 +181,9 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", + "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}", "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" + "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" } } }, @@ -199,7 +199,7 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" + "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}" } } }, @@ -212,7 +212,7 @@ { "Sid": "AllowAPIServerEndpointDiscovery", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", + "Resource": "arn:${AWS_PARTITION}:eks:${REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", "Action": "eks:DescribeCluster" } ] From 2012cf98c2e2e9625e858842c9f2d177efb0c364 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 09:31:24 -0700 Subject: [PATCH 02/47] chore: Passthrough new helm values as environment variables (#4840) --- .../actions/e2e/install-karpenter/action.yaml | 4 +- Makefile | 26 ++++--- charts/karpenter/README.md | 23 +++++-- charts/karpenter/README.md.gotmpl | 7 +- charts/karpenter/templates/_helpers.tpl | 27 -------- charts/karpenter/templates/configmap.yaml | 50 +++++++++++++- charts/karpenter/templates/deployment.yaml | 46 +++++++++++++ charts/karpenter/values.yaml | 36 ++++++++-- go.mod | 2 +- hack/docs/instancetypes_gen_docs.go | 3 + pkg/apis/settings/settings.go | 2 +- 
test/pkg/debug/monitor.go | 3 + test/pkg/environment/aws/setup.go | 7 +- test/pkg/environment/common/expectations.go | 69 +++++++++++++++++-- test/suites/drift/suite_test.go | 10 ++- test/suites/expiration/expiration_test.go | 5 +- test/suites/integration/cni_test.go | 14 ++-- .../integration/extended_resources_test.go | 4 +- test/suites/integration/tags_test.go | 4 +- .../suites/machine/garbage_collection_test.go | 6 +- test/suites/scale/deprovisioning_test.go | 5 +- .../scripts/step08-apply-helm-chart.sh | 2 +- .../content/en/preview/reference/settings.md | 9 +++ .../en/preview/upgrading/upgrade-guide.md | 6 +- 24 files changed, 273 insertions(+), 97 deletions(-) diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml index 79c00642b5c6..74179c812297 100644 --- a/.github/actions/e2e/install-karpenter/action.yaml +++ b/.github/actions/e2e/install-karpenter/action.yaml @@ -55,9 +55,9 @@ runs: -n karpenter \ --version "v0-$(git rev-parse HEAD)" \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"="arn:aws:iam::${{ inputs.account_id }}:role/karpenter-irsa-${{ inputs.cluster_name }}" \ - --set settings.aws.clusterName="${{ inputs.cluster_name }}" \ + --set settings.clusterName="${{ inputs.cluster_name }}" \ --set settings.aws.defaultInstanceProfile="KarpenterNodeInstanceProfile-${{ inputs.cluster_name }}" \ - --set settings.aws.interruptionQueueName="${{ inputs.cluster_name }}" \ + --set settings.interruptionQueue="${{ inputs.cluster_name }}" \ --set controller.resources.requests.cpu=3 \ --set controller.resources.requests.memory=3Gi \ --set controller.resources.limits.cpu=3 \ diff --git a/Makefile b/Makefile index 260a1a70f974..54f6cac11e08 100644 --- a/Makefile +++ b/Makefile @@ -12,11 +12,11 @@ CLUSTER_ENDPOINT ?= $(shell kubectl config view --minify -o jsonpath='{.clusters AWS_ACCOUNT_ID ?= $(shell aws sts get-caller-identity --query Account --output text) KARPENTER_IAM_ROLE_ARN ?= arn:aws:iam::${AWS_ACCOUNT_ID}:role/${CLUSTER_NAME}-karpenter HELM_OPTS ?= --set serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn=${KARPENTER_IAM_ROLE_ARN} \ - --set settings.aws.clusterName=${CLUSTER_NAME} \ - --set settings.aws.clusterEndpoint=${CLUSTER_ENDPOINT} \ + --set settings.clusterName=${CLUSTER_NAME} \ + --set settings.clusterEndpoint=${CLUSTER_ENDPOINT} \ --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ - --set settings.featureGates.driftEnabled=true \ + --set settings.interruptionQueue=${CLUSTER_NAME} \ + --set settings.featureGates.drift=true \ --set controller.resources.requests.cpu=1 \ --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ @@ -48,15 +48,18 @@ ci-non-test: verify licenses vulncheck ## Runs checks other than tests run: ## Run Karpenter controller binary against your local cluster kubectl create configmap -n ${SYSTEM_NAMESPACE} karpenter-global-settings \ - --from-literal=aws.clusterName=${CLUSTER_NAME} \ - --from-literal=aws.clusterEndpoint=${CLUSTER_ENDPOINT} \ --from-literal=aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --from-literal=aws.interruptionQueueName=${CLUSTER_NAME} \ - --from-literal=featureGates.driftEnabled=true \ --dry-run=client -o yaml | kubectl apply -f - - SYSTEM_NAMESPACE=${SYSTEM_NAMESPACE} KUBERNETES_MIN_VERSION="1.19.0-0" LEADER_ELECT=false DISABLE_WEBHOOK=true \ + SYSTEM_NAMESPACE=${SYSTEM_NAMESPACE} \ + 
KUBERNETES_MIN_VERSION="1.19.0-0" \ + LEADER_ELECT=false \ + DISABLE_WEBHOOK=true \ + CLUSTER_NAME=${CLUSTER_NAME} \ + CLUSTER_ENDPOINT=${CLUSTER_ENDPOINT} \ + INTERRUPTION_QUEUE=${CLUSTER_NAME} \ + FEATURE_GATES="Drift=true" \ go run ./cmd/controller/main.go clean-run: ## Clean resources deployed by the run target @@ -76,7 +79,10 @@ battletest: ## Run randomized, racing, code-covered tests -tags random_test_delay e2etests: ## Run the e2e suite against your local cluster - cd test && CLUSTER_NAME=${CLUSTER_NAME} go test \ + cd test && CLUSTER_ENDPOINT=${CLUSTER_ENDPOINT} \ + CLUSTER_NAME=${CLUSTER_NAME} \ + INTERRUPTION_QUEUE=${CLUSTER_NAME} \ + go test \ -p 1 \ -count 1 \ -timeout ${TEST_TIMEOUT} \ diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md index 9b4eee5cb7b0..dd65466fd39e 100644 --- a/charts/karpenter/README.md +++ b/charts/karpenter/README.md @@ -17,10 +17,9 @@ helm upgrade --install --namespace karpenter --create-namespace \ karpenter oci://public.ecr.aws/karpenter/karpenter \ --version v0.31.0 \ --set serviceAccount.annotations.eks\.amazonaws\.com/role-arn=${KARPENTER_IAM_ROLE_ARN} \ - --set settings.aws.clusterName=${CLUSTER_NAME} \ - --set settings.aws.clusterEndpoint=${CLUSTER_ENDPOINT} \ - --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ + --set settings.clusterName=${CLUSTER_NAME} \ + --set settings.clusterEndpoint=${CLUSTER_ENDPOINT} \ + --set settings.interruptionQueue=${CLUSTER_NAME} \ --wait ``` @@ -53,7 +52,7 @@ helm upgrade --install --namespace karpenter --create-namespace \ | hostNetwork | bool | `false` | Bind the pod to the host network. This is required when using a custom CNI. | | imagePullPolicy | string | `"IfNotPresent"` | Image pull policy for Docker images. | | imagePullSecrets | list | `[]` | Image pull secrets for Docker images. | -| logConfig | object | `{"enabled":true,"errorOutputPaths":["stderr"],"logEncoding":"console","logLevel":{"controller":"debug","global":"debug","webhook":"error"},"outputPaths":["stdout"]}` | Log configuration | +| logConfig | object | `{"enabled":true,"errorOutputPaths":["stderr"],"logEncoding":"console","logLevel":{"controller":"debug","global":"debug","webhook":"error"},"outputPaths":["stdout"]}` | Log configuration (Deprecated: Logging configuration will be dropped by v1, use logLevel instead) | | logConfig.enabled | bool | `true` | Whether to enable provisioning and mounting the log ConfigMap | | logConfig.errorOutputPaths | list | `["stderr"]` | Log errorOutputPaths - defaults to stderr only | | logConfig.logEncoding | string | `"console"` | Log encoding - defaults to console - must be one of 'json', 'console' | @@ -79,8 +78,10 @@ helm upgrade --install --namespace karpenter --create-namespace \ | serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | | serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | | serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. 
| -| settings | object | `{"aws":{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"tags":null,"vmMemoryOverheadPercent":0.075},"batchIdleDuration":"1s","batchMaxDuration":"10s","featureGates":{"driftEnabled":false}}` | Global Settings to configure Karpenter | -| settings.aws | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"tags":null,"vmMemoryOverheadPercent":0.075}` | AWS-specific configuration values | +| settings | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","aws":{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"reservedENIs":"0","tags":null,"vmMemoryOverheadPercent":0.075},"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","featureGates":{"driftEnabled":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | +| settings.assumeRoleARN | string | `""` | Role to assume for calling AWS services. | +| settings.assumeRoleDuration | string | `"15m"` | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. | +| settings.aws | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"reservedENIs":"0","tags":null,"vmMemoryOverheadPercent":0.075}` | AWS-specific configuration values (Deprecated: Use values without the "aws" prefix instead) | | settings.aws.assumeRoleARN | string | `""` | Role to assume for calling AWS services. | | settings.aws.assumeRoleDuration | string | `"15m"` | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. | | settings.aws.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. | @@ -91,12 +92,20 @@ helm upgrade --install --namespace karpenter --create-namespace \ | settings.aws.enablePodENI | bool | `false` | If true then instances that support pod ENI will report a vpc.amazonaws.com/pod-eni resource | | settings.aws.interruptionQueueName | string | `""` | interruptionQueueName is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. 
| | settings.aws.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | +| settings.aws.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | | settings.aws.tags | string | `nil` | The global tags to use on all AWS infrastructure resources (launch templates, instances, etc.) across node templates | | settings.aws.vmMemoryOverheadPercent | float | `0.075` | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types | | settings.batchIdleDuration | string | `"1s"` | The maximum amount of time with no new ending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. | | settings.batchMaxDuration | string | `"10s"` | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. | +| settings.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. | +| settings.clusterEndpoint | string | `""` | Cluster endpoint. If not set, will be discovered during startup (EKS only) | +| settings.clusterName | string | `""` | Cluster name. | | settings.featureGates | object | `{"driftEnabled":false}` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | | settings.featureGates.driftEnabled | bool | `false` | driftEnabled is in ALPHA and is disabled by default. Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes and the desired state of nodes set in provisioners and node templates | +| settings.interruptionQueue | string | `""` | interruptionQueue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. | +| settings.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | +| settings.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | +| settings.vmMemoryOverheadPercent | float | `0.075` | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types | | strategy | object | `{"rollingUpdate":{"maxUnavailable":1}}` | Strategy for updating the pod. | | terminationGracePeriodSeconds | string | `nil` | Override the default termination grace period for the pod. 
| | tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | diff --git a/charts/karpenter/README.md.gotmpl b/charts/karpenter/README.md.gotmpl index 8443b27bb1d6..75d5e128ada9 100644 --- a/charts/karpenter/README.md.gotmpl +++ b/charts/karpenter/README.md.gotmpl @@ -16,10 +16,9 @@ helm upgrade --install --namespace karpenter --create-namespace \ karpenter oci://public.ecr.aws/karpenter/{{ template "chart.name" . }} \ --version v{{ template "chart.version" . }} \ --set serviceAccount.annotations.eks\.amazonaws\.com/role-arn=${KARPENTER_IAM_ROLE_ARN} \ - --set settings.aws.clusterName=${CLUSTER_NAME} \ - --set settings.aws.clusterEndpoint=${CLUSTER_ENDPOINT} \ - --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \ - --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \ + --set settings.clusterName=${CLUSTER_NAME} \ + --set settings.clusterEndpoint=${CLUSTER_ENDPOINT} \ + --set settings.interruptionQueue=${CLUSTER_NAME} \ --wait ``` diff --git a/charts/karpenter/templates/_helpers.tpl b/charts/karpenter/templates/_helpers.tpl index c59591fd0c75..3d55afc2942d 100644 --- a/charts/karpenter/templates/_helpers.tpl +++ b/charts/karpenter/templates/_helpers.tpl @@ -141,33 +141,6 @@ This works because Helm treats dictionaries as mutable objects and allows passin {{- end }} {{- end }} -{{/* -Flatten Settings Map using "." syntax -*/}} -{{- define "flattenSettings" -}} -{{- $map := first . -}} -{{- $label := last . -}} -{{- range $key := (keys $map | uniq | sortAlpha) }} - {{- $sublabel := $key -}} - {{- $val := (get $map $key) -}} - {{- if $label -}} - {{- $sublabel = list $label $key | join "." -}} - {{- end -}} - {{/* Special-case "tags" since we want this to be a JSON object */}} - {{- if eq $key "tags" -}} - {{- if not (kindIs "invalid" $val) -}} - {{- $sublabel | quote | nindent 2 }}: {{ $val | toJson | quote }} - {{- end -}} - {{- else if kindOf $val | eq "map" -}} - {{- list $val $sublabel | include "flattenSettings" -}} - {{- else -}} - {{- if not (kindIs "invalid" $val) -}} - {{- $sublabel | quote | nindent 2 -}}: {{ $val | quote }} - {{- end -}} -{{- end -}} -{{- end -}} -{{- end -}} - {{/* Flatten the stdout logging outputs from args provided */}} diff --git a/charts/karpenter/templates/configmap.yaml b/charts/karpenter/templates/configmap.yaml index d52c7b6fe48c..8311a36192bf 100644 --- a/charts/karpenter/templates/configmap.yaml +++ b/charts/karpenter/templates/configmap.yaml @@ -10,4 +10,52 @@ metadata: {{- toYaml . | nindent 4 }} {{- end }} data: - {{- list .Values.settings "" | include "flattenSettings" | indent 2 }} +{{- with .Values.settings.batchMaxDuration }} + batchMaxDuration: "{{ . }}" +{{- end }} +{{- with .Values.settings.batchIdleDuration }} + batchIdleDuration: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.assumeRoleARN }} + aws.assumeRoleARN: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.assumeRoleDuration }} + aws.assumeRoleDuration: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.clusterCABundle }} + aws.clusterCABundle: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.clusterName }} + aws.clusterName: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.clusterEndpoint }} + aws.clusterEndpoint: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.defaultInstanceProfile }} + aws.defaultInstanceProfile: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.enablePodENI }} + aws.enablePodENI: "{{ . 
}}" +{{- end }} +{{- with .Values.settings.aws.enableENILimitedPodDensity }} + aws.enableENILimitedPodDensity: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.isolatedVPC }} + aws.isolatedVPC: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.vmMemoryOverheadPercent }} + aws.vmMemoryOverheadPercent: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.interruptionQueueName }} + aws.interruptionQueueName: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.tags }} + aws.tags: "{{ . }}" +{{- end }} +{{- with .Values.settings.aws.reservedENIs }} + aws.reservedENIs: "{{ . }}" +{{- end }} +{{- with .Values.settings.featureGates.driftEnabled }} + featureGates.driftEnabled: "${{ . }}" +{{- end }} + diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index f15f26a94f9b..66a7a7812cab 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -98,6 +98,52 @@ spec: containerName: controller divisor: "0" resource: limits.memory + - name: FEATURE_GATES + value: "Drift={{ or .Values.settings.featureGates.drift .Values.settings.featureGates.driftEnabled }}" + {{- with .Values.settings.batchMaxDuration }} + - name: BATCH_MAX_DURATION + value: "{{ . }}" + {{- end }} + {{- with .Values.settings.batchIdleDuration }} + - name: BATCH_IDLE_DURATION + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.assumeRoleARN .Values.settings.aws.assumeRoleARN }} + - name: ASSUME_ROLE_ARN + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.assumeRoleDuration .Values.settings.aws.assumeRoleDuration }} + - name: ASSUME_ROLE_DURATION + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.clusterCABundle .Values.settings.aws.clusterCABundle }} + - name: CLUSTER_CA_BUNDLE + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.clusterName .Values.settings.aws.clusterName }} + - name: CLUSTER_NAME + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.clusterEndpoint .Values.settings.aws.clusterEndpoint }} + - name: CLUSTER_ENDPOINT + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.isolatedVPC .Values.settings.aws.isolatedVPC }} + - name: ISOLATED_VPC + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.vmMemoryOverheadPercent .Values.settings.aws.vmMemoryOverheadPercent }} + - name: VM_MEMORY_OVERHEAD_PERCENT + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.interruptionQueue .Values.settings.aws.interruptionQueueName }} + - name: INTERRUPTION_QUEUE + value: "{{ . }}" + {{- end }} + {{- with or .Values.settings.reservedENIs .Values.settings.aws.reservedENIs }} + - name: RESERVED_ENIS + value: "{{ . }}" + {{- end }} {{- with .Values.controller.env }} {{- toYaml . | nindent 12 }} {{- end }} diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index d99f16c5261f..1e5dd3051fca 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -154,7 +154,7 @@ webhook: logLevel: debug # -- Global log encoding (Deprecated: Use logConfig.logEncoding instead) logEncoding: console -# -- Log configuration +# -- Log configuration (Deprecated: Logging configuration will be dropped by v1, use logLevel instead) logConfig: # -- Whether to enable provisioning and mounting the log ConfigMap enabled: true @@ -183,7 +183,28 @@ settings: # faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods # will be batched separately. 
batchIdleDuration: 1s - # -- AWS-specific configuration values + # -- Role to assume for calling AWS services. + assumeRoleARN: "" + # -- Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. + assumeRoleDuration: 15m + # -- Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. + clusterCABundle: "" + # -- Cluster name. + clusterName: "" + # -- Cluster endpoint. If not set, will be discovered during startup (EKS only) + clusterEndpoint: "" + # -- If true then assume we can't reach AWS services which don't have a VPC endpoint + # This also has the effect of disabling look-ups to the AWS pricing endpoint + isolatedVPC: false + # -- The VM memory overhead as a percent that will be subtracted from the total memory for all instance types + vmMemoryOverheadPercent: 0.075 + # -- interruptionQueue is disabled if not specified. Enabling interruption handling may + # require additional permissions on the controller service account. Additional permissions are outlined in the docs. + interruptionQueue: "" + # -- Reserved ENIs are not included in the calculations for max-pods or kube-reserved + # This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html + reservedENIs: "0" + # -- AWS-specific configuration values (Deprecated: Use values without the "aws" prefix instead) aws: # -- Role to assume for calling AWS services. assumeRoleARN: "" @@ -212,10 +233,17 @@ settings: interruptionQueueName: "" # -- The global tags to use on all AWS infrastructure resources (launch templates, instances, etc.) across node templates tags: + # -- Reserved ENIs are not included in the calculations for max-pods or kube-reserved + # This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html + reservedENIs: "0" # -- Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates # in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features featureGates: - # -- driftEnabled is in ALPHA and is disabled by default. - # Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes + # -- drift is in ALPHA and is disabled by default. + # Setting drift to true enables the drift disruption method to watch for drift between currently deployed nodes + # and the desired state of nodes set in provisioners and node templates + drift: false + # -- driftEnabled is in ALPHA and is disabled by default. 
(Deprecated: Use featureGates.drift instead) + # Setting driftEnabled to true enables the drift disruption method to watch for drift between currently deployed nodes # and the desired state of nodes set in provisioners and node templates driftEnabled: false diff --git a/go.mod b/go.mod index 46e05c333eb7..f68137a37a0f 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/aws/aws-sdk-go v1.46.0 github.com/aws/karpenter-core v0.31.1-0.20231019191151-73c0fd546f75 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c + github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.13.0 @@ -49,7 +50,6 @@ require ( github.com/go-kit/log v0.2.1 // indirect github.com/go-logfmt/logfmt v0.6.0 // indirect github.com/go-logr/logr v1.2.4 // indirect - github.com/go-logr/zapr v1.2.4 // indirect github.com/go-openapi/jsonpointer v0.20.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.4 // indirect diff --git a/hack/docs/instancetypes_gen_docs.go b/hack/docs/instancetypes_gen_docs.go index b87f3cb435ce..877f00274b41 100644 --- a/hack/docs/instancetypes_gen_docs.go +++ b/hack/docs/instancetypes_gen_docs.go @@ -40,6 +40,7 @@ import ( coreoptions "github.com/aws/karpenter-core/pkg/operator/options" coretest "github.com/aws/karpenter-core/pkg/test" nodepoolutil "github.com/aws/karpenter-core/pkg/utils/nodepool" + "github.com/aws/karpenter/pkg/apis/settings" awscloudprovider "github.com/aws/karpenter/pkg/cloudprovider" "github.com/aws/karpenter/pkg/operator" "github.com/aws/karpenter/pkg/operator/options" @@ -93,6 +94,8 @@ func main() { ClusterEndpoint: lo.ToPtr("https://docs-gen.aws"), IsolatedVPC: lo.ToPtr(true), // disable pricing lookup })) + // TODO @joinnis: Remove this when dropping alpha support + ctx = settings.ToContext(ctx, test.Settings()) ctx, op := operator.NewOperator(ctx, &coreoperator.Operator{ Manager: &FakeManager{}, diff --git a/pkg/apis/settings/settings.go b/pkg/apis/settings/settings.go index 052d6eb0341b..e8670b1fd496 100644 --- a/pkg/apis/settings/settings.go +++ b/pkg/apis/settings/settings.go @@ -127,7 +127,7 @@ func AsTypedString[T ~string](key string, target *T) configmap.ParseFunc { // AsStringMap parses a value as a JSON map of map[string]string. 
func AsStringMap(key string, target *map[string]string) configmap.ParseFunc { return func(data map[string]string) error { - if raw, ok := data[key]; ok { + if raw, ok := data[key]; ok && raw != "" { m := map[string]string{} if err := json.Unmarshal([]byte(raw), &m); err != nil { return err diff --git a/test/pkg/debug/monitor.go b/test/pkg/debug/monitor.go index 08217318e165..aa4a8f4b1689 100644 --- a/test/pkg/debug/monitor.go +++ b/test/pkg/debug/monitor.go @@ -18,11 +18,13 @@ import ( "context" "sync" + "github.com/go-logr/zapr" "github.com/samber/lo" "k8s.io/client-go/rest" "knative.dev/pkg/logging" controllerruntime "sigs.k8s.io/controller-runtime" "sigs.k8s.io/controller-runtime/pkg/client" + ctrl "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/metrics/server" @@ -39,6 +41,7 @@ type Monitor struct { func New(ctx context.Context, config *rest.Config, kubeClient client.Client) *Monitor { logger := logging.FromContext(ctx) + ctrl.SetLogger(zapr.NewLogger(logger.Desugar())) mgr := lo.Must(controllerruntime.NewManager(config, controllerruntime.Options{ Scheme: scheme.Scheme, BaseContext: func() context.Context { diff --git a/test/pkg/environment/aws/setup.go b/test/pkg/environment/aws/setup.go index 461264f3d7fc..990b648610e2 100644 --- a/test/pkg/environment/aws/setup.go +++ b/test/pkg/environment/aws/setup.go @@ -22,7 +22,8 @@ import ( "github.com/aws/karpenter/pkg/apis/v1alpha1" ) -var persistedSettings = &v1.ConfigMap{} +var persistedSettings []v1.EnvVar +var persistedSettingsLegacy = &v1.ConfigMap{} var ( CleanableObjects = []client.Object{ @@ -32,6 +33,7 @@ var ( func (env *Environment) BeforeEach() { persistedSettings = env.ExpectSettings() + persistedSettingsLegacy = env.ExpectSettingsLegacy() env.Environment.BeforeEach() } @@ -43,5 +45,6 @@ func (env *Environment) Cleanup() { func (env *Environment) AfterEach() { env.Environment.AfterEach() // Ensure we reset settings after collecting the controller logs - env.ExpectSettingsReplaced(persistedSettings.Data) + env.ExpectSettingsReplaced(persistedSettings...) 
+ env.ExpectSettingsReplacedLegacy(persistedSettingsLegacy.Data) } diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index ab856d96e318..7d4d932849ed 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -96,24 +96,76 @@ func (env *Environment) ExpectCreatedOrUpdated(objects ...client.Object) { } } -// ExpectSettings gets the karpenter-global-settings ConfigMap -func (env *Environment) ExpectSettings() *v1.ConfigMap { +func (env *Environment) ExpectSettings() (res []v1.EnvVar) { + GinkgoHelper() + + d := &appsv1.Deployment{} + Expect(env.Client.Get(env.Context, types.NamespacedName{Namespace: "karpenter", Name: "karpenter"}, d)).To(Succeed()) + Expect(d.Spec.Template.Spec.Containers).To(HaveLen(1)) + return lo.Map(d.Spec.Template.Spec.Containers[0].Env, func(v v1.EnvVar, _ int) v1.EnvVar { + return *v.DeepCopy() + }) +} + +func (env *Environment) ExpectSettingsReplaced(vars ...v1.EnvVar) { + GinkgoHelper() + + d := &appsv1.Deployment{} + Expect(env.Client.Get(env.Context, types.NamespacedName{Namespace: "karpenter", Name: "karpenter"}, d)).To(Succeed()) + Expect(d.Spec.Template.Spec.Containers).To(HaveLen(1)) + + stored := d.DeepCopy() + d.Spec.Template.Spec.Containers[0].Env = vars + + if !equality.Semantic.DeepEqual(d, stored) { + By("replacing environment variables for karpenter deployment") + Expect(env.Client.Patch(env.Context, d, client.MergeFrom(stored))).To(Succeed()) + env.EventuallyExpectKarpenterRestarted() + } +} + +func (env *Environment) ExpectSettingsOverridden(vars ...v1.EnvVar) { + GinkgoHelper() + + d := &appsv1.Deployment{} + Expect(env.Client.Get(env.Context, types.NamespacedName{Namespace: "karpenter", Name: "karpenter"}, d)).To(Succeed()) + Expect(d.Spec.Template.Spec.Containers).To(HaveLen(1)) + + stored := d.DeepCopy() + for _, v := range vars { + if _, i, ok := lo.FindIndexOf(d.Spec.Template.Spec.Containers[0].Env, func(e v1.EnvVar) bool { + return e.Name == v.Name + }); ok { + d.Spec.Template.Spec.Containers[0].Env[i] = v + } else { + d.Spec.Template.Spec.Containers[0].Env = append(d.Spec.Template.Spec.Containers[0].Env, v) + } + } + if !equality.Semantic.DeepEqual(d, stored) { + By("overriding environment variables for karpenter deployment") + Expect(env.Client.Patch(env.Context, d, client.MergeFrom(stored))).To(Succeed()) + env.EventuallyExpectKarpenterRestarted() + } +} + +// ExpectSettingsLegacy gets the karpenter-global-settings ConfigMap +func (env *Environment) ExpectSettingsLegacy() *v1.ConfigMap { GinkgoHelper() return env.ExpectConfigMapExists(types.NamespacedName{Namespace: "karpenter", Name: "karpenter-global-settings"}) } -// ExpectSettingsReplaced performs a full replace of the settings, replacing the existing data +// ExpectSettingsReplacedLegacy performs a full replace of the settings, replacing the existing data // with the data passed through -func (env *Environment) ExpectSettingsReplaced(data ...map[string]string) { +func (env *Environment) ExpectSettingsReplacedLegacy(data ...map[string]string) { GinkgoHelper() if env.ExpectConfigMapDataReplaced(types.NamespacedName{Namespace: "karpenter", Name: "karpenter-global-settings"}, data...) { env.EventuallyExpectKarpenterRestarted() } } -// ExpectSettingsOverridden overrides specific values specified through data. It only overrides +// ExpectSettingsOverriddenLegacy overrides specific values specified through data. 
It only overrides
// or inserts the specific values specified and does not upsert any of the existing data
-func (env *Environment) ExpectSettingsOverridden(data ...map[string]string) {
+func (env *Environment) ExpectSettingsOverriddenLegacy(data ...map[string]string) {
 	GinkgoHelper()
 	if env.ExpectConfigMapDataOverridden(types.NamespacedName{Namespace: "karpenter", Name: "karpenter-global-settings"}, data...) {
 		env.EventuallyExpectKarpenterRestarted()
 	}
@@ -226,6 +278,11 @@ func (env *Environment) EventuallyExpectKarpenterRestarted() {
 	GinkgoHelper()
 	By("rolling out the new karpenter deployment")
 	env.EventuallyExpectRollout("karpenter", "karpenter")
+	env.ExpectKarpenterLeaseOwnerChanged()
+}
+
+func (env *Environment) ExpectKarpenterLeaseOwnerChanged() {
+	GinkgoHelper()
 
 	By("waiting for a new karpenter pod to hold the lease")
 	pods := env.ExpectKarpenterPods()
diff --git a/test/suites/drift/suite_test.go b/test/suites/drift/suite_test.go
index baf91ee74faa..3bf0b67e34e0 100644
--- a/test/suites/drift/suite_test.go
+++ b/test/suites/drift/suite_test.go
@@ -87,9 +87,8 @@ var _ = Describe("Drift", Label("AWS"), func() {
 				},
 			},
 		})
-		env.ExpectSettingsOverridden(map[string]string{
-			"featureGates.driftEnabled": "true",
-		})
+		env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"})
+		env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"})
 	})
 	It("should deprovision nodes that have drifted due to AMIs", func() {
 		// choose an old static image
@@ -122,9 +121,8 @@ var _ = Describe("Drift", Label("AWS"), func() {
 		env.EventuallyExpectNotFound(pod, machine, node)
 	})
 	It("should not deprovision nodes that have drifted without the featureGate enabled", func() {
-		env.ExpectSettingsOverridden(map[string]string{
-			"featureGates.driftEnabled": "false",
-		})
+		env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"})
+		env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=false"})
 		// choose an old static image
 		parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{
 			Name: awssdk.String("/aws/service/eks/optimized-ami/1.23/amazon-linux-2/amazon-eks-node-1.23-v20230322/image_id"),
diff --git a/test/suites/expiration/expiration_test.go b/test/suites/expiration/expiration_test.go
index 9c3c7ff9ab96..75a31c5f2a94 100644
--- a/test/suites/expiration/expiration_test.go
+++ b/test/suites/expiration/expiration_test.go
@@ -73,9 +73,8 @@ var _ = Describe("Expiration", func() {
 			ProviderRef:            &v1alpha5.MachineTemplateRef{Name: nodeTemplate.Name},
 			TTLSecondsUntilExpired: ptr.Int64(30),
 		})
-		env.ExpectSettingsOverridden(map[string]string{
-			"featureGates.driftEnabled": "false",
-		})
+		env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"})
+		env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=false"})
 	})
 	It("should expire the node after the TTLSecondsUntilExpired is reached", func() {
 		var numPods int32 = 1
diff --git a/test/suites/integration/cni_test.go b/test/suites/integration/cni_test.go
index 85c78ff9b64e..a14efea66cd5 100644
--- a/test/suites/integration/cni_test.go
+++ b/test/suites/integration/cni_test.go
@@ -21,6 +21,7 @@ import (
 	"github.com/aws/aws-sdk-go/service/ec2"
 	. "github.com/onsi/ginkgo/v2"
 	.
"github.com/onsi/gomega" + "github.com/samber/lo" corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/types" @@ -33,9 +34,7 @@ import ( var _ = Describe("CNITests", func() { It("should set max pods to 110 when AWSENILimited when AWS_ENI_LIMITED_POD_DENSITY is false", func() { - env.ExpectSettingsOverridden(map[string]string{ - "aws.enableENILimitedPodDensity": "false", - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.enableENILimitedPodDensity": "false"}) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{ AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, @@ -73,9 +72,8 @@ var _ = Describe("CNITests", func() { Expect(allocatablePods).To(Equal(eniLimitedPodsFor(node.Labels["node.kubernetes.io/instance-type"]))) }) It("should set maxPods when reservedENIs is set", func() { - env.ExpectSettingsOverridden(map[string]string{ - "aws.reservedENIs": "1", - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.reservedENIs": "1"}) + env.ExpectSettingsOverridden(corev1.EnvVar{Name: "RESERVED_ENIS", Value: "1"}) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{ AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, @@ -111,9 +109,9 @@ func reservedENIsFor(instanceType string) int64 { Expect(err).ToNot(HaveOccurred()) networkInfo := *instance.InstanceTypes[0].NetworkInfo reservedENIs := 0 - reservedENIsStr, ok := env.ExpectSettings().Data["aws.reservedENIs"] + reservedENIsVar, ok := lo.Find(env.ExpectSettings(), func(v corev1.EnvVar) bool { return v.Name == "RESERVED_ENIS" }) if ok { - reservedENIs, err = strconv.Atoi(reservedENIsStr) + reservedENIs, err = strconv.Atoi(reservedENIsVar.Value) Expect(err).ToNot(HaveOccurred()) } return (*networkInfo.MaximumNetworkInterfaces-int64(reservedENIs))*(*networkInfo.Ipv4AddressesPerInterface-1) + 2 diff --git a/test/suites/integration/extended_resources_test.go b/test/suites/integration/extended_resources_test.go index 7e222743d734..98de604d8e4b 100644 --- a/test/suites/integration/extended_resources_test.go +++ b/test/suites/integration/extended_resources_test.go @@ -119,9 +119,7 @@ var _ = Describe("Extended Resources", func() { DeferCleanup(func() { env.ExpectPodENIDisabled() }) - env.ExpectSettingsOverridden(map[string]string{ - "aws.enablePodENI": "true", - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.enablePodENI": "true"}) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, diff --git a/test/suites/integration/tags_test.go b/test/suites/integration/tags_test.go index cd97aea53bf5..20891e7668ff 100644 --- a/test/suites/integration/tags_test.go +++ b/test/suites/integration/tags_test.go @@ -66,9 +66,7 @@ var _ = Describe("Tags", func() { }, }) - env.ExpectSettingsOverridden(map[string]string{ - "aws.tags": `{"TestTag": "TestVal", "example.com/tag": "custom-value"}`, - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.tags": `{"TestTag": "TestVal", "example.com/tag": "custom-value"}`}) provisioner := test.Provisioner(coretest.ProvisionerOptions{ProviderRef: &v1alpha5.MachineTemplateRef{Name: provider.Name}}) pod := coretest.Pod() diff --git a/test/suites/machine/garbage_collection_test.go b/test/suites/machine/garbage_collection_test.go index 74247796d631..c4b2b0112b23 100644 --- 
a/test/suites/machine/garbage_collection_test.go +++ b/test/suites/machine/garbage_collection_test.go @@ -25,6 +25,7 @@ import ( . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" + v1 "k8s.io/api/core/v1" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/test" @@ -134,9 +135,8 @@ var _ = Describe("NodeClaimGarbageCollection", func() { }) It("should succeed to garbage collect a Machine that was deleted without the cluster's knowledge", func() { // Disable the interruption queue for the garbage collection test - env.ExpectSettingsOverridden(map[string]string{ - "aws.interruptionQueueName": "", - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.interruptionQueueName": ""}) + env.ExpectSettingsOverridden(v1.EnvVar{Name: "INTERRUPTION_QUEUE", Value: ""}) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, diff --git a/test/suites/scale/deprovisioning_test.go b/test/suites/scale/deprovisioning_test.go index 1ae51e001155..473a82621bdf 100644 --- a/test/suites/scale/deprovisioning_test.go +++ b/test/suites/scale/deprovisioning_test.go @@ -81,9 +81,8 @@ var _ = Describe("Deprovisioning", Label(debug.NoWatch), Label(debug.NoEvents), var dsCount int BeforeEach(func() { - env.ExpectSettingsOverridden(map[string]string{ - "featureGates.driftEnabled": "true", - }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"}) + env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) nodeTemplate = awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step08-apply-helm-chart.sh b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step08-apply-helm-chart.sh index b6c86db990e4..8d167122b2b7 100755 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step08-apply-helm-chart.sh +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step08-apply-helm-chart.sh @@ -4,7 +4,7 @@ helm registry logout public.ecr.aws helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \ --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \ --set settings.clusterName=${CLUSTER_NAME} \ - --set settings.interruptionQueueName=${CLUSTER_NAME} \ + --set settings.interruptionQueue=${CLUSTER_NAME} \ --set controller.resources.requests.cpu=1 \ --set controller.resources.requests.memory=1Gi \ --set controller.resources.limits.cpu=1 \ diff --git a/website/content/en/preview/reference/settings.md b/website/content/en/preview/reference/settings.md index 78f959ea8dd6..d3d2a5a5351a 100644 --- a/website/content/en/preview/reference/settings.md +++ b/website/content/en/preview/reference/settings.md @@ -12,12 +12,19 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | Environment Variable | CLI Flag | Description | |--|--|--| +| ASSUME_ROLE_ARN | \-\-assume-role-arn | Role to assume for calling AWS services.| +| ASSUME_ROLE_DURATION | \-\-assume-role-duration | Duration 
of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRole set. (default = 15m0s)| | BATCH_IDLE_DURATION | \-\-batch-idle-duration | The maximum amount of time with no new pending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. (default = 1s)| | BATCH_MAX_DURATION | \-\-batch-max-duration | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. (default = 10s)| +| CLUSTER_CA_BUNDLE | \-\-cluster-ca-bundle | Cluster CA bundle for nodes to use for TLS connections with the API server. If not set, this is taken from the controller's TLS configuration.| +| CLUSTER_ENDPOINT | \-\-cluster-endpoint | The external kubernetes cluster endpoint for new nodes to connect with. If not specified, will discover the cluster endpoint using DescribeCluster API.| +| CLUSTER_NAME | \-\-cluster-name | [REQUIRED] The kubernetes cluster name for resource discovery.| | DISABLE_WEBHOOK | \-\-disable-webhook | Disable the admission and validation webhooks| | ENABLE_PROFILING | \-\-enable-profiling | Enable the profiling on the metric endpoint| | FEATURE_GATES | \-\-feature-gates | Optional features can be enabled / disabled using feature gates. Current options are: Drift (default = Drift=false)| | HEALTH_PROBE_PORT | \-\-health-probe-port | The port the health probe endpoint binds to for reporting controller health (default = 8081)| +| INTERRUPTION_QUEUE | \-\-interruption-queue | Interruption queue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs.| +| ISOLATED_VPC | \-\-isolated-vpc | If true, then assume we can't reach AWS services which don't have a VPC endpoint. This also has the effect of disabling look-ups to the AWS pricing endpoint.| | KARPENTER_SERVICE | \-\-karpenter-service | The Karpenter Service name for the dynamic webhook certificate| | KUBE_CLIENT_BURST | \-\-kube-client-burst | The maximum allowed burst of queries to the kube-apiserver (default = 300)| | KUBE_CLIENT_QPS | \-\-kube-client-qps | The smoothed rate of qps to kube-apiserver (default = 200)| @@ -25,6 +32,8 @@ Karpenter surfaces environment variables and CLI parameters to allow you to conf | LOG_LEVEL | \-\-log-level | Log verbosity level. Can be one of 'debug', 'info', or 'error'| | MEMORY_LIMIT | \-\-memory-limit | Memory limit on the container running the controller. The GC soft memory limit is set to 90% of this value. (default = -1)| | METRICS_PORT | \-\-metrics-port | The port the metric endpoint binds to for operating metrics about the controller itself (default = 8000)| +| RESERVED_ENIS | \-\-reserved-enis | Reserved ENIs are not included in the calculations for max-pods or kube-reserved. This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html. (default = 0)| +| VM_MEMORY_OVERHEAD_PERCENT | \-\-vm-memory-overhead-percent | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types. 
(default = 0.075)|
| WEBHOOK_METRICS_PORT | \-\-webhook-metrics-port | The port the webhook metric endpoint binds to for operating metrics about the webhook (default = 8001)|
| WEBHOOK_PORT | \-\-webhook-port | The port the webhook endpoint binds to for validation and mutation of resources (default = 8443)|

diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md
index d6a39d2ae1d2..9d4d6937393d 100644
--- a/website/content/en/preview/upgrading/upgrade-guide.md
+++ b/website/content/en/preview/upgrading/upgrade-guide.md
@@ -137,8 +137,8 @@ Add `~/go/bin` to your $PATH, if you have not already done so.
     helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter --version ${KARPENTER_VERSION} --namespace karpenter --create-namespace \
       --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \
       --set settings.aws.defaultInstanceProfile=KarpenterNodeInstanceProfile-${CLUSTER_NAME} \
-      --set settings.aws.clusterName=${CLUSTER_NAME} \
-      --set settings.aws.interruptionQueueName=${CLUSTER_NAME} \
+      --set settings.clusterName=${CLUSTER_NAME} \
+      --set settings.interruptionQueue=${CLUSTER_NAME} \
       --set controller.resources.requests.cpu=1 \
       --set controller.resources.requests.memory=1Gi \
       --set controller.resources.limits.cpu=1 \
@@ -204,6 +204,8 @@ Add `~/go/bin` to your $PATH, if you have not already done so.
 
 * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm.
 * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster.
+* Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging.
+* Karpenter now moves all AWS controller-wide configuration settings from the `settings.aws` block into the top-level `settings` block. The previous `settings.aws` block is deprecated and will be dropped at v0.33.0.
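
The two settings schemes map one-to-one onto each other. Below is a minimal sketch of an install using only the new top-level keys; the `--set` names come from the chart changes in this patch, the exported shell variables (`KARPENTER_VERSION`, `KARPENTER_IAM_ROLE_ARN`, `CLUSTER_NAME`) are assumed to be populated as in the earlier steps, and the `settings.featureGates.drift` flag is composed from the new values.yaml key for illustration rather than copied from the guide:

```bash
# Deprecated key (dropped at v0.33.0)      ->  New key (v0.32.0+)
#   settings.aws.clusterName               ->  settings.clusterName
#   settings.aws.interruptionQueueName     ->  settings.interruptionQueue
#   settings.featureGates.driftEnabled     ->  settings.featureGates.drift
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --version "${KARPENTER_VERSION}" --namespace karpenter --create-namespace \
  --set serviceAccount.annotations."eks\.amazonaws\.com/role-arn"=${KARPENTER_IAM_ROLE_ARN} \
  --set settings.clusterName=${CLUSTER_NAME} \
  --set settings.interruptionQueue=${CLUSTER_NAME} \
  --set settings.featureGates.drift=true
```

Each top-level value is passed through to the controller as an environment variable (`CLUSTER_NAME`, `INTERRUPTION_QUEUE`, `FEATURE_GATES`, and so on), as shown in the deployment.yaml template changes above, which is why the legacy `karpenter-global-settings` ConfigMap is no longer required.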
### Upgrading to v0.31.0+ From 68889664112379b7bd0b0131aa78fdb608c5c2fa Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 11:23:14 -0700 Subject: [PATCH 03/47] docs: Fix docs redirects (#4870) --- website/static/_redirects | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/website/static/_redirects b/website/static/_redirects index 5655bc1466e9..921f268b8ec6 100644 --- a/website/static/_redirects +++ b/website/static/_redirects @@ -5,11 +5,25 @@ /v0.26/getting-started/getting-started-with-karpenter/* /v0.26/getting-started/getting-started-with-eksctl/:splat /v0.27/getting-started/getting-started-with-eksctl/* /v0.27/getting-started/getting-started-with-karpenter/:splat /v0.28/getting-started/getting-started-with-eksctl/* /v0.28/getting-started/getting-started-with-karpenter/:splat + +# Redirect all preview documentation to the new routes /preview/concepts/provisioners /preview/concepts/nodepools /preview/concepts/node-templates /preview/concepts/nodeclasses /preview/concepts/deprovisioning /preview/concepts/disruption -/preview/upgrade/* /preview/upgrading/* +/preview/upgrade-guide /preview/upgrading/upgrade-guide /preview/concepts/instance-types /preview/reference/instance-types /preview/concepts/metrics /preview/reference/metrics /preview/concepts/settings /preview/reference/settings /preview/concepts/threat-model /preview/reference/threat-model + +# Redirect all v0.32 documentation to the new routes +/v0.32/concepts/provisioners /v0.32/concepts/nodepools +/v0.32/concepts/node-templates /v0.32/concepts/nodeclasses +/v0.32/concepts/deprovisioning /v0.32/concepts/disruption +/v0.32/upgrade-guide /v0.32/upgrading/upgrade-guide +/v0.32/concepts/instance-types /v0.32/reference/instance-types +/v0.32/concepts/metrics /v0.32/reference/metrics +/v0.32/concepts/settings /v0.32/reference/settings +/v0.32/concepts/threat-model /v0.32/reference/threat-model + +# TODO @joinnis: Add redirects for the "docs" version and future versions after v0.32 release \ No newline at end of file From be01fe554c8a0be0a719b3f7e8ff2753f8ea358c Mon Sep 17 00:00:00 2001 From: Jason Deal Date: Fri, 20 Oct 2023 11:27:25 -0700 Subject: [PATCH 04/47] chore: remove EnablePodENI logic (#4867) --- pkg/providers/instancetype/nodeclass_test.go | 18 ------------- .../instancetype/nodetemplate_test.go | 18 ------------- pkg/providers/instancetype/types.go | 27 +++++++++---------- 3 files changed, 12 insertions(+), 51 deletions(-) diff --git a/pkg/providers/instancetype/nodeclass_test.go b/pkg/providers/instancetype/nodeclass_test.go index 16cb06e0f502..bee4807860ea 100644 --- a/pkg/providers/instancetype/nodeclass_test.go +++ b/pkg/providers/instancetype/nodeclass_test.go @@ -428,20 +428,6 @@ var _ = Describe("NodeClass/InstanceTypes", func() { ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) ExpectScheduled(ctx, env.Client, pod) }) - It("should fail to launch AWS Pod ENI if the setting enabling it isn't set", func() { - ctx = settings.ToContext(ctx, test.Settings(test.SettingOptions{ - EnablePodENI: lo.ToPtr(false), - })) - ExpectApplied(ctx, env.Client, nodePool, nodeClass) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1beta1.ResourceAWSPodENI: resource.MustParse("1")}, - Limits: v1.ResourceList{v1beta1.ResourceAWSPodENI: resource.MustParse("1")}, - }, - }) - ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) - ExpectNotScheduled(ctx, env.Client, pod) - 
}) It("should launch AWS Pod ENI on a compatible instance type", func() { ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{ @@ -928,10 +914,6 @@ var _ = Describe("NodeClass/InstanceTypes", func() { } }) It("should override max-pods value when AWSENILimitedPodDensity is unset", func() { - ctx = settings.ToContext(ctx, test.Settings(test.SettingOptions{ - EnablePodENI: lo.ToPtr(false), - })) - instanceInfo, err := awsEnv.InstanceTypesProvider.GetInstanceTypes(ctx) Expect(err).To(BeNil()) nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{ diff --git a/pkg/providers/instancetype/nodetemplate_test.go b/pkg/providers/instancetype/nodetemplate_test.go index 30e03bdfa179..890e4d387f17 100644 --- a/pkg/providers/instancetype/nodetemplate_test.go +++ b/pkg/providers/instancetype/nodetemplate_test.go @@ -435,20 +435,6 @@ var _ = Describe("NodeTemplate/InstanceTypes", func() { ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) ExpectScheduled(ctx, env.Client, pod) }) - It("should fail to launch AWS Pod ENI if the setting enabling it isn't set", func() { - ctx = settings.ToContext(ctx, test.Settings(test.SettingOptions{ - EnablePodENI: lo.ToPtr(false), - })) - ExpectApplied(ctx, env.Client, provisioner, nodeTemplate) - pod := coretest.UnschedulablePod(coretest.PodOptions{ - ResourceRequirements: v1.ResourceRequirements{ - Requests: v1.ResourceList{v1alpha1.ResourceAWSPodENI: resource.MustParse("1")}, - Limits: v1.ResourceList{v1alpha1.ResourceAWSPodENI: resource.MustParse("1")}, - }, - }) - ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) - ExpectNotScheduled(ctx, env.Client, pod) - }) It("should launch AWS Pod ENI on a compatible instance type", func() { ExpectApplied(ctx, env.Client, provisioner, nodeTemplate) pod := coretest.UnschedulablePod(coretest.PodOptions{ @@ -970,10 +956,6 @@ var _ = Describe("NodeTemplate/InstanceTypes", func() { } }) It("should override max-pods value when AWSENILimitedPodDensity is unset", func() { - ctx = settings.ToContext(ctx, test.Settings(test.SettingOptions{ - EnablePodENI: lo.ToPtr(false), - })) - instanceInfo, err := awsEnv.InstanceTypesProvider.GetInstanceTypes(ctx) Expect(err).To(BeNil()) provisioner = test.Provisioner(coretest.ProvisionerOptions{Kubelet: &v1alpha5.KubeletConfiguration{MaxPods: ptr.Int32(10)}}) diff --git a/pkg/providers/instancetype/types.go b/pkg/providers/instancetype/types.go index 5f132f1c6cb3..750543c3e251 100644 --- a/pkg/providers/instancetype/types.go +++ b/pkg/providers/instancetype/types.go @@ -173,19 +173,16 @@ func computeCapacity(ctx context.Context, info *ec2.InstanceTypeInfo, amiFamily blockDeviceMappings []*v1beta1.BlockDeviceMapping, kc *corev1beta1.KubeletConfiguration) v1.ResourceList { resourceList := v1.ResourceList{ - v1.ResourceCPU: *cpu(info), - v1.ResourceMemory: *memory(ctx, info), - v1.ResourceEphemeralStorage: *ephemeralStorage(amiFamily, blockDeviceMappings), - v1.ResourcePods: *pods(ctx, info, amiFamily, kc), - v1beta1.ResourceAWSPodENI: *awsPodENI(ctx, aws.StringValue(info.InstanceType)), - v1beta1.ResourceNVIDIAGPU: *nvidiaGPUs(info), - v1beta1.ResourceAMDGPU: *amdGPUs(info), - v1beta1.ResourceAWSNeuron: *awsNeurons(info), - v1beta1.ResourceHabanaGaudi: *habanaGaudis(info), - } - if _, ok := amiFamily.(*amifamily.Windows); ok { - //ResourcePrivateIPv4Address is the same as ENILimitedPods on Windows node - resourceList[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info) + v1.ResourceCPU: 
*cpu(info), + v1.ResourceMemory: *memory(ctx, info), + v1.ResourceEphemeralStorage: *ephemeralStorage(amiFamily, blockDeviceMappings), + v1.ResourcePods: *pods(ctx, info, amiFamily, kc), + v1beta1.ResourceAWSPodENI: *awsPodENI(aws.StringValue(info.InstanceType)), + v1beta1.ResourceNVIDIAGPU: *nvidiaGPUs(info), + v1beta1.ResourceAMDGPU: *amdGPUs(info), + v1beta1.ResourceAWSNeuron: *awsNeurons(info), + v1beta1.ResourceHabanaGaudi: *habanaGaudis(info), + v1beta1.ResourcePrivateIPv4Address: *privateIPv4Address(info), } return resourceList } @@ -238,10 +235,10 @@ func ephemeralStorage(amiFamily amifamily.AMIFamily, blockDeviceMappings []*v1be return amifamily.DefaultEBS.VolumeSize } -func awsPodENI(ctx context.Context, name string) *resource.Quantity { +func awsPodENI(name string) *resource.Quantity { // https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html#supported-instance-types limits, ok := Limits[name] - if settings.FromContext(ctx).EnablePodENI && ok && limits.IsTrunkingCompatible { + if ok && limits.IsTrunkingCompatible { return resources.Quantity(fmt.Sprint(limits.BranchInterface)) } return resources.Quantity("0") From f55af443e2394218c1cf845f6c41233b66c67ffa Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 12:56:37 -0700 Subject: [PATCH 05/47] Fix returning non-empty reconcile result and error (#4864) --- go.mod | 2 +- go.sum | 4 ++-- pkg/controllers/interruption/controller.go | 5 ++++- pkg/controllers/nodeclaim/garbagecollection/controller.go | 5 ++++- pkg/controllers/nodeclaim/link/controller.go | 5 ++++- pkg/controllers/nodeclaim/tagging/controller.go | 7 +------ pkg/controllers/nodeclass/controller.go | 5 ++++- website/static/_redirects | 2 +- 8 files changed, 21 insertions(+), 14 deletions(-) diff --git a/go.mod b/go.mod index f68137a37a0f..a654b0718615 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.46.0 - github.com/aws/karpenter-core v0.31.1-0.20231019191151-73c0fd546f75 + github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index e3db90e43d87..4de6857477d5 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.46.0 h1:Igh7W8P+sA6mXJ9yhreOSweefLapcqekhxQlY1llxcM= github.com/aws/aws-sdk-go v1.46.0/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231019191151-73c0fd546f75 h1:YGsJA1sH7xW749onvhczydUZksdMm3PB/AJr8qcZllA= -github.com/aws/karpenter-core v0.31.1-0.20231019191151-73c0fd546f75/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE= +github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e h1:7heVq5GV1sMgcFlnD6pNElqmbxKMLHX9kaXm9njZC0Y= +github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod 
h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/pkg/controllers/interruption/controller.go b/pkg/controllers/interruption/controller.go index f6d594764755..6e466f5133f9 100644 --- a/pkg/controllers/interruption/controller.go +++ b/pkg/controllers/interruption/controller.go @@ -114,7 +114,10 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc } errs[i] = c.deleteMessage(ctx, sqsMessages[i]) }) - return reconcile.Result{}, multierr.Combine(errs...) + if err = multierr.Combine(errs...); err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{}, nil } func (c *Controller) Name() string { diff --git a/pkg/controllers/nodeclaim/garbagecollection/controller.go b/pkg/controllers/nodeclaim/garbagecollection/controller.go index 8183528e45b3..72f2477a4021 100644 --- a/pkg/controllers/nodeclaim/garbagecollection/controller.go +++ b/pkg/controllers/nodeclaim/garbagecollection/controller.go @@ -97,8 +97,11 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc errs[i] = c.garbageCollect(ctx, managedRetrieved[i], nodeList) } }) + if err = multierr.Combine(errs...); err != nil { + return reconcile.Result{}, err + } c.successfulCount++ - return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, multierr.Combine(errs...) + return reconcile.Result{RequeueAfter: lo.Ternary(c.successfulCount <= 20, time.Second*10, time.Minute*2)}, nil } func (c *Controller) garbageCollect(ctx context.Context, nodeClaim *v1beta1.NodeClaim, nodeList *v1.NodeList) error { diff --git a/pkg/controllers/nodeclaim/link/controller.go b/pkg/controllers/nodeclaim/link/controller.go index b54130725154..fcad21bbe82e 100644 --- a/pkg/controllers/nodeclaim/link/controller.go +++ b/pkg/controllers/nodeclaim/link/controller.go @@ -96,8 +96,11 @@ func (c *Controller) Reconcile(ctx context.Context, _ reconcile.Request) (reconc workqueue.ParallelizeUntil(ctx, 100, len(retrieved), func(i int) { errs[i] = c.link(ctx, retrieved[i], machineList.Items) }) + if err = multierr.Combine(errs...); err != nil { + return reconcile.Result{}, err + } // Effectively, don't requeue this again once it succeeds - return reconcile.Result{RequeueAfter: math.MaxInt64}, multierr.Combine(errs...) 
+ return reconcile.Result{RequeueAfter: math.MaxInt64}, nil } func (c *Controller) link(ctx context.Context, retrieved *v1beta1.NodeClaim, existingMachines []v1alpha5.Machine) error { diff --git a/pkg/controllers/nodeclaim/tagging/controller.go b/pkg/controllers/nodeclaim/tagging/controller.go index bc620ebb9b73..c3add64d0c4d 100644 --- a/pkg/controllers/nodeclaim/tagging/controller.go +++ b/pkg/controllers/nodeclaim/tagging/controller.go @@ -61,11 +61,9 @@ func (c *Controller) Name() string { func (c *Controller) Reconcile(ctx context.Context, nodeClaim *corev1beta1.NodeClaim) (reconcile.Result, error) { stored := nodeClaim.DeepCopy() - if !isTaggable(nodeClaim) { return reconcile.Result{}, nil } - ctx = logging.WithLogger(ctx, logging.FromContext(ctx).With("provider-id", nodeClaim.Status.ProviderID)) id, err := utils.ParseInstanceID(nodeClaim.Status.ProviderID) if err != nil { @@ -73,18 +71,15 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClaim *corev1beta1.NodeC logging.FromContext(ctx).Errorf("failed to parse instance ID, %w", err) return reconcile.Result{}, nil } - - if err := c.tagInstance(ctx, nodeClaim, id); err != nil { + if err = c.tagInstance(ctx, nodeClaim, id); err != nil { return reconcile.Result{}, cloudprovider.IgnoreNodeClaimNotFoundError(err) } - nodeClaim.Annotations = lo.Assign(nodeClaim.Annotations, map[string]string{v1beta1.AnnotationInstanceTagged: "true"}) if !equality.Semantic.DeepEqual(nodeClaim, stored) { if err := c.kubeClient.Patch(ctx, nodeClaim, client.MergeFrom(stored)); err != nil { return reconcile.Result{}, client.IgnoreNotFound(err) } } - return reconcile.Result{}, nil } diff --git a/pkg/controllers/nodeclass/controller.go b/pkg/controllers/nodeclass/controller.go index 22eaa711d9a6..e587fcc56fa7 100644 --- a/pkg/controllers/nodeclass/controller.go +++ b/pkg/controllers/nodeclass/controller.go @@ -92,7 +92,10 @@ func (c *Controller) Reconcile(ctx context.Context, nodeClass *v1beta1.EC2NodeCl err = multierr.Append(err, client.IgnoreNotFound(patchErr)) } } - return reconcile.Result{RequeueAfter: 5 * time.Minute}, err + if err != nil { + return reconcile.Result{}, err + } + return reconcile.Result{RequeueAfter: 5 * time.Minute}, nil } func (c *Controller) Finalize(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) (reconcile.Result, error) { diff --git a/website/static/_redirects b/website/static/_redirects index 921f268b8ec6..ae6fac8d0a54 100644 --- a/website/static/_redirects +++ b/website/static/_redirects @@ -26,4 +26,4 @@ /v0.32/concepts/settings /v0.32/reference/settings /v0.32/concepts/threat-model /v0.32/reference/threat-model -# TODO @joinnis: Add redirects for the "docs" version and future versions after v0.32 release \ No newline at end of file +# TODO @joinnis: Add redirects for the "docs" version and future versions after v0.32 release From 355903d84baf60c232970dca7509e3303fe19c96 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 14:30:10 -0700 Subject: [PATCH 06/47] fix: Enforce `karpenter.k8s.aws` tags on instance profiles and instances (#4872) --- pkg/apis/v1beta1/labels.go | 4 ++-- pkg/providers/instance/instance.go | 1 + .../cloudformation.yaml | 8 ++++---- .../content/en/preview/reference/cloudformation.md | 14 +++++++------- .../upgrading/v1beta1-controller-policy.json | 10 ++++++++++ 5 files changed, 24 insertions(+), 13 deletions(-) diff --git a/pkg/apis/v1beta1/labels.go b/pkg/apis/v1beta1/labels.go index 54acdfb8fbbe..84396f3f4d00 100644 --- a/pkg/apis/v1beta1/labels.go +++ 
b/pkg/apis/v1beta1/labels.go @@ -93,7 +93,7 @@ var ( ResourceAWSPodENI v1.ResourceName = "vpc.amazonaws.com/pod-eni" ResourcePrivateIPv4Address v1.ResourceName = "vpc.amazonaws.com/PrivateIPv4Address" - LabelNodeClass = Group + "/nodeclass" + LabelNodeClass = Group + "/ec2nodeclass" LabelInstanceHypervisor = Group + "/instance-hypervisor" LabelInstanceEncryptionInTransitSupported = Group + "/instance-encryption-in-transit-supported" @@ -113,6 +113,6 @@ var ( LabelInstanceAcceleratorName = Group + "/instance-accelerator-name" LabelInstanceAcceleratorManufacturer = Group + "/instance-accelerator-manufacturer" LabelInstanceAcceleratorCount = Group + "/instance-accelerator-count" - AnnotationNodeClassHash = Group + "/nodeclass-hash" + AnnotationNodeClassHash = Group + "/ec2nodeclass-hash" AnnotationInstanceTagged = Group + "/tagged" ) diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go index 35690da3c5ea..3721dd38a8b9 100644 --- a/pkg/providers/instance/instance.go +++ b/pkg/providers/instance/instance.go @@ -267,6 +267,7 @@ func getTags(ctx context.Context, nodeClass *v1beta1.EC2NodeClass, nodeClaim *co fmt.Sprintf("kubernetes.io/cluster/%s", options.FromContext(ctx).ClusterName): "owned", corev1beta1.NodePoolLabelKey: nodeClaim.Labels[corev1beta1.NodePoolLabelKey], corev1beta1.ManagedByAnnotationKey: options.FromContext(ctx).ClusterName, + v1beta1.LabelNodeClass: nodeClass.Name, } } return lo.Assign(overridableTags, settings.FromContext(ctx).Tags, nodeClass.Spec.Tags, staticTags) diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml b/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml index 71832c76bec8..08baa8f22f81 100644 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml @@ -207,7 +207,7 @@ Resources: "aws:RequestTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:RequestTag/karpenter.sh/nodeclass": "*" + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, @@ -226,8 +226,8 @@ Resources: "aws:RequestTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:ResourceTag/karpenter.sh/nodeclass": "*", - "aws:RequestTag/karpenter.sh/nodeclass": "*" + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, @@ -246,7 +246,7 @@ Resources: "aws:ResourceTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:ResourceTag/karpenter.sh/nodeclass": "*" + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, diff --git a/website/content/en/preview/reference/cloudformation.md b/website/content/en/preview/reference/cloudformation.md index 500b87c77439..dab1459debfb 100644 --- a/website/content/en/preview/reference/cloudformation.md +++ b/website/content/en/preview/reference/cloudformation.md @@ -352,7 +352,7 @@ This gives EC2 permission explicit permission to use the `KarpenterNodeRole-${Cl The AllowScopedInstanceProfileCreationActions Sid gives the Karpenter controller permission to create a new instance profile with [`iam:CreateInstanceProfile`](https://docs.aws.amazon.com/IAM/latest/APIReference/API_CreateInstanceProfile.html), provided that the request is made to a cluster with `kubernetes.io/cluster/${ClusterName` set to owned and is made in the current region. 
-Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karpenter can generate instance profiles on your behalf based on roles specified in your `EC2NodeClasses` that you use to configure Karpenter. +Also, `karpenter.k8s.aws/ec2nodeclass` must be set to some value. This ensures that Karpenter can generate instance profiles on your behalf based on roles specified in your `EC2NodeClasses` that you use to configure Karpenter. ```json { @@ -368,7 +368,7 @@ Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karp "aws:RequestTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:RequestTag/karpenter.sh/nodeclass": "*" + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } } @@ -377,7 +377,7 @@ Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karp #### AllowScopedInstanceProfileTagActions The AllowScopedInstanceProfileTagActions Sid gives the Karpenter controller permission to tag an instance profile with [`iam:TagInstanceProfile`](https://docs.aws.amazon.com/IAM/latest/APIReference/API_TagInstanceProfile.html), based on the values shown below, -Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karpenter is only able to act on instance profiles that it provisions for this cluster. +Also, `karpenter.k8s.aws/ec2nodeclass` must be set to some value. This ensures that Karpenter is only able to act on instance profiles that it provisions for this cluster. ```json { @@ -395,8 +395,8 @@ Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karp "aws:RequestTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:ResourceTag/karpenter.sh/nodeclass": "*", - "aws:RequestTag/karpenter.sh/nodeclass": "*" + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } } @@ -407,7 +407,7 @@ Also, `karpenter.sh/nodeclass` must be set to some value. This ensures that Karp The AllowScopedInstanceProfileActions Sid gives the Karpenter controller permission to perform [`iam:AddRoleToInstanceProfile`](https://docs.aws.amazon.com/IAM/latest/APIReference/API_AddRoleToInstanceProfile.html), [`iam:RemoveRoleFromInstanceProfile`](https://docs.aws.amazon.com/IAM/latest/APIReference/API_RemoveRoleFromInstanceProfile.html), and [`iam:DeleteInstanceProfile`](https://docs.aws.amazon.com/IAM/latest/APIReference/API_DeleteInstanceProfile.html) actions, provided that the request is made to a cluster with `kubernetes.io/cluster/${ClusterName` set to owned and is made in the current region. -Also, `karpenter.sh/nodeclass` must be set to some value. This permission is further enforced by the `iam:PassRole` permission. If Karpenter attempts to add a role to an instance profile that it doesn't have `iam:PassRole` permission on, that call will fail. Therefore, if you configure Karpenter to use a new role through the `EC2NodeClass`, ensure that you also specify that role within your `iam:PassRole` permission. +Also, `karpenter.k8s.aws/ec2nodeclass` must be set to some value. This permission is further enforced by the `iam:PassRole` permission. If Karpenter attempts to add a role to an instance profile that it doesn't have `iam:PassRole` permission on, that call will fail. Therefore, if you configure Karpenter to use a new role through the `EC2NodeClass`, ensure that you also specify that role within your `iam:PassRole` permission. ```json { @@ -425,7 +425,7 @@ Also, `karpenter.sh/nodeclass` must be set to some value. 
This permission is fur "aws:ResourceTag/topology.kubernetes.io/region": "${AWS::Region}" }, "StringLike": { - "aws:ResourceTag/karpenter.sh/nodeclass": "*" + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" } } } diff --git a/website/content/en/preview/upgrading/v1beta1-controller-policy.json b/website/content/en/preview/upgrading/v1beta1-controller-policy.json index efe818b0708e..97b66c6b983d 100644 --- a/website/content/en/preview/upgrading/v1beta1-controller-policy.json +++ b/website/content/en/preview/upgrading/v1beta1-controller-policy.json @@ -170,6 +170,9 @@ "StringEquals": { "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" + }, + "StringLike": { + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, @@ -184,6 +187,10 @@ "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}", "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", + "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, @@ -200,6 +207,9 @@ "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}" + }, + "StringLike": { + "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" } } }, From ee76bc3ae4bdc5f2aa45f457874e71f2bfab7073 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 16:18:04 -0700 Subject: [PATCH 07/47] test: v1beta1 e2e testing (#4873) Co-authored-by: njtran --- .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 23 +- test/pkg/debug/monitor.go | 1 + test/pkg/debug/nodeclaim.go | 82 +++ test/pkg/environment/aws/setup.go | 4 +- test/pkg/environment/common/expectations.go | 24 + test/pkg/environment/common/setup.go | 15 +- test/suites/{ => alpha}/chaos/suite_test.go | 2 +- .../{ => alpha}/consolidation/suite_test.go | 2 +- test/suites/{ => alpha}/drift/suite_test.go | 2 +- .../{ => alpha}/expiration/expiration_test.go | 2 +- .../{ => alpha}/integration/ami_test.go | 0 .../integration/aws_metadata_test.go | 0 .../backwards_compatability_test.go | 0 .../integration/block_device_mappings_test.go | 0 .../{ => alpha}/integration/cni_test.go | 0 .../{ => alpha}/integration/daemonset_test.go | 0 .../{ => alpha}/integration/emptiness_test.go | 0 .../integration/extended_resources_test.go | 0 .../{ => alpha}/integration/hash_test.go | 0 .../integration/instance_profile_test.go | 0 .../integration/kubelet_config_test.go | 0 .../lease_garbagecollection_test.go | 0 .../integration/scheduling_test.go | 0 .../integration/security_group_test.go | 0 .../{ => alpha}/integration/storage_test.go | 0 .../{ => alpha}/integration/subnet_test.go | 0 .../{ => alpha}/integration/suite_test.go | 2 +- .../{ => alpha}/integration/tags_test.go | 0 .../integration/termination_test.go | 0 .../testdata/al2_no_mime_userdata_input.sh | 0 .../testdata/al2_userdata_input.sh | 0 .../integration/testdata/amd_driver_input.sh | 0 .../integration/testdata/br_userdata_input.sh | 0 .../testdata/windows_userdata_input.ps1 | 0 .../{ => alpha}/integration/webhook_test.go | 0 .../{ => alpha}/interruption/suite_test.go | 2 +- test/suites/{ => alpha}/ipv6/suite_test.go | 2 +- .../machine/garbage_collection_test.go | 0 test/suites/{ => alpha}/machine/link_test.go | 0 .../{ => alpha}/machine/machine_test.go | 0 test/suites/{ => alpha}/machine/suite_test.go | 2 +- 
.../al2_userdata_custom_labels_input.sh | 0 .../machine/testdata/al2_userdata_input.sh | 0 .../{ => alpha}/scale/deprovisioning_test.go | 0 .../{ => alpha}/scale/provisioning_test.go | 0 test/suites/{ => alpha}/scale/suite_test.go | 2 +- .../{ => alpha}/utilization/suite_test.go | 2 +- test/suites/beta/drift/suite_test.go | 560 ++++++++++++++++++ 49 files changed, 705 insertions(+), 26 deletions(-) create mode 100644 test/pkg/debug/nodeclaim.go rename test/suites/{ => alpha}/chaos/suite_test.go (99%) rename test/suites/{ => alpha}/consolidation/suite_test.go (99%) rename test/suites/{ => alpha}/drift/suite_test.go (99%) rename test/suites/{ => alpha}/expiration/expiration_test.go (99%) rename test/suites/{ => alpha}/integration/ami_test.go (100%) rename test/suites/{ => alpha}/integration/aws_metadata_test.go (100%) rename test/suites/{ => alpha}/integration/backwards_compatability_test.go (100%) rename test/suites/{ => alpha}/integration/block_device_mappings_test.go (100%) rename test/suites/{ => alpha}/integration/cni_test.go (100%) rename test/suites/{ => alpha}/integration/daemonset_test.go (100%) rename test/suites/{ => alpha}/integration/emptiness_test.go (100%) rename test/suites/{ => alpha}/integration/extended_resources_test.go (100%) rename test/suites/{ => alpha}/integration/hash_test.go (100%) rename test/suites/{ => alpha}/integration/instance_profile_test.go (100%) rename test/suites/{ => alpha}/integration/kubelet_config_test.go (100%) rename test/suites/{ => alpha}/integration/lease_garbagecollection_test.go (100%) rename test/suites/{ => alpha}/integration/scheduling_test.go (100%) rename test/suites/{ => alpha}/integration/security_group_test.go (100%) rename test/suites/{ => alpha}/integration/storage_test.go (100%) rename test/suites/{ => alpha}/integration/subnet_test.go (100%) rename test/suites/{ => alpha}/integration/suite_test.go (96%) rename test/suites/{ => alpha}/integration/tags_test.go (100%) rename test/suites/{ => alpha}/integration/termination_test.go (100%) rename test/suites/{ => alpha}/integration/testdata/al2_no_mime_userdata_input.sh (100%) rename test/suites/{ => alpha}/integration/testdata/al2_userdata_input.sh (100%) rename test/suites/{ => alpha}/integration/testdata/amd_driver_input.sh (100%) rename test/suites/{ => alpha}/integration/testdata/br_userdata_input.sh (100%) rename test/suites/{ => alpha}/integration/testdata/windows_userdata_input.ps1 (100%) rename test/suites/{ => alpha}/integration/webhook_test.go (100%) rename test/suites/{ => alpha}/interruption/suite_test.go (99%) rename test/suites/{ => alpha}/ipv6/suite_test.go (99%) rename test/suites/{ => alpha}/machine/garbage_collection_test.go (100%) rename test/suites/{ => alpha}/machine/link_test.go (100%) rename test/suites/{ => alpha}/machine/machine_test.go (100%) rename test/suites/{ => alpha}/machine/suite_test.go (97%) rename test/suites/{ => alpha}/machine/testdata/al2_userdata_custom_labels_input.sh (100%) rename test/suites/{ => alpha}/machine/testdata/al2_userdata_input.sh (100%) rename test/suites/{ => alpha}/scale/deprovisioning_test.go (100%) rename test/suites/{ => alpha}/scale/provisioning_test.go (100%) rename test/suites/{ => alpha}/scale/suite_test.go (97%) rename test/suites/{ => alpha}/utilization/suite_test.go (98%) create mode 100644 test/suites/beta/drift/suite_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index 6ed99e570507..da5f528af31f 100644 --- a/.github/workflows/e2e-matrix.yaml +++ 
b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Integration, Machine, Consolidation, Utilization, Interruption, Drift, Expiration, Chaos, IPv6] + suite: [Beta/Drift, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 6f36ed52974a..9e02a888e3b9 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -15,16 +15,17 @@ on: type: choice required: true options: - - Integration - - Machine - - Consolidation - - Utilization - - Interruption - - Drift - - Expiration - - Chaos - - IPv6 - - Scale + - Beta/Drift + - Alpha/Integration + - Alpha/Machine + - Alpha/Consolidation + - Alpha/Utilization + - Alpha/Interruption + - Alpha/Drift + - Alpha/Expiration + - Alpha/Chaos + - Alpha/IPv6 + - Alpha/Scale k8s_version: type: choice options: @@ -107,7 +108,7 @@ jobs: cluster_name: ${{ env.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} eksctl_version: ${{ inputs.eksctl_version }} - ip_family: ${{ inputs.suite == 'IPv6' && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 + ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 git_ref: ${{ inputs.git_ref }} - name: install prometheus uses: ./.github/actions/e2e/install-prometheus diff --git a/test/pkg/debug/monitor.go b/test/pkg/debug/monitor.go index aa4a8f4b1689..9ce77d4f20e2 100644 --- a/test/pkg/debug/monitor.go +++ b/test/pkg/debug/monitor.go @@ -83,6 +83,7 @@ func (m *Monitor) Stop() { func newControllers(kubeClient client.Client) []controller.Controller { return []controller.Controller{ NewMachineController(kubeClient), + NewNodeClaimController(kubeClient), NewNodeController(kubeClient), NewPodController(kubeClient), } diff --git a/test/pkg/debug/nodeclaim.go b/test/pkg/debug/nodeclaim.go new file mode 100644 index 000000000000..0adf699a62d5 --- /dev/null +++ b/test/pkg/debug/nodeclaim.go @@ -0,0 +1,82 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package debug + +import ( + "context" + "fmt" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + corecontroller "github.com/aws/karpenter-core/pkg/operator/controller" +) + +type NodeClaimController struct { + kubeClient client.Client +} + +func NewNodeClaimController(kubeClient client.Client) *NodeClaimController { + return &NodeClaimController{ + kubeClient: kubeClient, + } +} + +func (c *NodeClaimController) Name() string { + return "nodeclaim" +} + +func (c *NodeClaimController) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + nc := &corev1beta1.NodeClaim{} + if err := c.kubeClient.Get(ctx, req.NamespacedName, nc); err != nil { + if errors.IsNotFound(err) { + fmt.Printf("[DELETED %s] NODECLAIM %s\n", time.Now().Format(time.RFC3339), req.NamespacedName.String()) + } + return reconcile.Result{}, client.IgnoreNotFound(err) + } + fmt.Printf("[CREATED/UPDATED %s] NODECLAIM %s %s\n", time.Now().Format(time.RFC3339), req.NamespacedName.Name, c.GetInfo(nc)) + return reconcile.Result{}, nil +} + +func (c *NodeClaimController) GetInfo(nc *corev1beta1.NodeClaim) string { + return fmt.Sprintf("ready=%t launched=%t registered=%t initialized=%t", + nc.StatusConditions().IsHappy(), + nc.StatusConditions().GetCondition(corev1beta1.Launched).IsTrue(), + nc.StatusConditions().GetCondition(corev1beta1.Registered).IsTrue(), + nc.StatusConditions().GetCondition(corev1beta1.Initialized).IsTrue(), + ) +} + +func (c *NodeClaimController) Builder(_ context.Context, m manager.Manager) corecontroller.Builder { + return corecontroller.Adapt(controllerruntime. + NewControllerManagedBy(m). + For(&corev1beta1.NodeClaim{}). + WithEventFilter(predicate.Funcs{ + UpdateFunc: func(e event.UpdateEvent) bool { + oldNodeClaim := e.ObjectOld.(*corev1beta1.NodeClaim) + newNodeClaim := e.ObjectNew.(*corev1beta1.NodeClaim) + return c.GetInfo(oldNodeClaim) != c.GetInfo(newNodeClaim) + }, + }). + WithOptions(controller.Options{MaxConcurrentReconciles: 10})) +} diff --git a/test/pkg/environment/aws/setup.go b/test/pkg/environment/aws/setup.go index 990b648610e2..5e8bb4d21a7e 100644 --- a/test/pkg/environment/aws/setup.go +++ b/test/pkg/environment/aws/setup.go @@ -20,6 +20,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter/pkg/apis/v1alpha1" + "github.com/aws/karpenter/pkg/apis/v1beta1" ) var persistedSettings []v1.EnvVar @@ -28,6 +29,7 @@ var persistedSettingsLegacy = &v1.ConfigMap{} var ( CleanableObjects = []client.Object{ &v1alpha1.AWSNodeTemplate{}, + &v1beta1.EC2NodeClass{}, } ) @@ -38,8 +40,8 @@ func (env *Environment) BeforeEach() { } func (env *Environment) Cleanup() { - env.Environment.CleanupObjects(CleanableObjects...) env.Environment.Cleanup() + env.Environment.CleanupObjects(CleanableObjects...) 
} func (env *Environment) AfterEach() { diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index 7d4d932849ed..b62476485e99 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -40,6 +40,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" pscheduling "github.com/aws/karpenter-core/pkg/controllers/provisioning/scheduling" "github.com/aws/karpenter-core/pkg/scheduling" "github.com/aws/karpenter-core/pkg/test" @@ -583,6 +584,29 @@ func (env *Environment) EventuallyExpectMachinesReady(machines ...*v1alpha5.Mach }).Should(Succeed()) } +func (env *Environment) EventuallyExpectCreatedNodeClaimCount(comparator string, count int) []*corev1beta1.NodeClaim { + GinkgoHelper() + By(fmt.Sprintf("waiting for created nodeclaims to be %s to %d", comparator, count)) + nodeClaimList := &corev1beta1.NodeClaimList{} + Eventually(func(g Gomega) { + g.Expect(env.Client.List(env.Context, nodeClaimList)).To(Succeed()) + g.Expect(len(nodeClaimList.Items)).To(BeNumerically(comparator, count)) + }).Should(Succeed()) + return lo.Map(nodeClaimList.Items, func(nc corev1beta1.NodeClaim, _ int) *corev1beta1.NodeClaim { + return &nc + }) +} + +func (env *Environment) EventuallyExpectNodeClaimsReady(nodeClaims ...*corev1beta1.NodeClaim) { + Eventually(func(g Gomega) { + for _, nc := range nodeClaims { + temp := &corev1beta1.NodeClaim{} + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nc), temp)).Should(Succeed()) + g.Expect(temp.StatusConditions().IsHappy()).To(BeTrue()) + } + }).Should(Succeed()) +} + func (env *Environment) GetNode(nodeName string) v1.Node { GinkgoHelper() var node v1.Node diff --git a/test/pkg/environment/common/setup.go b/test/pkg/environment/common/setup.go index b877db6e3c78..e4c371fe71f9 100644 --- a/test/pkg/environment/common/setup.go +++ b/test/pkg/environment/common/setup.go @@ -34,9 +34,12 @@ import ( "github.com/aws/karpenter-core/pkg/apis" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter-core/pkg/test" "github.com/aws/karpenter-core/pkg/utils/pod" + "github.com/aws/karpenter/pkg/apis/v1alpha1" + "github.com/aws/karpenter/pkg/apis/v1beta1" "github.com/aws/karpenter/test/pkg/debug" ) @@ -50,10 +53,12 @@ var ( &v1.PersistentVolume{}, &storagev1.StorageClass{}, &v1alpha5.Provisioner{}, + &corev1beta1.NodePool{}, &v1.LimitRange{}, &schedulingv1.PriorityClass{}, &v1.Node{}, &v1alpha5.Machine{}, + &corev1beta1.NodeClaim{}, } ) @@ -65,9 +70,6 @@ func (env *Environment) BeforeEach() { // Expect this cluster to be clean for test runs to execute successfully env.ExpectCleanCluster() - var provisioners v1alpha5.ProvisionerList - Expect(env.Client.List(env.Context, &provisioners)).To(Succeed()) - Expect(provisioners.Items).To(HaveLen(0), "expected no provisioners to exist") env.Monitor.Reset() env.StartingNodeCount = env.Monitor.NodeCountAtReset() } @@ -88,6 +90,13 @@ func (env *Environment) ExpectCleanCluster() { Expect(pods.Items[i].Namespace).ToNot(Equal("default"), fmt.Sprintf("expected no pods in the `default` namespace, found %s/%s", pods.Items[i].Namespace, pods.Items[i].Name)) } + for _, obj := range []client.Object{&v1alpha5.Provisioner{}, &v1alpha1.AWSNodeTemplate{}, &corev1beta1.NodePool{}, 
&v1beta1.EC2NodeClass{}} { + metaList := &metav1.PartialObjectMetadataList{} + gvk := lo.Must(apiutil.GVKForObject(obj, env.Client.Scheme())) + metaList.SetGroupVersionKind(gvk) + Expect(env.Client.List(env.Context, metaList, client.Limit(1))).To(Succeed()) + Expect(metaList.Items).To(HaveLen(0), fmt.Sprintf("expected no %s to exist", gvk.Kind)) + } } func (env *Environment) Cleanup() { diff --git a/test/suites/chaos/suite_test.go b/test/suites/alpha/chaos/suite_test.go similarity index 99% rename from test/suites/chaos/suite_test.go rename to test/suites/alpha/chaos/suite_test.go index b2d2dedb01b5..0d8364d42f6d 100644 --- a/test/suites/chaos/suite_test.go +++ b/test/suites/alpha/chaos/suite_test.go @@ -54,7 +54,7 @@ func TestChaos(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Chaos") + RunSpecs(t, "Alpha/Chaos") } var _ = BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/consolidation/suite_test.go b/test/suites/alpha/consolidation/suite_test.go similarity index 99% rename from test/suites/consolidation/suite_test.go rename to test/suites/alpha/consolidation/suite_test.go index 6be4afe97e3f..aa0c98084be7 100644 --- a/test/suites/consolidation/suite_test.go +++ b/test/suites/alpha/consolidation/suite_test.go @@ -49,7 +49,7 @@ func TestConsolidation(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Consolidation") + RunSpecs(t, "Alpha/Consolidation") } var _ = BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/drift/suite_test.go b/test/suites/alpha/drift/suite_test.go similarity index 99% rename from test/suites/drift/suite_test.go rename to test/suites/alpha/drift/suite_test.go index 3bf0b67e34e0..d7fae9c7acb0 100644 --- a/test/suites/drift/suite_test.go +++ b/test/suites/alpha/drift/suite_test.go @@ -55,7 +55,7 @@ func TestDrift(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Drift") + RunSpecs(t, "Alpha/Drift") } var _ = BeforeEach(func() { diff --git a/test/suites/expiration/expiration_test.go b/test/suites/alpha/expiration/expiration_test.go similarity index 99% rename from test/suites/expiration/expiration_test.go rename to test/suites/alpha/expiration/expiration_test.go index 75a31c5f2a94..024a49a81df4 100644 --- a/test/suites/expiration/expiration_test.go +++ b/test/suites/alpha/expiration/expiration_test.go @@ -51,7 +51,7 @@ func TestExpiration(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Expiration") + RunSpecs(t, "Alpha/Expiration") } var _ = BeforeEach(func() { diff --git a/test/suites/integration/ami_test.go b/test/suites/alpha/integration/ami_test.go similarity index 100% rename from test/suites/integration/ami_test.go rename to test/suites/alpha/integration/ami_test.go diff --git a/test/suites/integration/aws_metadata_test.go b/test/suites/alpha/integration/aws_metadata_test.go similarity index 100% rename from test/suites/integration/aws_metadata_test.go rename to test/suites/alpha/integration/aws_metadata_test.go diff --git a/test/suites/integration/backwards_compatability_test.go b/test/suites/alpha/integration/backwards_compatability_test.go similarity index 100% rename from test/suites/integration/backwards_compatability_test.go rename to test/suites/alpha/integration/backwards_compatability_test.go diff --git a/test/suites/integration/block_device_mappings_test.go b/test/suites/alpha/integration/block_device_mappings_test.go similarity index 100% rename from test/suites/integration/block_device_mappings_test.go rename to 
test/suites/alpha/integration/block_device_mappings_test.go diff --git a/test/suites/integration/cni_test.go b/test/suites/alpha/integration/cni_test.go similarity index 100% rename from test/suites/integration/cni_test.go rename to test/suites/alpha/integration/cni_test.go diff --git a/test/suites/integration/daemonset_test.go b/test/suites/alpha/integration/daemonset_test.go similarity index 100% rename from test/suites/integration/daemonset_test.go rename to test/suites/alpha/integration/daemonset_test.go diff --git a/test/suites/integration/emptiness_test.go b/test/suites/alpha/integration/emptiness_test.go similarity index 100% rename from test/suites/integration/emptiness_test.go rename to test/suites/alpha/integration/emptiness_test.go diff --git a/test/suites/integration/extended_resources_test.go b/test/suites/alpha/integration/extended_resources_test.go similarity index 100% rename from test/suites/integration/extended_resources_test.go rename to test/suites/alpha/integration/extended_resources_test.go diff --git a/test/suites/integration/hash_test.go b/test/suites/alpha/integration/hash_test.go similarity index 100% rename from test/suites/integration/hash_test.go rename to test/suites/alpha/integration/hash_test.go diff --git a/test/suites/integration/instance_profile_test.go b/test/suites/alpha/integration/instance_profile_test.go similarity index 100% rename from test/suites/integration/instance_profile_test.go rename to test/suites/alpha/integration/instance_profile_test.go diff --git a/test/suites/integration/kubelet_config_test.go b/test/suites/alpha/integration/kubelet_config_test.go similarity index 100% rename from test/suites/integration/kubelet_config_test.go rename to test/suites/alpha/integration/kubelet_config_test.go diff --git a/test/suites/integration/lease_garbagecollection_test.go b/test/suites/alpha/integration/lease_garbagecollection_test.go similarity index 100% rename from test/suites/integration/lease_garbagecollection_test.go rename to test/suites/alpha/integration/lease_garbagecollection_test.go diff --git a/test/suites/integration/scheduling_test.go b/test/suites/alpha/integration/scheduling_test.go similarity index 100% rename from test/suites/integration/scheduling_test.go rename to test/suites/alpha/integration/scheduling_test.go diff --git a/test/suites/integration/security_group_test.go b/test/suites/alpha/integration/security_group_test.go similarity index 100% rename from test/suites/integration/security_group_test.go rename to test/suites/alpha/integration/security_group_test.go diff --git a/test/suites/integration/storage_test.go b/test/suites/alpha/integration/storage_test.go similarity index 100% rename from test/suites/integration/storage_test.go rename to test/suites/alpha/integration/storage_test.go diff --git a/test/suites/integration/subnet_test.go b/test/suites/alpha/integration/subnet_test.go similarity index 100% rename from test/suites/integration/subnet_test.go rename to test/suites/alpha/integration/subnet_test.go diff --git a/test/suites/integration/suite_test.go b/test/suites/alpha/integration/suite_test.go similarity index 96% rename from test/suites/integration/suite_test.go rename to test/suites/alpha/integration/suite_test.go index 8dbd82c1c59a..924c5ca2ee3b 100644 --- a/test/suites/integration/suite_test.go +++ b/test/suites/alpha/integration/suite_test.go @@ -33,7 +33,7 @@ func TestIntegration(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Integration") + RunSpecs(t, "Alpha/Integration") } var _ = 
BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/integration/tags_test.go b/test/suites/alpha/integration/tags_test.go similarity index 100% rename from test/suites/integration/tags_test.go rename to test/suites/alpha/integration/tags_test.go diff --git a/test/suites/integration/termination_test.go b/test/suites/alpha/integration/termination_test.go similarity index 100% rename from test/suites/integration/termination_test.go rename to test/suites/alpha/integration/termination_test.go diff --git a/test/suites/integration/testdata/al2_no_mime_userdata_input.sh b/test/suites/alpha/integration/testdata/al2_no_mime_userdata_input.sh similarity index 100% rename from test/suites/integration/testdata/al2_no_mime_userdata_input.sh rename to test/suites/alpha/integration/testdata/al2_no_mime_userdata_input.sh diff --git a/test/suites/integration/testdata/al2_userdata_input.sh b/test/suites/alpha/integration/testdata/al2_userdata_input.sh similarity index 100% rename from test/suites/integration/testdata/al2_userdata_input.sh rename to test/suites/alpha/integration/testdata/al2_userdata_input.sh diff --git a/test/suites/integration/testdata/amd_driver_input.sh b/test/suites/alpha/integration/testdata/amd_driver_input.sh similarity index 100% rename from test/suites/integration/testdata/amd_driver_input.sh rename to test/suites/alpha/integration/testdata/amd_driver_input.sh diff --git a/test/suites/integration/testdata/br_userdata_input.sh b/test/suites/alpha/integration/testdata/br_userdata_input.sh similarity index 100% rename from test/suites/integration/testdata/br_userdata_input.sh rename to test/suites/alpha/integration/testdata/br_userdata_input.sh diff --git a/test/suites/integration/testdata/windows_userdata_input.ps1 b/test/suites/alpha/integration/testdata/windows_userdata_input.ps1 similarity index 100% rename from test/suites/integration/testdata/windows_userdata_input.ps1 rename to test/suites/alpha/integration/testdata/windows_userdata_input.ps1 diff --git a/test/suites/integration/webhook_test.go b/test/suites/alpha/integration/webhook_test.go similarity index 100% rename from test/suites/integration/webhook_test.go rename to test/suites/alpha/integration/webhook_test.go diff --git a/test/suites/interruption/suite_test.go b/test/suites/alpha/interruption/suite_test.go similarity index 99% rename from test/suites/interruption/suite_test.go rename to test/suites/alpha/interruption/suite_test.go index 0e1fc563efd5..eee4796fcd6f 100644 --- a/test/suites/interruption/suite_test.go +++ b/test/suites/alpha/interruption/suite_test.go @@ -50,7 +50,7 @@ func TestInterruption(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Interruption") + RunSpecs(t, "Alpha/Interruption") } var _ = BeforeEach(func() { diff --git a/test/suites/ipv6/suite_test.go b/test/suites/alpha/ipv6/suite_test.go similarity index 99% rename from test/suites/ipv6/suite_test.go rename to test/suites/alpha/ipv6/suite_test.go index 7e6bad6f0e95..77fc301fb840 100644 --- a/test/suites/ipv6/suite_test.go +++ b/test/suites/alpha/ipv6/suite_test.go @@ -40,7 +40,7 @@ func TestIPv6(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "IPv6") + RunSpecs(t, "Alpha/IPv6") } var _ = BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/machine/garbage_collection_test.go b/test/suites/alpha/machine/garbage_collection_test.go similarity index 100% rename from test/suites/machine/garbage_collection_test.go rename to test/suites/alpha/machine/garbage_collection_test.go diff --git 
a/test/suites/machine/link_test.go b/test/suites/alpha/machine/link_test.go similarity index 100% rename from test/suites/machine/link_test.go rename to test/suites/alpha/machine/link_test.go diff --git a/test/suites/machine/machine_test.go b/test/suites/alpha/machine/machine_test.go similarity index 100% rename from test/suites/machine/machine_test.go rename to test/suites/alpha/machine/machine_test.go diff --git a/test/suites/machine/suite_test.go b/test/suites/alpha/machine/suite_test.go similarity index 97% rename from test/suites/machine/suite_test.go rename to test/suites/alpha/machine/suite_test.go index cd77ea4f7d07..078d2280d216 100644 --- a/test/suites/machine/suite_test.go +++ b/test/suites/alpha/machine/suite_test.go @@ -33,7 +33,7 @@ func TestMachine(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Machine") + RunSpecs(t, "Alpha/Machine") } var _ = BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/machine/testdata/al2_userdata_custom_labels_input.sh b/test/suites/alpha/machine/testdata/al2_userdata_custom_labels_input.sh similarity index 100% rename from test/suites/machine/testdata/al2_userdata_custom_labels_input.sh rename to test/suites/alpha/machine/testdata/al2_userdata_custom_labels_input.sh diff --git a/test/suites/machine/testdata/al2_userdata_input.sh b/test/suites/alpha/machine/testdata/al2_userdata_input.sh similarity index 100% rename from test/suites/machine/testdata/al2_userdata_input.sh rename to test/suites/alpha/machine/testdata/al2_userdata_input.sh diff --git a/test/suites/scale/deprovisioning_test.go b/test/suites/alpha/scale/deprovisioning_test.go similarity index 100% rename from test/suites/scale/deprovisioning_test.go rename to test/suites/alpha/scale/deprovisioning_test.go diff --git a/test/suites/scale/provisioning_test.go b/test/suites/alpha/scale/provisioning_test.go similarity index 100% rename from test/suites/scale/provisioning_test.go rename to test/suites/alpha/scale/provisioning_test.go diff --git a/test/suites/scale/suite_test.go b/test/suites/alpha/scale/suite_test.go similarity index 97% rename from test/suites/scale/suite_test.go rename to test/suites/alpha/scale/suite_test.go index 8050d43cc98f..0bb04ea2a9de 100644 --- a/test/suites/scale/suite_test.go +++ b/test/suites/alpha/scale/suite_test.go @@ -35,7 +35,7 @@ func TestScale(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Scale") + RunSpecs(t, "Alpha/Scale") } var _ = BeforeEach(func() { diff --git a/test/suites/utilization/suite_test.go b/test/suites/alpha/utilization/suite_test.go similarity index 98% rename from test/suites/utilization/suite_test.go rename to test/suites/alpha/utilization/suite_test.go index 9c09f49b66b4..bfaf8524a1f1 100644 --- a/test/suites/utilization/suite_test.go +++ b/test/suites/alpha/utilization/suite_test.go @@ -42,7 +42,7 @@ func TestUtilization(t *testing.T) { AfterSuite(func() { env.Stop() }) - RunSpecs(t, "Utilization") + RunSpecs(t, "Alpha/Utilization") } var _ = BeforeEach(func() { env.BeforeEach() }) diff --git a/test/suites/beta/drift/suite_test.go b/test/suites/beta/drift/suite_test.go new file mode 100644 index 000000000000..ef0303f35b38 --- /dev/null +++ b/test/suites/beta/drift/suite_test.go @@ -0,0 +1,560 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package drift_test + +import ( + "fmt" + "sort" + "strings" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + + awssdk "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/eks" + "github.com/aws/aws-sdk-go/service/iam" + "github.com/aws/aws-sdk-go/service/ssm" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + awstest "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var customAMI string + +func TestDrift(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/Drift") +} + +var _ = BeforeEach(func() { + env.BeforeEach() +}) + +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Beta/Drift", Label("AWS"), func() { + var pod *v1.Pod + var nodeClass *v1beta1.EC2NodeClass + var nodePool *corev1beta1.NodePool + BeforeEach(func() { + customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + nodeClass = awstest.EC2NodeClass(v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }}) + nodePool = test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{{Key: corev1beta1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{corev1beta1.CapacityTypeOnDemand}}}, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }) + // Add a do-not-disrupt pod so that we can check node metadata before we disrupt + pod = test.Pod(test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + corev1beta1.DoNotDisruptAnnotationKey: "true", + }, + }, + }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"}) + env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) + }) + It("should disrupt nodes that have drifted due to AMIs", func() { + // choose an old static image + parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: 
awssdk.String("/aws/service/eks/optimized-ami/1.23/amazon-linux-2/amazon-eks-node-1.23-v20230322/image_id"), + }) + Expect(err).To(BeNil()) + oldCustomAMI := *parameter.Parameter.Value + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: oldCustomAMI}} + nodeClass.Spec.UserData = awssdk.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectNodeCount("==", 1)[0] + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + env.ExpectCreatedOrUpdated(nodeClass) + + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, nodeClaim, node) + }) + It("should not disrupt nodes that have drifted without the featureGate enabled", func() { + env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"}) + env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=false"}) + // choose an old static image + parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: awssdk.String("/aws/service/eks/optimized-ami/1.23/amazon-linux-2/amazon-eks-node-1.23-v20230322/image_id"), + }) + Expect(err).To(BeNil()) + oldCustomAMI := *parameter.Parameter.Value + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: oldCustomAMI}} + nodeClass.Spec.UserData = awssdk.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.CreatedNodes()[0] + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + env.ExpectUpdated(nodeClass) + + // We should consistently get the same node existing for a minute + Consistently(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), &v1.Node{})).To(Succeed()) + }).WithTimeout(time.Minute).Should(Succeed()) + }) + It("should disrupt nodes that have drifted due to securitygroup", func() { + By("getting the cluster vpc id") + output, err := env.EKSAPI.DescribeCluster(&eks.DescribeClusterInput{Name: awssdk.String(env.ClusterName)}) + Expect(err).To(BeNil()) + + By("creating new security group") + createSecurityGroup := &ec2.CreateSecurityGroupInput{ + GroupName: awssdk.String("security-group-drift"), + Description: awssdk.String("End-to-end Drift Test, should delete after drift test is completed"), + VpcId: output.Cluster.ResourcesVpcConfig.VpcId, + TagSpecifications: []*ec2.TagSpecification{ + { + ResourceType: awssdk.String("security-group"), + Tags: []*ec2.Tag{ + { + Key: awssdk.String("karpenter.sh/discovery"), + Value: awssdk.String(env.ClusterName), + }, + { + Key: awssdk.String(test.DiscoveryLabel), + Value: awssdk.String(env.ClusterName), + }, + { + Key: awssdk.String("creation-date"), + Value: 
awssdk.String(time.Now().Format(time.RFC3339)), + }, + }, + }, + }, + } + _, _ = env.EC2API.CreateSecurityGroup(createSecurityGroup) + + By("looking for security groups") + var securitygroups []aws.SecurityGroup + var testSecurityGroup aws.SecurityGroup + Eventually(func(g Gomega) { + securitygroups = env.GetSecurityGroups(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + testSecurityGroup, _ = lo.Find(securitygroups, func(sg aws.SecurityGroup) bool { + return awssdk.StringValue(sg.GroupName) == "security-group-drift" + }) + g.Expect(testSecurityGroup).ToNot(BeNil()) + }).Should(Succeed()) + + By("creating a new provider with the new securitygroup") + awsIDs := lo.FilterMap(securitygroups, func(sg aws.SecurityGroup, _ int) (string, bool) { + if awssdk.StringValue(sg.GroupId) != awssdk.StringValue(testSecurityGroup.GroupId) { + return awssdk.StringValue(sg.GroupId), true + } + return "", false + }) + sgTerms := []v1beta1.SecurityGroupSelectorTerm{{ID: awssdk.StringValue(testSecurityGroup.GroupId)}} + for _, id := range awsIDs { + sgTerms = append(sgTerms, v1beta1.SecurityGroupSelectorTerm{ID: id}) + } + nodeClass.Spec.SecurityGroupSelectorTerms = sgTerms + + env.ExpectCreated(pod, nodeClass, nodePool) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthy(pod) + + sgTerms = lo.Reject(sgTerms, func(t v1beta1.SecurityGroupSelectorTerm, _ int) bool { + return t.ID == awssdk.StringValue(testSecurityGroup.GroupId) + }) + nodeClass.Spec.SecurityGroupSelectorTerms = sgTerms + env.ExpectCreatedOrUpdated(nodeClass) + + By("validating the drifted status condition has propagated") + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, nodeClaim, node) + }) + It("should disrupt nodes that have drifted due to subnets", func() { + subnets := env.GetSubnetNameAndIds(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(len(subnets)).To(BeNumerically(">", 1)) + + nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{{ID: subnets[0].ID}} + + env.ExpectCreated(pod, nodeClass, nodePool) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthy(pod) + + nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{{ID: subnets[1].ID}} + env.ExpectCreatedOrUpdated(nodeClass) + + By("validating the drifted status condition has propagated") + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, node) + }) + DescribeTable("NodePool Drift", func(fieldName string, nodePoolOption corev1beta1.NodeClaimTemplate) { + updatedNodePool := test.NodePool( + 
corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }, + corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: nodePoolOption, + }, + }, + ) + updatedNodePool.ObjectMeta = nodePool.ObjectMeta + + env.ExpectCreated(pod, nodeClass, nodePool) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthy(pod) + + env.ExpectCreatedOrUpdated(updatedNodePool) + + By("validating the drifted status condition has propagated") + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + + // Nodes will need to have the start-up taint removed before the node can be considered as initialized + if fieldName == "Start-up Taint" { + nodes := env.EventuallyExpectCreatedNodeCount("==", 2) + sort.Slice(nodes, func(i int, j int) bool { + return nodes[i].CreationTimestamp.Before(&nodes[j].CreationTimestamp) + }) + nodeTwo := nodes[1] + // Remove the startup taints from the new nodes to initialize them + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeTwo), nodeTwo)).To(Succeed()) + stored := nodeTwo.DeepCopy() + nodeTwo.Spec.Taints = lo.Reject(nodeTwo.Spec.Taints, func(t v1.Taint, _ int) bool { return t.Key == "example.com/another-taint-2" }) + g.Expect(env.Client.Patch(env.Context, nodeTwo, client.MergeFrom(stored))).To(Succeed()) + }).Should(Succeed()) + } + + env.EventuallyExpectNotFound(pod, node) + }, + Entry("Annotation Drift", "Annotation", corev1beta1.NodeClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{"keyAnnotationTest": "valueAnnotationTest"}, + }, + }), + Entry("Labels Drift", "Labels", corev1beta1.NodeClaimTemplate{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"keyLabelTest": "valueLabelTest"}, + }, + }), + Entry("Taints Drift", "Taints", corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Taints: []v1.Taint{{Key: "example.com/another-taint-2", Effect: v1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("KubeletConfiguration Drift", "KubeletConfiguration", corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Kubelet: &corev1beta1.KubeletConfiguration{ + EvictionSoft: map[string]string{"memory.available": "5%"}, + EvictionSoftGracePeriod: map[string]metav1.Duration{"memory.available": {Duration: time.Minute}}, + }, + }, + }), + Entry("Start-up Taints Drift", "Start-up Taint", corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + StartupTaints: []v1.Taint{{Key: "example.com/another-taint-2", Effect: v1.TaintEffectPreferNoSchedule}}, + }, + }), + Entry("NodeRequirement Drift", "NodeRequirement", corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{{Key: corev1beta1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{corev1beta1.CapacityTypeSpot}}}, + }, + }), + ) + DescribeTable("EC2NodeClass Drift", func(fieldName string, nodeClassSpec 
v1beta1.EC2NodeClassSpec) { + updatedNodeClass := awstest.EC2NodeClass(v1beta1.EC2NodeClass{Spec: *nodeClass.Spec.DeepCopy()}, v1beta1.EC2NodeClass{Spec: nodeClassSpec}) + updatedNodeClass.ObjectMeta = nodeClass.ObjectMeta + updatedNodeClass.Annotations = map[string]string{v1beta1.AnnotationNodeClassHash: updatedNodeClass.Hash()} + + env.ExpectCreated(pod, nodeClass, nodePool) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthy(pod) + + env.ExpectCreatedOrUpdated(updatedNodeClass) + + By("validating the drifted status condition has propagated") + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, node) + }, + Entry("UserData Drift", "UserData", v1beta1.EC2NodeClassSpec{UserData: awssdk.String("#!/bin/bash\n/etc/eks/bootstrap.sh")}), + Entry("Tags Drift", "Tags", v1beta1.EC2NodeClassSpec{Tags: map[string]string{"keyTag-test-3": "valueTag-test-3"}}), + Entry("MetadataOptions Drift", "MetadataOptions", v1beta1.EC2NodeClassSpec{MetadataOptions: &v1beta1.MetadataOptions{HTTPTokens: awssdk.String("required"), HTTPPutResponseHopLimit: awssdk.Int64(10)}}), + Entry("BlockDeviceMappings Drift", "BlockDeviceMappings", v1beta1.EC2NodeClassSpec{BlockDeviceMappings: []*v1beta1.BlockDeviceMapping{ + { + DeviceName: awssdk.String("/dev/xvda"), + EBS: &v1beta1.BlockDevice{ + VolumeSize: resource.NewScaledQuantity(20, resource.Giga), + VolumeType: awssdk.String("gp3"), + Encrypted: awssdk.Bool(true), + }, + }}}), + Entry("DetailedMonitoring Drift", "DetailedMonitoring", v1beta1.EC2NodeClassSpec{DetailedMonitoring: awssdk.Bool(true)}), + Entry("AMIFamily Drift", "AMIFamily", v1beta1.EC2NodeClassSpec{AMIFamily: awssdk.String(v1beta1.AMIFamilyBottlerocket)}), + ) + Context("Drift Failure", func() { + It("should not continue to drift if a node never registers", func() { + // launch a new nodeClaim + var numPods int32 = 2 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: 2, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}}, + PodAntiRequirements: []v1.PodAffinityTerm{{ + TopologyKey: v1.LabelHostname, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{"app": "inflate"}, + }}, + }, + }, + }) + env.ExpectCreated(dep, nodeClass, nodePool) + + startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods)) + env.EventuallyExpectCreatedNodeCount("==", int(numPods)) + + // Drift the nodeClaim with bad configuration + parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: awssdk.String("/aws/service/ami-amazon-linux-latest/amzn-ami-hvm-x86_64-ebs"), + }) + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: *parameter.Parameter.Value}} + env.ExpectCreatedOrUpdated(nodeClass) + + // Should see the nodeClaim has drifted + Eventually(func(g Gomega) { + for _, nodeClaim := range startingNodeClaimState { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + 
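+					// The refetched NodeClaim should now carry the Drifted status condition set by the drift controller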
g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue())
+				}
+			}).Should(Succeed())
+
+			// Expect nodes to get cordoned
+			cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+			// Drift should fail and the original node should be uncordoned
+			// TODO: reduce timeouts when disruption waits are factored out
+			Eventually(func(g Gomega) {
+				g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0])).To(Succeed())
+				g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse())
+			}).WithTimeout(11 * time.Minute).Should(Succeed())
+
+			Eventually(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+				g.Expect(nodeClaims.Items).To(HaveLen(int(numPods)))
+			}).WithTimeout(6 * time.Minute).Should(Succeed())
+
+			// Expect that all the NodeClaims that existed on the initial provisioning loop are not removed
+			Consistently(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+
+				startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(nc *corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				nodeClaimUIDs := lo.Map(nodeClaims.Items, func(nc corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+				g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue())
+			}, "2m").Should(Succeed())
+		})
+		It("should not continue to drift if a node registers but never becomes initialized", func() {
+			// launch a new nodeClaim
+			var numPods int32 = 2
+			dep := test.Deployment(test.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: test.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+					PodAntiRequirements: []v1.PodAffinityTerm{{
+						TopologyKey: v1.LabelHostname,
+						LabelSelector: &metav1.LabelSelector{
+							MatchLabels: map[string]string{"app": "inflate"},
+						}},
+					},
+				},
+			})
+			env.ExpectCreated(dep, nodeClass, nodePool)
+
+			startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+			env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+			// Drift the nodeClaim with bad configuration that never initializes
+			nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com/taint", Effect: v1.TaintEffectPreferNoSchedule}}
+			env.ExpectCreatedOrUpdated(nodePool)
+
+			// Should see the nodeClaim has drifted
+			Eventually(func(g Gomega) {
+				for _, nodeClaim := range startingNodeClaimState {
+					g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+					g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue())
+				}
+			}).Should(Succeed())
+
+			// Expect nodes to be cordoned
+			cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+			// Drift should fail and the original node should be uncordoned
+			// TODO: reduce timeouts when disruption waits are factored out
+			Eventually(func(g Gomega) {
+				g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0])).To(Succeed())
+				g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse())
+			}).WithTimeout(12 * time.Minute).Should(Succeed())
+
+			// Expect that the new nodeClaim/node is kept around after the un-cordon
+			nodeList := &v1.NodeList{}
+			Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+			Expect(nodeList.Items).To(HaveLen(int(numPods) + 1))
+
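+			// The NodeClaim backing the never-initialized replacement node should likewise still be present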
+			nodeClaimList := &corev1beta1.NodeClaimList{}
+			Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+			Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1))
+
+			// Expect that all the NodeClaims that existed on the initial provisioning loop are not removed
+			Consistently(func(g Gomega) {
+				nodeClaims := &corev1beta1.NodeClaimList{}
+				g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
+
+				startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(m *corev1beta1.NodeClaim, _ int) types.UID { return m.UID })
+				nodeClaimUIDs := lo.Map(nodeClaims.Items, func(m corev1beta1.NodeClaim, _ int) types.UID { return m.UID })
+				g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue())
+			}, "2m").Should(Succeed())
+		})
+	})
+})
+
+func ExpectInstanceProfileCreated(instanceProfileName *string) {
+	By("creating an instance profile")
+	createInstanceProfile := &iam.CreateInstanceProfileInput{
+		InstanceProfileName: instanceProfileName,
+		Tags: []*iam.Tag{
+			{
+				Key:   awssdk.String(test.DiscoveryLabel),
+				Value: awssdk.String(env.ClusterName),
+			},
+		},
+	}
+	_, err := env.IAMAPI.CreateInstanceProfile(createInstanceProfile)
+	Expect(ignoreAlreadyExists(err)).ToNot(HaveOccurred())
+	By("adding the karpenter role to the new instance profile")
+	addInstanceProfile := &iam.AddRoleToInstanceProfileInput{
+		InstanceProfileName: instanceProfileName,
+		RoleName:            awssdk.String(fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName)),
+	}
+	_, err = env.IAMAPI.AddRoleToInstanceProfile(addInstanceProfile)
+	Expect(ignoreAlreadyContainsRole(err)).ToNot(HaveOccurred())
+}
+
+func ignoreAlreadyExists(err error) error {
+	if err != nil {
+		if strings.Contains(err.Error(), "EntityAlreadyExists") {
+			return nil
+		}
+	}
+	return err
+}
+
+// AddRoleToInstanceProfile surfaces a quota error with this message when the profile
+// already has a role attached, so it is treated the same as "already exists"
+func ignoreAlreadyContainsRole(err error) error {
+	if err != nil {
+		if strings.Contains(err.Error(), "Cannot exceed quota for InstanceSessionsPerInstanceProfile") {
+			return nil
+		}
+	}
+
+	return err
+}

From 24cd55c1250019eabdcdb1cdf5bfbcded30b5884 Mon Sep 17 00:00:00 2001
From: Jonathan Innis
Date: Fri, 20 Oct 2023 16:18:20 -0700
Subject: [PATCH 08/47] docs: Add more detail on removing CRDs and removing the IAM policy (#4871)

---
 .../en/preview/upgrading/upgrade-guide.md     | 43 ++++++++++++++++---
 1 file changed, 36 insertions(+), 7 deletions(-)

diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md
index 9d4d6937393d..54d2e6140d66 100644
--- a/website/content/en/preview/upgrading/upgrade-guide.md
+++ b/website/content/en/preview/upgrading/upgrade-guide.md
@@ -32,7 +32,7 @@ kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef
 kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodeclaims.yaml
 kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml
 ```
-g
+
 ### Upgrading to v0.32.0+

 #### v1beta1 Migration
@@ -102,7 +102,7 @@ Add `~/go/bin` to your $PATH, if you have not already done so.
TEMPOUT=$(mktemp)
     curl -fsSL https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}website/content/en/preview/upgrading/v1beta1-controller-policy.json > ${TEMPOUT}

-    REGION=${AWS_REGION:=$AWS_DEFAULT_REGION}
+    AWS_REGION=${AWS_REGION:=$AWS_DEFAULT_REGION}
     POLICY_DOCUMENT=$(envsubst < ${TEMPOUT})
     POLICY_NAME="KarpenterControllerPolicy-${CLUSTER_NAME}-v1beta1"
     ROLE_NAME="${CLUSTER_NAME}-karpenter"
@@ -119,7 +119,9 @@ Add `~/go/bin` to your $PATH, if you have not already done so.
     ```

     {{% alert title="Note" color="warning" %}}
-     < If you get the error `invalid ownership metadata; label validation error:` while installing the `karpenter-crd` chart from an older version of Karpenter, follow the [Troubleshooting Guide]({{}}) for details on how to resolve these errors.
+
+     If you get the error `invalid ownership metadata; label validation error:` while installing the `karpenter-crd` chart from an older version of Karpenter, follow the [Troubleshooting Guide]({{}}) for details on how to resolve these errors.
+
     {{% /alert %}}

   * As part of the helm chart [karpenter](https://gallery.ecr.aws/karpenter/karpenter) - [source](https://github.com/aws/karpenter/blob/main/charts/karpenter/crds). Helm [does not manage the lifecycle of CRDs using this method](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/); the tool will only install the CRDs during the first installation of the helm chart. Subsequent chart upgrades will not add or remove CRDs, even if the CRDs have changed. When CRDs are changed, we will make a note in the version's upgrade guide. In general, you can reapply the CRDs in the `crds` directory of the Karpenter helm chart:
@@ -194,11 +196,38 @@ Add `~/go/bin` to your $PATH, if you have not already done so.
       - Add the following taint to the old Provisioner: `karpenter.sh/legacy=true:NoSchedule`
       - For all the nodes owned by the Provisioner, delete one at a time as follows: `kubectl delete node <node-name>`

-13. Update workload labels: Old v1alpha labels (`karpenter.sh/do-not-consolidate` and `karpenter.sh/do-not-evict`) are deprecated, but will not be dropped until Karpenter v1. However, you can begin updating those labels at any time with `karpenter.sh/do-not-disrupt`. You should check that there are no more Provisioner, AWSNodeTemplate, or Machine resources on your cluster. at which time you can delete the old CRDs. To validate that there are no more machines, type:
+13. Update workload labels: Old alpha labels (`karpenter.sh/do-not-consolidate` and `karpenter.sh/do-not-evict`) are deprecated, but will not be dropped until Karpenter v1. However, you can begin updating those labels at any time with `karpenter.sh/do-not-disrupt`.

-    ```bash
-    kubectl get machines
-    ```
+14. Check that there are no more Provisioner, AWSNodeTemplate, or Machine resources on your cluster, at which time you can delete the old CRDs. To validate this, run the following commands and ensure that none of them returns any output:
+
+    ```bash
+    kubectl get machines
+    kubectl get awsnodetemplates
+    kubectl get provisioners
+    ```
+
+15. Remove the alpha Karpenter CRDs from the cluster.
+
+    ```bash
+    kubectl delete crd machines.karpenter.sh
+    kubectl delete crd awsnodetemplates.karpenter.k8s.aws
+    kubectl delete crd provisioners.karpenter.sh
+    ```
+
+16. Finally, remove the alpha policy from the controller role; this will remove any remaining permissions from the alpha APIs.
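+
+    Before detaching anything, you can list the policies currently attached to the controller role to confirm what is about to be removed. This is just a sanity check, and it assumes `CLUSTER_NAME` is still exported from the earlier steps:
+
+    ```bash
+    aws iam list-attached-role-policies --role-name "${CLUSTER_NAME}-karpenter" --output table
+    ```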
+    You can orchestrate the removal of this policy with the following command:
+
+    ```bash
+    ROLE_NAME="${CLUSTER_NAME}-karpenter"
+    POLICY_NAME="KarpenterControllerPolicy-${CLUSTER_NAME}"
+    POLICY_ARN=$(aws iam list-policies --query "Policies[?PolicyName=='${POLICY_NAME}'].Arn" --output text)
+    aws iam detach-role-policy --role-name "${ROLE_NAME}" --policy-arn "${POLICY_ARN}"
+    ```
+
+    {{% alert title="Note" color="warning" %}}
+
+    If you are using some IaC for managing your policy documents attached to the controller role, you may want to attach this new beta policy to the same CloudFormation stack. You can do this by removing the old alpha policy, ensuring that the Karpenter controller continues to work with just the beta policy, and then updating the stack to contain the new beta policy rather than having that policy managed separately.
+
+    {{% /alert %}}

 #### Additional Release Notes

From 423fb3fe0a986da6eb762baefedb607da7b84647 Mon Sep 17 00:00:00 2001
From: Nick Tran <10810510+njtran@users.noreply.github.com>
Date: Fri, 20 Oct 2023 16:18:51 -0700
Subject: [PATCH 09/47] docs: change getting started guide to refer to patch version (#4874)

---
 website/config.yaml                                           | 2 +-
 .../getting-started/getting-started-with-karpenter/_index.md  | 2 +-
 .../getting-started/getting-started-with-karpenter/_index.md  | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/website/config.yaml b/website/config.yaml
index 8716703de4cb..62b0da37d6e4 100644
--- a/website/config.yaml
+++ b/website/config.yaml
@@ -66,7 +66,7 @@ params:
       url: 'https://slack.k8s.io/'
       icon: fab fa-slack
       desc: 'Chat with us on Slack in the #aws-provider channel'
-  latest_release_version: v0.31.0
+  latest_release_version: v0.31.1
   versions:
     - v0.31
     - v0.30
diff --git a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md
index 375209b3ca22..8b688212df60 100644
--- a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md
+++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md
@@ -44,7 +44,7 @@ authenticate properly by running `aws sts get-caller-identity`.
 After setting up the tools, set the Karpenter version number:

 ```bash
-export KARPENTER_VERSION=v0.31.0
+export KARPENTER_VERSION=v0.31.1
 ```

 Then set the following environment variable:
diff --git a/website/content/en/v0.31/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/v0.31/getting-started/getting-started-with-karpenter/_index.md
index 375209b3ca22..8b688212df60 100644
--- a/website/content/en/v0.31/getting-started/getting-started-with-karpenter/_index.md
+++ b/website/content/en/v0.31/getting-started/getting-started-with-karpenter/_index.md
@@ -44,7 +44,7 @@
After setting up the tools, set the Karpenter version number: ```bash -export KARPENTER_VERSION=v0.31.0 +export KARPENTER_VERSION=v0.31.1 ``` Then set the following environment variable: From e039c32b35f46409c05fa2157ed10d6b0c878e26 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 16:54:32 -0700 Subject: [PATCH 10/47] test: Fix cluster name containing slash (#4876) --- .github/workflows/e2e.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 9e02a888e3b9..ffe5bdde5ee2 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -96,7 +96,7 @@ jobs: sleep $(( $RANDOM % 300 + 1 )) - name: generate cluster name run: | - CLUSTER_NAME=$(echo ${{ inputs.suite }}-$RANDOM$RANDOM | awk '{print tolower($0)}') + CLUSTER_NAME=$(echo ${{ inputs.suite }}-$RANDOM$RANDOM | awk '{print tolower($0)}' | tr / -) echo "Using cluster name \"$CLUSTER_NAME\"" echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_ENV - name: create eks cluster '${{ env.CLUSTER_NAME }}' From e7090b021b5b3778f1af7489586f69af5f7e5b35 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 17:09:55 -0700 Subject: [PATCH 11/47] chore: Bump `karpenter-core` to latest (#4877) --- go.mod | 2 +- go.sum | 4 ++-- pkg/apis/crds/karpenter.sh_nodeclaims.yaml | 14 +++++++++++ pkg/apis/crds/karpenter.sh_nodepools.yaml | 24 ++++++++++++------- .../launchtemplate/nodeclass_test.go | 2 +- test/suites/beta/drift/suite_test.go | 4 ++-- .../en/preview/upgrading/upgrade-guide.md | 6 ++--- 7 files changed, 39 insertions(+), 17 deletions(-) diff --git a/go.mod b/go.mod index a654b0718615..da744a5d60d3 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.46.0 - github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e + github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index 4de6857477d5..4e834d92c3f0 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.46.0 h1:Igh7W8P+sA6mXJ9yhreOSweefLapcqekhxQlY1llxcM= github.com/aws/aws-sdk-go v1.46.0/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e h1:7heVq5GV1sMgcFlnD6pNElqmbxKMLHX9kaXm9njZC0Y= -github.com/aws/karpenter-core v0.31.1-0.20231020162441-e7fbaa291c4e/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE= +github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604 h1:eQFElFqH3K64na70WZBh6FUFonVRKhtyUptWtpO/JdI= +github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 
3ceff377c01d..77ddfd3bf6f8 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -216,9 +216,15 @@ spec: effect: description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute key: description: Required. The taint key to be applied to a node. type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time @@ -226,6 +232,7 @@ spec: value: description: The taint value corresponding to the taint key. type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ required: - effect - key @@ -239,9 +246,15 @@ spec: effect: description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute key: description: Required. The taint key to be applied to a node. type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time @@ -249,6 +262,7 @@ spec: value: description: The taint value corresponding to the taint key. type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ required: - effect - key diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 5f0083a751ef..086b9e991f7f 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -85,19 +85,13 @@ spec: annotations: additionalProperties: type: string + description: 'Annotations is an unstructured key value map stored with a resource that may be set by external tools to store and retrieve arbitrary metadata. They are not queryable and should be preserved when modifying objects. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations' type: object - finalizers: - items: - type: string - type: array labels: additionalProperties: type: string + description: 'Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels' type: object - name: - type: string - namespace: - type: string type: object spec: description: NodeClaimSpec describes the desired state of the NodeClaim @@ -258,9 +252,15 @@ spec: effect: description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute key: description: Required. The taint key to be applied to a node. 
type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time @@ -268,6 +268,7 @@ spec: value: description: The taint value corresponding to the taint key. type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ required: - effect - key @@ -281,9 +282,15 @@ spec: effect: description: Required. The effect of the taint on pods that do not tolerate the taint. Valid effects are NoSchedule, PreferNoSchedule and NoExecute. type: string + enum: + - NoSchedule + - PreferNoSchedule + - NoExecute key: description: Required. The taint key to be applied to a node. type: string + minLength: 1 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ timeAdded: description: TimeAdded represents the time at which the taint was added. It is only written for NoExecute taints. format: date-time @@ -291,6 +298,7 @@ spec: value: description: The taint value corresponding to the taint key. type: string + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ required: - effect - key diff --git a/pkg/providers/launchtemplate/nodeclass_test.go b/pkg/providers/launchtemplate/nodeclass_test.go index 86a5d9941caf..44fb34b0e737 100644 --- a/pkg/providers/launchtemplate/nodeclass_test.go +++ b/pkg/providers/launchtemplate/nodeclass_test.go @@ -56,7 +56,7 @@ var _ = Describe("EC2NodeClass/LaunchTemplates", func() { nodePool = coretest.NodePool(corev1beta1.NodePool{ Spec: corev1beta1.NodePoolSpec{ Template: corev1beta1.NodeClaimTemplate{ - ObjectMeta: metav1.ObjectMeta{ + ObjectMeta: corev1beta1.ObjectMeta{ // TODO @joinnis: Move this into the coretest.NodePool function Labels: map[string]string{coretest.DiscoveryLabel: "unspecified"}, }, diff --git a/test/suites/beta/drift/suite_test.go b/test/suites/beta/drift/suite_test.go index ef0303f35b38..0d26313f62ae 100644 --- a/test/suites/beta/drift/suite_test.go +++ b/test/suites/beta/drift/suite_test.go @@ -320,12 +320,12 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { env.EventuallyExpectNotFound(pod, node) }, Entry("Annotation Drift", "Annotation", corev1beta1.NodeClaimTemplate{ - ObjectMeta: metav1.ObjectMeta{ + ObjectMeta: corev1beta1.ObjectMeta{ Annotations: map[string]string{"keyAnnotationTest": "valueAnnotationTest"}, }, }), Entry("Labels Drift", "Labels", corev1beta1.NodeClaimTemplate{ - ObjectMeta: metav1.ObjectMeta{ + ObjectMeta: corev1beta1.ObjectMeta{ Labels: map[string]string{"keyLabelTest": "valueLabelTest"}, }, }), diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 54d2e6140d66..5de3c4e81b52 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -223,11 +223,11 @@ Add `~/go/bin` to your $PATH, if you have not already done so. 
aws iam detach-role-policy --role-name "${ROLE_NAME}" --policy-arn "${POLICY_ARN}" ``` - {{% alert title="Note" color="warning" %}} +{{% alert title="Note" color="warning" %}} - If you are using some IaC for managing your policy documents attached to the controller role, you may want to attach this new beta policy to the same CloudFormation stack. You can do this by removing the old alpha policy, ensuring that the Karpenter controller continues to work with just the beta policy, and then updating the stack to contain the new beta policy rather than having that policy managed separately. +If you are using some IaC for managing your policy documents attached to the controller role, you may want to attach this new beta policy to the same CloudFormation stack. You can do this by removing the old alpha policy, ensuring that the Karpenter controller continues to work with just the beta policy, and then updating the stack to contain the new beta policy rather than having that policy managed separately. - {{% /alert %}} +{{% /alert %}} #### Additional Release Notes From 01fb07e03abebd5068f8a37a8a313f119843b7bd Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Fri, 20 Oct 2023 17:34:05 -0700 Subject: [PATCH 12/47] test: Add Integration E2E testing for v1beta1 (#4875) --- .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 1 + test/pkg/environment/common/expectations.go | 25 +- .../alpha/integration/kubelet_config_test.go | 4 +- .../suites/alpha/scale/deprovisioning_test.go | 2 +- test/suites/alpha/scale/provisioning_test.go | 2 +- test/suites/beta/integration/ami_test.go | 345 ++++++++++++++ .../beta/integration/aws_metadata_test.go | 49 ++ .../integration/block_device_mappings_test.go | 57 +++ test/suites/beta/integration/cni_test.go | 90 ++++ .../suites/beta/integration/daemonset_test.go | 136 ++++++ .../suites/beta/integration/emptiness_test.go | 65 +++ .../integration/extended_resources_test.go | 423 ++++++++++++++++++ test/suites/beta/integration/hash_test.go | 53 +++ .../beta/integration/instance_profile_test.go | 58 +++ .../beta/integration/kubelet_config_test.go | 282 ++++++++++++ .../lease_garbagecollection_test.go | 44 ++ .../beta/integration/scheduling_test.go | 388 ++++++++++++++++ .../beta/integration/security_group_test.go | 93 ++++ test/suites/beta/integration/storage_test.go | 119 +++++ test/suites/beta/integration/subnet_test.go | 183 ++++++++ test/suites/beta/integration/suite_test.go | 77 ++++ test/suites/beta/integration/tags_test.go | 110 +++++ .../beta/integration/termination_test.go | 50 +++ .../testdata/al2_no_mime_userdata_input.sh | 2 + .../testdata/al2_userdata_input.sh | 10 + .../integration/testdata/amd_driver_input.sh | 46 ++ .../integration/testdata/br_userdata_input.sh | 4 + .../testdata/windows_userdata_input.ps1 | 1 + .../beta/integration/validation_test.go | 185 ++++++++ 30 files changed, 2900 insertions(+), 6 deletions(-) create mode 100644 test/suites/beta/integration/ami_test.go create mode 100644 test/suites/beta/integration/aws_metadata_test.go create mode 100644 test/suites/beta/integration/block_device_mappings_test.go create mode 100644 test/suites/beta/integration/cni_test.go create mode 100644 test/suites/beta/integration/daemonset_test.go create mode 100644 test/suites/beta/integration/emptiness_test.go create mode 100644 test/suites/beta/integration/extended_resources_test.go create mode 100644 test/suites/beta/integration/hash_test.go create mode 100644 test/suites/beta/integration/instance_profile_test.go create mode 100644 
test/suites/beta/integration/kubelet_config_test.go create mode 100644 test/suites/beta/integration/lease_garbagecollection_test.go create mode 100644 test/suites/beta/integration/scheduling_test.go create mode 100644 test/suites/beta/integration/security_group_test.go create mode 100644 test/suites/beta/integration/storage_test.go create mode 100644 test/suites/beta/integration/subnet_test.go create mode 100644 test/suites/beta/integration/suite_test.go create mode 100644 test/suites/beta/integration/tags_test.go create mode 100644 test/suites/beta/integration/termination_test.go create mode 100644 test/suites/beta/integration/testdata/al2_no_mime_userdata_input.sh create mode 100644 test/suites/beta/integration/testdata/al2_userdata_input.sh create mode 100644 test/suites/beta/integration/testdata/amd_driver_input.sh create mode 100644 test/suites/beta/integration/testdata/br_userdata_input.sh create mode 100644 test/suites/beta/integration/testdata/windows_userdata_input.ps1 create mode 100644 test/suites/beta/integration/validation_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index da5f528af31f..4d2b1bda843b 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Drift, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: [Beta/Integration, Beta/Drift, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index ffe5bdde5ee2..689786ca9250 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -15,6 +15,7 @@ on: type: choice required: true options: + - Beta/Integration - Beta/Drift - Alpha/Integration - Alpha/Machine diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index b62476485e99..8f3844da19b8 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -710,7 +710,9 @@ func (env *Environment) ExpectCABundle() string { return base64.StdEncoding.EncodeToString(transportConfig.TLS.CAData) } -func (env *Environment) GetDaemonSetCount(prov *v1alpha5.Provisioner) int { +func (env *Environment) GetDaemonSetCountLegacy(prov *v1alpha5.Provisioner) int { + GinkgoHelper() + // Performs the same logic as the scheduler to get the number of daemonset // pods that we estimate we will need to schedule as overhead to each node daemonSetList := &appsv1.DaemonSetList{} @@ -728,3 +730,24 @@ func (env *Environment) GetDaemonSetCount(prov *v1alpha5.Provisioner) int { return true }) } + +func (env *Environment) GetDaemonSetCount(np *corev1beta1.NodePool) int { + GinkgoHelper() + + // Performs the same logic as the scheduler to get the number of daemonset + // pods that we estimate we will need to schedule as overhead to each node + daemonSetList := &appsv1.DaemonSetList{} + Expect(env.Client.List(env.Context, daemonSetList)).To(Succeed()) + + return lo.CountBy(daemonSetList.Items, func(d appsv1.DaemonSet) bool { + p := &v1.Pod{Spec: d.Spec.Template.Spec} + nodeTemplate := pscheduling.NewNodeClaimTemplate(np) + if err := scheduling.Taints(nodeTemplate.Spec.Taints).Tolerates(p); err != nil { + return 
false + } + if err := nodeTemplate.Requirements.Compatible(scheduling.NewPodRequirements(p), scheduling.AllowUndefinedWellKnownLabelsV1Beta1); err != nil { + return false + } + return true + }) +} diff --git a/test/suites/alpha/integration/kubelet_config_test.go b/test/suites/alpha/integration/kubelet_config_test.go index aec27cb931c9..5f9afec6a7bc 100644 --- a/test/suites/alpha/integration/kubelet_config_test.go +++ b/test/suites/alpha/integration/kubelet_config_test.go @@ -192,7 +192,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { }) // Get the DS pod count and use it to calculate the DS pod overhead - dsCount := env.GetDaemonSetCount(provisioner) + dsCount := env.GetDaemonSetCountLegacy(provisioner) provisioner.Spec.KubeletConfiguration = &v1alpha5.KubeletConfiguration{ MaxPods: ptr.Int32(1 + int32(dsCount)), } @@ -260,7 +260,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { // 2. If # of DS pods is even, we will have i.e. ceil((4+2)/2) = 3 // Since we restrict node to two cores, we will allow 6 pods. Both nodes will have // 4 DS pods and 2 test pods. - dsCount := env.GetDaemonSetCount(provisioner) + dsCount := env.GetDaemonSetCountLegacy(provisioner) provisioner.Spec.KubeletConfiguration = &v1alpha5.KubeletConfiguration{ PodsPerCore: ptr.Int32(int32(math.Ceil(float64(2+dsCount) / 2))), } diff --git a/test/suites/alpha/scale/deprovisioning_test.go b/test/suites/alpha/scale/deprovisioning_test.go index 473a82621bdf..f80ebcb6b9bd 100644 --- a/test/suites/alpha/scale/deprovisioning_test.go +++ b/test/suites/alpha/scale/deprovisioning_test.go @@ -126,7 +126,7 @@ var _ = Describe("Deprovisioning", Label(debug.NoWatch), Label(debug.NoEvents), } deployment = test.Deployment(deploymentOptions) selector = labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels) - dsCount = env.GetDaemonSetCount(provisioner) + dsCount = env.GetDaemonSetCountLegacy(provisioner) }) AfterEach(func() { diff --git a/test/suites/alpha/scale/provisioning_test.go b/test/suites/alpha/scale/provisioning_test.go index 0ebf0268815b..4c73c6115045 100644 --- a/test/suites/alpha/scale/provisioning_test.go +++ b/test/suites/alpha/scale/provisioning_test.go @@ -86,7 +86,7 @@ var _ = Describe("Provisioning", Label(debug.NoWatch), Label(debug.NoEvents), fu }) selector = labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels) // Get the DS pod count and use it to calculate the DS pod overhead - dsCount = env.GetDaemonSetCount(provisioner) + dsCount = env.GetDaemonSetCountLegacy(provisioner) }) It("should scale successfully on a node-dense scale-up", Label(debug.NoEvents), func(_ context.Context) { // Disable Prefix Delegation for the node-dense scale-up to not exhaust the IPs diff --git a/test/suites/beta/integration/ami_test.go b/test/suites/beta/integration/ami_test.go new file mode 100644 index 000000000000..bc2ddc96fab3 --- /dev/null +++ b/test/suites/beta/integration/ami_test.go @@ -0,0 +1,345 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package integration_test + +import ( + "encoding/base64" + "fmt" + "os" + "strings" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" + "github.com/aws/aws-sdk-go/service/ssm" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + awsenv "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var _ = Describe("AMI", func() { + var customAMI string + BeforeEach(func() { + customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + }) + + It("should use the AMI defined by the AMI Selector", func() { + pod := coretest.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) + }) + It("should use the most recent AMI when discovering multiple", func() { + // choose an old static image + parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: aws.String("/aws/service/eks/optimized-ami/1.23/amazon-linux-2/amazon-eks-node-1.23-v20230322/image_id"), + }) + Expect(err).To(BeNil()) + oldCustomAMI := *parameter.Parameter.Value + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + { + ID: oldCustomAMI, + }, + } + nodeClass.Spec.UserData = aws.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) + }) + It("should support ami selector Name but fail with incorrect owners", func() { + output, err := env.EC2API.DescribeImages(&ec2.DescribeImagesInput{ + ImageIds: []*string{aws.String(customAMI)}, + }) + Expect(err).To(BeNil()) + Expect(output.Images).To(HaveLen(1)) + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Name: *output.Images[0].Name, + Owner: "fakeOwnerValue", + }, + } + nodeClass.Spec.UserData = aws.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.ExpectCreatedNodeCount("==", 0) + Expect(pod.Spec.NodeName).To(Equal("")) + }) + It("should support ami selector Name with default owners", func() { + output, err := env.EC2API.DescribeImages(&ec2.DescribeImagesInput{ + ImageIds: []*string{aws.String(customAMI)}, + }) + Expect(err).To(BeNil()) + Expect(output.Images).To(HaveLen(1)) + + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Name: *output.Images[0].Name, + }, + } + nodeClass.Spec.UserData = aws.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) + }) + It("should support ami selector ids", func() { + nodeClass.Spec.AMIFamily = 
&v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + } + nodeClass.Spec.UserData = aws.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) + }) + + Context("AMIFamily", func() { + It("should provision a node using the AL2 family", func() { + pod := coretest.Pod() + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + }) + It("should provision a node using the Bottlerocket family", func() { + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyBottlerocket + pod := coretest.Pod() + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + }) + It("should provision a node using the Ubuntu family", func() { + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyUbuntu + // TODO: remove requirements after Ubuntu fixes bootstrap script issue w/ + // new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter/issues/4472) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + Values: []string{"m7a", "r7a", "c7a"}, + }, + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c", "m", "r"}, + }, + }...) + pod := coretest.Pod() + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + }) + It("should support Custom AMIFamily with AMI Selectors", func() { + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + } + nodeClass.Spec.UserData = aws.String(fmt.Sprintf("#!/bin/bash\n/etc/eks/bootstrap.sh '%s'", env.ClusterName)) + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) + }) + It("should have the EC2NodeClass status for AMIs using wildcard", func() { + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Name: "*", + }, + } + env.ExpectCreated(nodeClass) + nc := EventuallyExpectAMIsToExist(nodeClass) + Expect(len(nc.Status.AMIs)).To(BeNumerically("<", 10)) + }) + It("should have the EC2NodeClass status for AMIs using tags", func() { + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + } + env.ExpectCreated(nodeClass) + nc := EventuallyExpectAMIsToExist(nodeClass) + Expect(len(nc.Status.AMIs)).To(BeNumerically("==", 1)) + Expect(nc.Status.AMIs[0].ID).To(Equal(customAMI)) + }) + }) + + Context("UserData", func() { + It("should merge UserData contents for AL2 AMIFamily", func() { + content, err := os.ReadFile("testdata/al2_userdata_input.sh") + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.UserData = aws.String(string(content)) + nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}} + nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}} + pod := 
coretest.Pod(coretest.PodOptions{Tolerations: []v1.Toleration{{Key: "example.com", Operator: v1.TolerationOpExists}}}) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + Expect(env.GetNode(pod.Spec.NodeName).Spec.Taints).To(ContainElements( + v1.Taint{Key: "example.com", Value: "value", Effect: "NoExecute"}, + v1.Taint{Key: "example.com", Value: "value", Effect: "NoSchedule"}, + )) + actualUserData, err := base64.StdEncoding.DecodeString(*getInstanceAttribute(pod.Spec.NodeName, "userData").UserData.Value) + Expect(err).ToNot(HaveOccurred()) + // Since the node has joined the cluster, we know our bootstrapping was correct. + // Just verify if the UserData contains our custom content too, rather than doing a byte-wise comparison. + Expect(string(actualUserData)).To(ContainSubstring("Running custom user data script")) + }) + It("should merge non-MIME UserData contents for AL2 AMIFamily", func() { + content, err := os.ReadFile("testdata/al2_no_mime_userdata_input.sh") + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.UserData = aws.String(string(content)) + nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}} + nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}} + pod := coretest.Pod(coretest.PodOptions{Tolerations: []v1.Toleration{{Key: "example.com", Operator: v1.TolerationOpExists}}}) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + Expect(env.GetNode(pod.Spec.NodeName).Spec.Taints).To(ContainElements( + v1.Taint{Key: "example.com", Value: "value", Effect: "NoExecute"}, + v1.Taint{Key: "example.com", Value: "value", Effect: "NoSchedule"}, + )) + actualUserData, err := base64.StdEncoding.DecodeString(*getInstanceAttribute(pod.Spec.NodeName, "userData").UserData.Value) + Expect(err).ToNot(HaveOccurred()) + // Since the node has joined the cluster, we know our bootstrapping was correct. + // Just verify if the UserData contains our custom content too, rather than doing a byte-wise comparison. 
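+			// Karpenter wraps raw (non-MIME) custom user data into a part of the merged MIME multipart document, so a substring check still finds it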
+ Expect(string(actualUserData)).To(ContainSubstring("Running custom user data script")) + }) + It("should merge UserData contents for Bottlerocket AMIFamily", func() { + content, err := os.ReadFile("testdata/br_userdata_input.sh") + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyBottlerocket + nodeClass.Spec.UserData = aws.String(string(content)) + nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}} + nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}} + pod := coretest.Pod(coretest.PodOptions{Tolerations: []v1.Toleration{{Key: "example.com", Operator: v1.TolerationOpExists}}}) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + Expect(env.GetNode(pod.Spec.NodeName).Spec.Taints).To(ContainElements( + v1.Taint{Key: "example.com", Value: "value", Effect: "NoExecute"}, + v1.Taint{Key: "example.com", Value: "value", Effect: "NoSchedule"}, + )) + actualUserData, err := base64.StdEncoding.DecodeString(*getInstanceAttribute(pod.Spec.NodeName, "userData").UserData.Value) + Expect(err).ToNot(HaveOccurred()) + Expect(string(actualUserData)).To(ContainSubstring("kube-api-qps = 30")) + }) + It("should merge UserData contents for Windows AMIFamily", func() { + env.ExpectWindowsIPAMEnabled() + DeferCleanup(func() { + env.ExpectWindowsIPAMDisabled() + }) + + content, err := os.ReadFile("testdata/windows_userdata_input.ps1") + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022 + nodeClass.Spec.UserData = aws.String(string(content)) + nodePool.Spec.Template.Spec.Taints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoExecute"}} + nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}} + + // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Windows)}, + }, + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + Values: []string{"m7a", "r7a", "c7a"}, + }, + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c", "m", "r"}, + }, + }...) 
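+ // The pod below pins the Windows Server 2022 build (10.0.20348) so it can only schedule to the
+ // node launched from the Windows2022 AMI family configured above.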
+ pod := coretest.Pod(coretest.PodOptions{ + Image: awsenv.WindowsDefaultImage, + NodeSelector: map[string]string{ + v1.LabelOSStable: string(v1.Windows), + v1.LabelWindowsBuild: "10.0.20348", + }, + Tolerations: []v1.Toleration{{Key: "example.com", Operator: v1.TolerationOpExists}}, + }) + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthyWithTimeout(time.Minute*15, pod) // Wait 15 minutes because Windows nodes/containers take longer to spin up + Expect(env.GetNode(pod.Spec.NodeName).Spec.Taints).To(ContainElements( + v1.Taint{Key: "example.com", Value: "value", Effect: "NoExecute"}, + v1.Taint{Key: "example.com", Value: "value", Effect: "NoSchedule"}, + )) + actualUserData, err := base64.StdEncoding.DecodeString(*getInstanceAttribute(pod.Spec.NodeName, "userData").UserData.Value) + Expect(err).ToNot(HaveOccurred()) + Expect(string(actualUserData)).To(ContainSubstring("Write-Host \"Running custom user data script\"")) + Expect(string(actualUserData)).To(ContainSubstring("[string]$EKSBootstrapScriptFile = \"$env:ProgramFiles\\Amazon\\EKS\\Start-EKSBootstrap.ps1\"")) + }) + }) +}) + +//nolint:unparam +func getInstanceAttribute(nodeName string, attribute string) *ec2.DescribeInstanceAttributeOutput { + var node v1.Node + Expect(env.Client.Get(env.Context, types.NamespacedName{Name: nodeName}, &node)).To(Succeed()) + providerIDSplit := strings.Split(node.Spec.ProviderID, "/") + instanceID := providerIDSplit[len(providerIDSplit)-1] + instanceAttribute, err := env.EC2API.DescribeInstanceAttribute(&ec2.DescribeInstanceAttributeInput{ + InstanceId: aws.String(instanceID), + Attribute: aws.String(attribute), + }) + Expect(err).ToNot(HaveOccurred()) + return instanceAttribute +} + +func EventuallyExpectAMIsToExist(nodeClass *v1beta1.EC2NodeClass) *v1beta1.EC2NodeClass { + nc := &v1beta1.EC2NodeClass{} + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClass), nc)).To(Succeed()) + g.Expect(nc.Status.AMIs).ToNot(BeNil()) + }).WithTimeout(30 * time.Second).Should(Succeed()) + return nc +} diff --git a/test/suites/beta/integration/aws_metadata_test.go b/test/suites/beta/integration/aws_metadata_test.go new file mode 100644 index 000000000000..30b13299a5a2 --- /dev/null +++ b/test/suites/beta/integration/aws_metadata_test.go @@ -0,0 +1,49 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("MetadataOptions", func() { + It("should use specified metadata options", func() { + nodeClass.Spec.MetadataOptions = &v1beta1.MetadataOptions{ + HTTPEndpoint: aws.String("enabled"), + HTTPProtocolIPv6: aws.String("enabled"), + HTTPPutResponseHopLimit: aws.Int64(1), + HTTPTokens: aws.String("required"), + } + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("MetadataOptions", HaveValue(Equal(ec2.InstanceMetadataOptionsResponse{ + State: aws.String(ec2.InstanceMetadataOptionsStateApplied), + HttpEndpoint: aws.String("enabled"), + HttpProtocolIpv6: aws.String("enabled"), + HttpPutResponseHopLimit: aws.Int64(1), + HttpTokens: aws.String("required"), + InstanceMetadataTags: aws.String("disabled"), + })))) + }) +}) diff --git a/test/suites/beta/integration/block_device_mappings_test.go b/test/suites/beta/integration/block_device_mappings_test.go new file mode 100644 index 000000000000..71697f096c71 --- /dev/null +++ b/test/suites/beta/integration/block_device_mappings_test.go @@ -0,0 +1,57 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "github.com/aws/aws-sdk-go/aws" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter-core/pkg/utils/resources" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("BlockDeviceMappings", func() { + It("should use specified block device mappings", func() { + nodeClass.Spec.BlockDeviceMappings = []*v1beta1.BlockDeviceMapping{ + { + DeviceName: aws.String("/dev/xvda"), + EBS: &v1beta1.BlockDevice{ + VolumeSize: resources.Quantity("10G"), + VolumeType: aws.String("io2"), + IOPS: aws.Int64(1000), + Encrypted: aws.Bool(true), + DeleteOnTermination: aws.Bool(true), + }, + }, + } + pod := test.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + instance := env.GetInstance(pod.Spec.NodeName) + Expect(len(instance.BlockDeviceMappings)).To(Equal(1)) + Expect(instance.BlockDeviceMappings[0]).ToNot(BeNil()) + Expect(instance.BlockDeviceMappings[0]).To(HaveField("DeviceName", HaveValue(Equal("/dev/xvda")))) + Expect(instance.BlockDeviceMappings[0].Ebs).To(HaveField("DeleteOnTermination", HaveValue(BeTrue()))) + volume := env.GetVolume(instance.BlockDeviceMappings[0].Ebs.VolumeId) + Expect(volume).To(HaveField("Encrypted", HaveValue(BeTrue()))) + Expect(volume).To(HaveField("Size", HaveValue(Equal(int64(10))))) // Convert G -> Gib (rounded up) + Expect(volume).To(HaveField("Iops", HaveValue(Equal(int64(1000))))) + Expect(volume).To(HaveField("VolumeType", HaveValue(Equal("io2")))) + }) +}) diff --git a/test/suites/beta/integration/cni_test.go b/test/suites/beta/integration/cni_test.go new file mode 100644 index 000000000000..eb579bd67152 --- /dev/null +++ b/test/suites/beta/integration/cni_test.go @@ -0,0 +1,90 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "strconv" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/types" + + "github.com/aws/karpenter-core/pkg/test" +) + +var _ = Describe("CNITests", func() { + It("should set max pods to 110 when AWSENILimited when AWS_ENI_LIMITED_POD_DENSITY is false", func() { + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.enableENILimitedPodDensity": "false"}) + pod := test.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + var node corev1.Node + Expect(env.Client.Get(env.Context, types.NamespacedName{Name: pod.Spec.NodeName}, &node)).To(Succeed()) + allocatablePods, _ := node.Status.Allocatable.Pods().AsInt64() + Expect(allocatablePods).To(Equal(int64(110))) + }) + It("should set eni-limited maxPods when AWSENILimited when AWS_ENI_LIMITED_POD_DENSITY is true", func() { + pod := test.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + var node corev1.Node + Expect(env.Client.Get(env.Context, types.NamespacedName{Name: pod.Spec.NodeName}, &node)).To(Succeed()) + allocatablePods, _ := node.Status.Allocatable.Pods().AsInt64() + Expect(allocatablePods).To(Equal(eniLimitedPodsFor(node.Labels["node.kubernetes.io/instance-type"]))) + }) + It("should set maxPods when reservedENIs is set", func() { + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.reservedENIs": "1"}) + env.ExpectSettingsOverridden(corev1.EnvVar{Name: "RESERVED_ENIS", Value: "1"}) + pod := test.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + var node corev1.Node + Expect(env.Client.Get(env.Context, types.NamespacedName{Name: pod.Spec.NodeName}, &node)).To(Succeed()) + allocatablePods, _ := node.Status.Allocatable.Pods().AsInt64() + Expect(allocatablePods).To(Equal(reservedENIsFor(node.Labels["node.kubernetes.io/instance-type"]))) + }) +}) + +func eniLimitedPodsFor(instanceType string) int64 { + instance, err := env.EC2API.DescribeInstanceTypes(&ec2.DescribeInstanceTypesInput{ + InstanceTypes: aws.StringSlice([]string{instanceType}), + }) + Expect(err).ToNot(HaveOccurred()) + networkInfo := *instance.InstanceTypes[0].NetworkInfo + return *networkInfo.MaximumNetworkInterfaces*(*networkInfo.Ipv4AddressesPerInterface-1) + 2 +} + +func reservedENIsFor(instanceType string) int64 { + instance, err := env.EC2API.DescribeInstanceTypes(&ec2.DescribeInstanceTypesInput{ + InstanceTypes: aws.StringSlice([]string{instanceType}), + }) + Expect(err).ToNot(HaveOccurred()) + networkInfo := *instance.InstanceTypes[0].NetworkInfo + reservedENIs := 0 + reservedENIsVar, ok := lo.Find(env.ExpectSettings(), func(v corev1.EnvVar) bool { return v.Name == "RESERVED_ENIS" }) + if ok { + reservedENIs, err = strconv.Atoi(reservedENIsVar.Value) + Expect(err).ToNot(HaveOccurred()) + } + return (*networkInfo.MaximumNetworkInterfaces-int64(reservedENIs))*(*networkInfo.Ipv4AddressesPerInterface-1) + 2 +} diff --git a/test/suites/beta/integration/daemonset_test.go b/test/suites/beta/integration/daemonset_test.go new file mode 100644 index 000000000000..3e7206814de0 --- /dev/null +++ b/test/suites/beta/integration/daemonset_test.go @@ -0,0 +1,136 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + schedulingv1 "k8s.io/api/scheduling/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter-core/pkg/test" +) + +var _ = Describe("DaemonSet", func() { + var limitrange *v1.LimitRange + var priorityclass *schedulingv1.PriorityClass + var daemonset *appsv1.DaemonSet + var dep *appsv1.Deployment + + BeforeEach(func() { + nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized + priorityclass = &schedulingv1.PriorityClass{ + ObjectMeta: metav1.ObjectMeta{ + Name: "high-priority-daemonsets", + }, + Value: int32(10000000), + GlobalDefault: false, + Description: "This priority class should be used for daemonsets.", + } + limitrange = &v1.LimitRange{ + ObjectMeta: metav1.ObjectMeta{ + Name: "limitrange", + Namespace: "default", + }, + } + daemonset = test.DaemonSet(test.DaemonSetOptions{ + PodOptions: test.PodOptions{ + ResourceRequirements: v1.ResourceRequirements{Limits: v1.ResourceList{v1.ResourceMemory: resource.MustParse("1Gi")}}, + PriorityClassName: "high-priority-daemonsets", + }, + }) + numPods := 1 + dep = test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceMemory: resource.MustParse("4")}, + }, + }, + }) + }) + It("should account for LimitRange Default on daemonSet pods for resources", func() { + limitrange.Spec.Limits = []v1.LimitRangeItem{ + { + Type: v1.LimitTypeContainer, + Default: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + }, + } + + podSelector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + daemonSetSelector := labels.SelectorFromSet(daemonset.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, limitrange, priorityclass, daemonset, dep) + + // Eventually expect a single node to exist and both the deployment pod and the daemonset pod to schedule to it + Eventually(func(g Gomega) { + nodeList := &v1.NodeList{} + g.Expect(env.Client.List(env, nodeList, client.HasLabels{"testing/cluster"})).To(Succeed()) + g.Expect(nodeList.Items).To(HaveLen(1)) + + deploymentPods := env.Monitor.RunningPods(podSelector) + g.Expect(deploymentPods).To(HaveLen(1)) + + daemonSetPods := env.Monitor.RunningPods(daemonSetSelector) + g.Expect(daemonSetPods).To(HaveLen(1)) + + g.Expect(deploymentPods[0].Spec.NodeName).To(Equal(nodeList.Items[0].Name)) + g.Expect(daemonSetPods[0].Spec.NodeName).To(Equal(nodeList.Items[0].Name)) + }).Should(Succeed()) + }) + It("should account for LimitRange DefaultRequest on daemonSet pods for resources", func() { + limitrange.Spec.Limits = 
[]v1.LimitRangeItem{ + { + Type: v1.LimitTypeContainer, + DefaultRequest: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("2"), + v1.ResourceMemory: resource.MustParse("1Gi"), + }, + }, + } + + podSelector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + daemonSetSelector := labels.SelectorFromSet(daemonset.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, limitrange, priorityclass, daemonset, dep) + + // Eventually expect a single node to exist and both the deployment pod and the daemonset pod to schedule to it + Eventually(func(g Gomega) { + nodeList := &v1.NodeList{} + g.Expect(env.Client.List(env, nodeList, client.HasLabels{"testing/cluster"})).To(Succeed()) + g.Expect(nodeList.Items).To(HaveLen(1)) + + deploymentPods := env.Monitor.RunningPods(podSelector) + g.Expect(deploymentPods).To(HaveLen(1)) + + daemonSetPods := env.Monitor.RunningPods(daemonSetSelector) + g.Expect(daemonSetPods).To(HaveLen(1)) + + g.Expect(deploymentPods[0].Spec.NodeName).To(Equal(nodeList.Items[0].Name)) + g.Expect(daemonSetPods[0].Spec.NodeName).To(Equal(nodeList.Items[0].Name)) + }).Should(Succeed()) + }) +}) diff --git a/test/suites/beta/integration/emptiness_test.go b/test/suites/beta/integration/emptiness_test.go new file mode 100644 index 000000000000..9c48e694a9e8 --- /dev/null +++ b/test/suites/beta/integration/emptiness_test.go @@ -0,0 +1,65 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "time" + + "github.com/samber/lo" + "k8s.io/apimachinery/pkg/labels" + "knative.dev/pkg/ptr" + + "sigs.k8s.io/controller-runtime/pkg/client" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter-core/pkg/test" +) + +var _ = Describe("Emptiness", func() { + It("should terminate an empty node", func() { + nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenEmpty + nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Hour * 300)} + + const numPods = 1 + deployment := test.Deployment(test.DeploymentOptions{Replicas: numPods}) + + By("kicking off provisioning for a deployment") + env.ExpectCreated(nodeClass, nodePool, deployment) + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), numPods) + + By("making the nodeclaim empty") + persisted := deployment.DeepCopy() + deployment.Spec.Replicas = ptr.Int32(0) + Expect(env.Client.Patch(env, deployment, client.MergeFrom(persisted))).To(Succeed()) + + By("waiting for the nodeclaim emptiness status condition to propagate") + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Empty)).ToNot(BeNil()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Empty).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + By("waiting for the nodeclaim to deprovision when past its ConsolidateAfter timeout of 0") + nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Duration(0))} + env.ExpectUpdated(nodePool) + + env.EventuallyExpectNotFound(nodeClaim, node) + }) +}) diff --git a/test/suites/beta/integration/extended_resources_test.go b/test/suites/beta/integration/extended_resources_test.go new file mode 100644 index 000000000000..ba4681e4ee64 --- /dev/null +++ b/test/suites/beta/integration/extended_resources_test.go @@ -0,0 +1,423 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "fmt" + "os" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + appsv1 "k8s.io/api/apps/v1" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("Extended Resources", func() { + It("should provision nodes for a deployment that requests nvidia.com/gpu", func() { + ExpectNvidiaDevicePluginCreated() + + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectInitializedNodeCount("==", 1) + }) + It("should provision nodes for a deployment that requests nvidia.com/gpu (Bottlerocket)", func() { + // For Bottlerocket, we are testing that resources are initialized without needing a device plugin + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyBottlerocket + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "nvidia.com/gpu": resource.MustParse("1"), + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectInitializedNodeCount("==", 1) + }) + It("should provision nodes for a deployment that requests vpc.amazonaws.com/pod-eni (security groups for pods)", func() { + env.ExpectPodENIEnabled() + DeferCleanup(func() { + env.ExpectPodENIDisabled() + }) + env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.enablePodENI": "true"}) + // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + Values: []string{"m7a", "r7a", "c7a"}, + }, + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c", "m", "r"}, + }, + }...) 
+ numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "vpc.amazonaws.com/pod-eni": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "vpc.amazonaws.com/pod-eni": resource.MustParse("1"), + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectInitializedNodeCount("==", 1) + }) + It("should provision nodes for a deployment that requests amd.com/gpu", func() { + Skip("skipping test on AMD instance types") + ExpectAMDDevicePluginCreated() + + customAMI := env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 0) + + // We create custom userData that installs the AMD Radeon driver and then performs the EKS bootstrap script + // We use a Custom AMI so that we can reboot after we start the kubelet service + rawContent, err := os.ReadFile("testdata/amd_driver_input.sh") + Expect(err).ToNot(HaveOccurred()) + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + } + nodeClass.Spec.UserData = lo.ToPtr(fmt.Sprintf(string(rawContent), env.ClusterName, + env.ClusterEndpoint, env.ExpectCABundle(), nodePool.Name)) + + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "amd.com/gpu": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "amd.com/gpu": resource.MustParse("1"), + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + Eventually(func(g Gomega) { + g.Expect(env.Monitor.RunningPodsCount(selector)).To(Equal(numPods)) + }).WithTimeout(15 * time.Minute).Should(Succeed()) // The node needs additional time to install the AMD GPU driver + env.ExpectCreatedNodeCount("==", 1) + env.EventuallyExpectInitializedNodeCount("==", 1) + }) + // Need to subscribe to the AMI to run the test successfully + // https://aws.amazon.com/marketplace/pp/prodview-st5jc2rk3phr2?sr=0-2&ref_=beagle&applicationId=AWSMPContessa + It("should provision nodes for a deployment that requests habana.ai/gaudi", func() { + Skip("skipping test on an exotic instance type") + ExpectHabanaDevicePluginCreated() + + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: "ami-0fae925f94979981f", + }, + } + numPods := 1 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "habana.ai/gaudi": resource.MustParse("1"), + }, + Limits: v1.ResourceList{ + "habana.ai/gaudi": resource.MustParse("1"), + }, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + 
env.EventuallyExpectInitializedNodeCount("==", 1) + }) +}) + +func ExpectNvidiaDevicePluginCreated() { + GinkgoHelper() + env.ExpectCreated(&appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "nvidia-device-plugin-daemonset", + Namespace: "kube-system", + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": "nvidia-device-plugin-ds", + }, + }, + UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ + Type: appsv1.RollingUpdateDaemonSetStrategyType, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": "nvidia-device-plugin-ds", + }, + }, + Spec: v1.PodSpec{ + Tolerations: []v1.Toleration{ + { + Key: "nvidia.com/gpu", + Operator: v1.TolerationOpExists, + Effect: v1.TaintEffectNoSchedule, + }, + }, + PriorityClassName: "system-node-critical", + Containers: []v1.Container{ + { + Name: "nvidia-device-plugin-ctr", + Image: "nvcr.io/nvidia/k8s-device-plugin:v0.12.3", + Env: []v1.EnvVar{ + { + Name: "FAIL_ON_INIT_ERROR", + Value: "false", + }, + }, + SecurityContext: &v1.SecurityContext{ + AllowPrivilegeEscalation: lo.ToPtr(false), + Capabilities: &v1.Capabilities{ + Drop: []v1.Capability{"ALL"}, + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "device-plugin", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + }, + }, + }, + }) +} + +func ExpectAMDDevicePluginCreated() { + GinkgoHelper() + env.ExpectCreated(&appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "amdgpu-device-plugin-daemonset", + Namespace: "kube-system", + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": "amdgpu-dp-ds", + }, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{ + "name": "amdgpu-dp-ds", + }, + }, + Spec: v1.PodSpec{ + PriorityClassName: "system-node-critical", + Tolerations: []v1.Toleration{ + { + Key: "amd.com/gpu", + Operator: v1.TolerationOpExists, + Effect: v1.TaintEffectNoSchedule, + }, + }, + Containers: []v1.Container{ + { + Name: "amdgpu-dp-cntr", + Image: "rocm/k8s-device-plugin", + SecurityContext: &v1.SecurityContext{ + AllowPrivilegeEscalation: lo.ToPtr(false), + Capabilities: &v1.Capabilities{ + Drop: []v1.Capability{"ALL"}, + }, + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "dp", + MountPath: "/var/lib/kubelet/device-plugins", + }, + { + Name: "sys", + MountPath: "/sys", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "dp", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + { + Name: "sys", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/sys", + }, + }, + }, + }, + }, + }, + }, + }) +} + +func ExpectHabanaDevicePluginCreated() { + GinkgoHelper() + env.ExpectCreated(&v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: "habana-system", + }, + }) + env.ExpectCreated(&appsv1.DaemonSet{ + ObjectMeta: metav1.ObjectMeta{ + Name: "habanalabs-device-plugin-daemonset", + Namespace: "habana-system", + }, + Spec: appsv1.DaemonSetSpec{ + Selector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "name": "habanalabs-device-plugin-ds", + }, + }, + UpdateStrategy: appsv1.DaemonSetUpdateStrategy{ + Type: 
appsv1.RollingUpdateDaemonSetStrategyType, + }, + Template: v1.PodTemplateSpec{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "scheduler.alpha.kubernetes.io/critical-pod": "", + }, + Labels: map[string]string{ + "name": "habanalabs-device-plugin-ds", + }, + }, + Spec: v1.PodSpec{ + Tolerations: []v1.Toleration{ + { + Key: "habana.ai/gaudi", + Operator: v1.TolerationOpExists, + Effect: v1.TaintEffectNoSchedule, + }, + }, + PriorityClassName: "system-node-critical", + Containers: []v1.Container{ + { + Name: "habanalabs-device-plugin-ctr", + Image: "vault.habana.ai/docker-k8s-device-plugin/docker-k8s-device-plugin:latest", + SecurityContext: &v1.SecurityContext{ + Privileged: lo.ToPtr(true), + }, + VolumeMounts: []v1.VolumeMount{ + { + Name: "device-plugin", + MountPath: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + Volumes: []v1.Volume{ + { + Name: "device-plugin", + VolumeSource: v1.VolumeSource{ + HostPath: &v1.HostPathVolumeSource{ + Path: "/var/lib/kubelet/device-plugins", + }, + }, + }, + }, + }, + }, + }, + }) +} diff --git a/test/suites/beta/integration/hash_test.go b/test/suites/beta/integration/hash_test.go new file mode 100644 index 000000000000..552abb3ffffe --- /dev/null +++ b/test/suites/beta/integration/hash_test.go @@ -0,0 +1,53 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("CRD Hash", func() { + It("should have NodePool hash", func() { + env.ExpectCreated(nodeClass, nodePool) + + Eventually(func(g Gomega) { + np := &corev1beta1.NodePool{} + err := env.Client.Get(env, client.ObjectKeyFromObject(nodePool), np) + g.Expect(err).ToNot(HaveOccurred()) + + hash, found := np.Annotations[corev1beta1.NodePoolHashAnnotationKey] + g.Expect(found).To(BeTrue()) + g.Expect(hash).To(Equal(np.Hash())) + }) + }) + It("should have EC2NodeClass hash", func() { + env.ExpectCreated(nodeClass) + + Eventually(func(g Gomega) { + nc := &v1beta1.EC2NodeClass{} + err := env.Client.Get(env, client.ObjectKeyFromObject(nodeClass), nc) + g.Expect(err).ToNot(HaveOccurred()) + + hash, found := nc.Annotations[v1beta1.AnnotationNodeClassHash] + g.Expect(found).To(BeTrue()) + g.Expect(hash).To(Equal(nc.Hash())) + }) + }) +}) diff --git a/test/suites/beta/integration/instance_profile_test.go b/test/suites/beta/integration/instance_profile_test.go new file mode 100644 index 000000000000..b40432aae5eb --- /dev/null +++ b/test/suites/beta/integration/instance_profile_test.go @@ -0,0 +1,58 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/iam" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + + coretest "github.com/aws/karpenter-core/pkg/test" + awserrors "github.com/aws/karpenter/pkg/errors" + "github.com/aws/karpenter/pkg/providers/instanceprofile" +) + +var _ = Describe("InstanceProfile Generation", func() { + It("should generate the InstanceProfile when setting the role", func() { + pod := coretest.Pod() + env.ExpectCreated(nodePool, nodeClass, pod) + env.EventuallyExpectHealthy(pod) + node := env.ExpectCreatedNodeCount("==", 1)[0] + + instance := env.GetInstance(node.Name) + Expect(instance.IamInstanceProfile).ToNot(BeNil()) + Expect(instance.IamInstanceProfile.Arn).To(ContainSubstring(nodeClass.Spec.Role)) + + instanceProfile := env.ExpectInstanceProfileExists(instanceprofile.GetProfileName(env.Context, env.Region, nodeClass)) + Expect(instanceProfile.Roles).To(HaveLen(1)) + Expect(lo.FromPtr(instanceProfile.Roles[0].RoleName)).To(Equal(nodeClass.Spec.Role)) + }) + It("should remove the generated InstanceProfile when deleting the NodeClass", func() { + pod := coretest.Pod() + env.ExpectCreated(nodePool, nodeClass, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectDeleted(nodePool, nodeClass) + Eventually(func(g Gomega) { + _, err := env.IAMAPI.GetInstanceProfileWithContext(env.Context, &iam.GetInstanceProfileInput{ + InstanceProfileName: aws.String(instanceprofile.GetProfileName(env.Context, env.Region, nodeClass)), + }) + g.Expect(awserrors.IsNotFound(err)).To(BeTrue()) + }).Should(Succeed()) + }) +}) diff --git a/test/suites/beta/integration/kubelet_config_test.go b/test/suites/beta/integration/kubelet_config_test.go new file mode 100644 index 000000000000..729c2b1c4576 --- /dev/null +++ b/test/suites/beta/integration/kubelet_config_test.go @@ -0,0 +1,282 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "math" + "time" + + . 
"github.com/onsi/ginkgo/v2" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "knative.dev/pkg/ptr" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter/test/pkg/environment/aws" + + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("KubeletConfiguration Overrides", func() { + Context("All kubelet configuration set", func() { + BeforeEach(func() { + // MaxPods needs to account for the daemonsets that will run on the nodes + nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{ + MaxPods: ptr.Int32(110), + PodsPerCore: ptr.Int32(10), + SystemReserved: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("200Mi"), + v1.ResourceEphemeralStorage: resource.MustParse("1Gi"), + }, + KubeReserved: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("200Mi"), + v1.ResourceEphemeralStorage: resource.MustParse("1Gi"), + }, + EvictionHard: map[string]string{ + "memory.available": "5%", + "nodefs.available": "5%", + "nodefs.inodesFree": "5%", + "imagefs.available": "5%", + "imagefs.inodesFree": "5%", + "pid.available": "3%", + }, + EvictionSoft: map[string]string{ + "memory.available": "10%", + "nodefs.available": "10%", + "nodefs.inodesFree": "10%", + "imagefs.available": "10%", + "imagefs.inodesFree": "10%", + "pid.available": "6%", + }, + EvictionSoftGracePeriod: map[string]metav1.Duration{ + "memory.available": {Duration: time.Minute * 2}, + "nodefs.available": {Duration: time.Minute * 2}, + "nodefs.inodesFree": {Duration: time.Minute * 2}, + "imagefs.available": {Duration: time.Minute * 2}, + "imagefs.inodesFree": {Duration: time.Minute * 2}, + "pid.available": {Duration: time.Minute * 2}, + }, + EvictionMaxPodGracePeriod: ptr.Int32(120), + ImageGCHighThresholdPercent: ptr.Int32(50), + ImageGCLowThresholdPercent: ptr.Int32(10), + CPUCFSQuota: ptr.Bool(false), + } + }) + DescribeTable("Linux AMIFamilies", + func(amiFamily *string) { + nodeClass.Spec.AMIFamily = amiFamily + // Need to enable nodepool-level OS-scoping for now since DS evaluation is done off of the nodepool + // requirements, not off of the instance type options so scheduling can fail if nodepools aren't + // properly scoped + // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + Values: []string{"m7a", "r7a", "c7a"}, + }, + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c", "m", "r"}, + }, + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Linux)}, + }, + }...) 
+ pod := test.Pod(test.PodOptions{ + NodeSelector: map[string]string{ + v1.LabelOSStable: string(v1.Linux), + v1.LabelArchStable: "amd64", + }, + }) + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + }, + Entry("when the AMIFamily is AL2", &v1beta1.AMIFamilyAL2), + Entry("when the AMIFamily is Ubuntu", &v1beta1.AMIFamilyUbuntu), + Entry("when the AMIFamily is Bottlerocket", &v1beta1.AMIFamilyBottlerocket), + ) + DescribeTable("Windows AMIFamilies", + func(amiFamily *string) { + env.ExpectWindowsIPAMEnabled() + DeferCleanup(func() { + env.ExpectWindowsIPAMDisabled() + }) + + nodeClass.Spec.AMIFamily = amiFamily + // Need to enable nodepool-level OS-scoping for now since DS evaluation is done off of the nodepool + // requirements, not off of the instance type options so scheduling can fail if nodepool aren't + // properly scoped + // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.*, c7a.* ENI data (https://github.com/aws/karpenter/issues/4472) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + Values: []string{"m7a", "r7a", "c7a"}, + }, + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c", "m", "r"}, + }, + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Windows)}, + }, + }...) + pod := test.Pod(test.PodOptions{ + Image: aws.WindowsDefaultImage, + NodeSelector: map[string]string{ + v1.LabelOSStable: string(v1.Windows), + v1.LabelArchStable: "amd64", + }, + }) + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthyWithTimeout(time.Minute*15, pod) + env.ExpectCreatedNodeCount("==", 1) + }, + Entry("when the AMIFamily is Windows2019", &v1beta1.AMIFamilyWindows2019), + Entry("when the AMIFamily is Windows2022", &v1beta1.AMIFamilyWindows2022), + ) + }) + It("should schedule pods onto separate nodes when maxPods is set", func() { + // MaxPods needs to account for the daemonsets that will run on the nodes + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Linux)}, + }, + }...) 
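+ // Worked example: with N daemonset pods per node, MaxPods is set to N+1 below, leaving room for
+ // exactly one test pod per node, so the three replicas must spread across three separate nodes.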
+ + // Get the DS pod count and use it to calculate the DS pod overhead + dsCount := env.GetDaemonSetCount(nodePool) + nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{ + MaxPods: ptr.Int32(1 + int32(dsCount)), + } + + numPods := 3 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("100m")}, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 3) + env.ExpectUniqueNodeNames(selector, 3) + }) + It("should schedule pods onto separate nodes when podsPerCore is set", func() { + // PodsPerCore needs to account for the daemonsets that will run on the nodes + // This will have 4 pods available on each node (2 taken by daemonset pods) + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCPU, + Operator: v1.NodeSelectorOpIn, + Values: []string{"2"}, + }, + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Linux)}, + }, + }...) + numPods := 4 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("100m")}, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + // Get the DS pod count and use it to calculate the DS pod overhead + // We calculate podsPerCore to split the test pods and the DS pods between two nodes: + // 1. If # of DS pods is odd, we will have i.e. ceil((3+2)/2) = 3 + // Since we restrict node to two cores, we will allow 6 pods. One node will have 3 + // DS pods and 3 test pods. Other node will have 1 test pod and 3 DS pods + // 2. If # of DS pods is even, we will have i.e. ceil((4+2)/2) = 3 + // Since we restrict node to two cores, we will allow 6 pods. Both nodes will have + // 4 DS pods and 2 test pods. + dsCount := env.GetDaemonSetCount(nodePool) + nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{ + PodsPerCore: ptr.Int32(int32(math.Ceil(float64(2+dsCount) / 2))), + } + + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 2) + env.ExpectUniqueNodeNames(selector, 2) + }) + It("should ignore podsPerCore value when Bottlerocket is used", func() { + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyBottlerocket + // All pods should schedule to a single node since we are ignoring podsPerCore value + // This would normally schedule to 3 nodes if not using Bottlerocket + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCPU, + Operator: v1.NodeSelectorOpIn, + Values: []string{"2"}, + }, + { + Key: v1.LabelOSStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{string(v1.Linux)}, + }, + }...) 
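+ // podsPerCore of 1 on the 2-vCPU instances required above would normally cap each node at 2 pods,
+ // but the Bottlerocket AMI family does not support podsPerCore, so the value should be ignored.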
+ nodePool.Spec.Template.Spec.Kubelet.PodsPerCore = ptr.Int32(1) + numPods := 6 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("100m")}, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(nodeClass, nodePool, dep) + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + env.ExpectUniqueNodeNames(selector, 1) + }) +}) diff --git a/test/suites/beta/integration/lease_garbagecollection_test.go b/test/suites/beta/integration/lease_garbagecollection_test.go new file mode 100644 index 000000000000..e20cf8528983 --- /dev/null +++ b/test/suites/beta/integration/lease_garbagecollection_test.go @@ -0,0 +1,44 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "time" + + coordinationsv1 "k8s.io/api/coordination/v1" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" + + "github.com/aws/karpenter-core/pkg/test" + + . "github.com/onsi/ginkgo/v2" +) + +var _ = Describe("Lease Garbage Collection", func() { + var badLease *coordinationsv1.Lease + BeforeEach(func() { + badLease = &coordinationsv1.Lease{ + ObjectMeta: v1.ObjectMeta{ + CreationTimestamp: v1.Time{Time: time.Now().Add(-time.Hour * 2)}, + Name: "new-lease", + Namespace: "kube-node-lease", + Labels: map[string]string{test.DiscoveryLabel: "unspecified"}, + }, + } + }) + It("should delete node lease that does not contain an OwnerReference", func() { + env.ExpectCreated(badLease) + env.EventuallyExpectNotFound(badLease) + }) +}) diff --git a/test/suites/beta/integration/scheduling_test.go b/test/suites/beta/integration/scheduling_test.go new file mode 100644 index 000000000000..985cc06484b5 --- /dev/null +++ b/test/suites/beta/integration/scheduling_test.go @@ -0,0 +1,388 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "fmt" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" + "knative.dev/pkg/ptr" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/test/pkg/debug" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { + var selectors sets.Set[string] + + BeforeAll(func() { + selectors = sets.New[string]() + }) + AfterAll(func() { + // Ensure that we're exercising all well known labels + Expect(lo.Keys(selectors)).To(ContainElements(append(corev1beta1.WellKnownLabels.UnsortedList(), lo.Keys(corev1beta1.NormalizedLabels)...))) + }) + It("should apply annotations to the node", func() { + nodePool.Spec.Template.Annotations = map[string]string{ + "foo": "bar", + corev1beta1.DoNotDisruptAnnotationKey: "true", + } + pod := test.Pod() + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + Expect(env.GetNode(pod.Spec.NodeName).Annotations).To(And(HaveKeyWithValue("foo", "bar"), HaveKeyWithValue(corev1beta1.DoNotDisruptAnnotationKey, "true"))) + }) + It("should support well-known labels for instance type selection", func() { + nodeSelector := map[string]string{ + // Well Known + corev1beta1.NodePoolLabelKey: nodePool.Name, + v1.LabelInstanceTypeStable: "c5.large", + // Well Known to AWS + v1beta1.LabelInstanceHypervisor: "nitro", + v1beta1.LabelInstanceCategory: "c", + v1beta1.LabelInstanceGeneration: "5", + v1beta1.LabelInstanceFamily: "c5", + v1beta1.LabelInstanceSize: "large", + v1beta1.LabelInstanceCPU: "2", + v1beta1.LabelInstanceMemory: "4096", + v1beta1.LabelInstanceNetworkBandwidth: "750", + v1beta1.LabelInstancePods: "29", + } + selectors.Insert(lo.Keys(nodeSelector)...) 
+			// Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for local NVME storage", func() {
+			selectors.Insert(v1beta1.LabelInstanceLocalNVME) // Add node selector keys to selectors used in testing to ensure we test all labels
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodePreferences: []v1.NodeSelectorRequirement{
+					{
+						Key:      v1beta1.LabelInstanceLocalNVME,
+						Operator: v1.NodeSelectorOpGt,
+						Values:   []string{"0"},
+					},
+				},
+				NodeRequirements: []v1.NodeSelectorRequirement{
+					{
+						Key:      v1beta1.LabelInstanceLocalNVME,
+						Operator: v1.NodeSelectorOpGt,
+						Values:   []string{"0"},
+					},
+				},
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for encryption in transit", func() {
+			selectors.Insert(v1beta1.LabelInstanceEncryptionInTransitSupported) // Add node selector keys to selectors used in testing to ensure we test all labels
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodePreferences: []v1.NodeSelectorRequirement{
+					{
+						Key:      v1beta1.LabelInstanceEncryptionInTransitSupported,
+						Operator: v1.NodeSelectorOpIn,
+						Values:   []string{"true"},
+					},
+				},
+				NodeRequirements: []v1.NodeSelectorRequirement{
+					{
+						Key:      v1beta1.LabelInstanceEncryptionInTransitSupported,
+						Operator: v1.NodeSelectorOpIn,
+						Values:   []string{"true"},
+					},
+				},
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known deprecated labels", func() {
+			nodeSelector := map[string]string{
+				// Deprecated Labels
+				v1.LabelFailureDomainBetaRegion: env.Region,
+				v1.LabelFailureDomainBetaZone:   fmt.Sprintf("%sa", env.Region),
+				"beta.kubernetes.io/arch":       "amd64",
+				"beta.kubernetes.io/os":         "linux",
+				v1.LabelInstanceType:            "c5.large",
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for topology and architecture", func() {
+			nodeSelector := map[string]string{
+				// Well Known
+				corev1beta1.NodePoolLabelKey:     nodePool.Name,
+				v1.LabelTopologyRegion:           env.Region,
+				v1.LabelTopologyZone:             fmt.Sprintf("%sa", env.Region),
+				v1.LabelOSStable:                 "linux",
+				v1.LabelArchStable:               "amd64",
+				corev1beta1.CapacityTypeLabelKey: corev1beta1.CapacityTypeOnDemand,
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for a gpu (nvidia)", func() {
+			nodeSelector := map[string]string{
+				v1beta1.LabelInstanceGPUName:         "t4",
+				v1beta1.LabelInstanceGPUMemory:       "16384",
+				v1beta1.LabelInstanceGPUManufacturer: "nvidia",
+				v1beta1.LabelInstanceGPUCount:        "1",
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for an accelerator (inferentia)", func() {
+			nodeSelector := map[string]string{
+				v1beta1.LabelInstanceAcceleratorName:         "inferentia",
+				v1beta1.LabelInstanceAcceleratorManufacturer: "aws",
+				v1beta1.LabelInstanceAcceleratorCount:        "1",
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support well-known labels for windows-build version", func() {
+			env.ExpectWindowsIPAMEnabled()
+			DeferCleanup(func() {
+				env.ExpectWindowsIPAMDisabled()
+			})
+
+			nodeSelector := map[string]string{
+				// Well Known
+				v1.LabelWindowsBuild: v1beta1.Windows2022Build,
+				v1.LabelOSStable:     string(v1.Windows), // Specify the OS to enable vpc-resource-controller to inject the PrivateIPv4Address resource
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+				Image:            aws.WindowsDefaultImage,
+			}})
+			nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
+			// TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      v1beta1.LabelInstanceFamily,
+					Operator: v1.NodeSelectorOpNotIn,
+					Values:   []string{"m7a", "r7a", "c7a"},
+				},
+				{
+					Key:      v1beta1.LabelInstanceCategory,
+					Operator: v1.NodeSelectorOpIn,
+					Values:   []string{"c", "m", "r"},
+				},
+			}...)
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*15, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should support the node-restriction.kubernetes.io label domain", func() {
+			// Assign labels to the nodepool so that it has known values
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      v1.LabelNamespaceNodeRestriction + "/team",
+					Operator: v1.NodeSelectorOpExists,
+				},
+				{
+					Key:      v1.LabelNamespaceNodeRestriction + "/custom-label",
+					Operator: v1.NodeSelectorOpExists,
+				},
+			}...)
+			nodeSelector := map[string]string{
+				v1.LabelNamespaceNodeRestriction + "/team":         "team-1",
+				v1.LabelNamespaceNodeRestriction + "/custom-label": "custom-value",
+			}
+			selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels
+			requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
+				return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
+			})
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 1, PodOptions: test.PodOptions{
+				NodeSelector:     nodeSelector,
+				NodePreferences:  requirements,
+				NodeRequirements: requirements,
+			}})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should provision a node for naked pods", func() {
+			pod := test.Pod()
+
+			env.ExpectCreated(nodeClass, nodePool, pod)
+			env.EventuallyExpectHealthy(pod)
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should provision a node for a deployment", Label(debug.NoWatch), Label(debug.NoEvents), func() {
+			deployment := test.Deployment(test.DeploymentOptions{Replicas: 50})
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+			env.ExpectCreatedNodeCount("<=", 2) // should probably all land on a single node, but at worst two depending on batching
+		})
+		It("should provision a node for a self-affinity deployment", func() {
+			// just two pods as they all need to land on the same node
+			podLabels := map[string]string{"test": "self-affinity"}
+			deployment := test.Deployment(test.DeploymentOptions{
+				Replicas: 2,
+				PodOptions: test.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: podLabels,
+					},
+					PodRequirements: []v1.PodAffinityTerm{
+						{
+							LabelSelector: &metav1.LabelSelector{MatchLabels: podLabels},
+							TopologyKey:   v1.LabelHostname,
+						},
+					},
+				},
+			})
+
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), 2)
+			env.ExpectCreatedNodeCount("==", 1)
+		})
+		It("should provision three nodes for a zonal topology spread", func() {
+			// one pod per zone
+			podLabels := map[string]string{"test": "zonal-spread"}
+			deployment := test.Deployment(test.DeploymentOptions{
+				Replicas: 3,
+				PodOptions: test.PodOptions{
+					ObjectMeta: metav1.ObjectMeta{
+						Labels: podLabels,
+					},
+					TopologySpreadConstraints: []v1.TopologySpreadConstraint{
+						{
+							MaxSkew:           1,
+							TopologyKey:       v1.LabelTopologyZone,
+							WhenUnsatisfiable: v1.DoNotSchedule,
+							LabelSelector:     &metav1.LabelSelector{MatchLabels: podLabels},
+						},
+					},
+				},
+			})
+
+			env.ExpectCreated(nodeClass, nodePool, deployment)
+			env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(podLabels), 3)
+			env.ExpectCreatedNodeCount("==", 3)
+		})
+		It("should provision a node using a NodePool with higher priority", func() {
+			nodePoolLowPri := test.NodePool(corev1beta1.NodePool{
+				Spec: corev1beta1.NodePoolSpec{
+					Weight: ptr.Int32(10),
+					Template: corev1beta1.NodeClaimTemplate{
+						Spec: corev1beta1.NodeClaimSpec{
+							NodeClassRef: &corev1beta1.NodeClassReference{
+								Name: nodeClass.Name,
+							},
+							Requirements: []v1.NodeSelectorRequirement{
+								{
+									Key:      v1.LabelInstanceTypeStable,
+									Operator: v1.NodeSelectorOpIn,
+									Values:   []string{"t3.nano"},
+								},
+							},
+						},
+					},
+				},
+			})
+			nodePoolHighPri := test.NodePool(corev1beta1.NodePool{
+				Spec: corev1beta1.NodePoolSpec{
+					Weight: ptr.Int32(100),
+					Template: corev1beta1.NodeClaimTemplate{
+						Spec: corev1beta1.NodeClaimSpec{
+							NodeClassRef: &corev1beta1.NodeClassReference{
+								Name: nodeClass.Name,
+							},
+							Requirements: []v1.NodeSelectorRequirement{
+								{
+									Key:      v1.LabelInstanceTypeStable,
+									Operator: v1.NodeSelectorOpIn,
+									Values:   []string{"c4.large"},
+								},
+							},
+						},
+					},
+				},
+			})
+			pod := test.Pod()
+			env.ExpectCreated(pod, nodeClass, nodePoolLowPri, nodePoolHighPri)
+			env.EventuallyExpectHealthy(pod)
+			env.ExpectCreatedNodeCount("==", 1)
+			Expect(ptr.StringValue(env.GetInstance(pod.Spec.NodeName).InstanceType)).To(Equal("c4.large"))
+			Expect(env.GetNode(pod.Spec.NodeName).Labels[corev1beta1.NodePoolLabelKey]).To(Equal(nodePoolHighPri.Name))
+		})
+})
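[Editor's note: the map-to-requirements conversion above is repeated verbatim in nearly every spec. As a sketch of how it could be factored out — the `requirementsFromSelector` name and the `main` harness are invented here for illustration and are not part of the patch:]

```go
package main

import (
	"fmt"

	"github.com/samber/lo"
	v1 "k8s.io/api/core/v1"
)

// requirementsFromSelector converts a nodeSelector map into the equivalent
// []v1.NodeSelectorRequirement using the In operator, mirroring the inline
// lo.MapToSlice calls repeated throughout the specs above.
func requirementsFromSelector(nodeSelector map[string]string) []v1.NodeSelectorRequirement {
	return lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement {
		return v1.NodeSelectorRequirement{Key: key, Operator: v1.NodeSelectorOpIn, Values: []string{value}}
	})
}

func main() {
	reqs := requirementsFromSelector(map[string]string{"kubernetes.io/os": "linux"})
	fmt.Printf("%+v\n", reqs)
}
```

A helper like this would also keep the "add keys to `selectors`" bookkeeping in one place, so a new well-known label can't be tested without being tracked.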
diff --git a/test/suites/beta/integration/security_group_test.go b/test/suites/beta/integration/security_group_test.go
new file mode 100644
index 000000000000..68da2d3ad10c
--- /dev/null
+++ b/test/suites/beta/integration/security_group_test.go
@@ -0,0 +1,93 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package integration_test
+
+import (
+	"time"
+
+	"github.com/aws/aws-sdk-go/service/ec2"
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/samber/lo"
+	"k8s.io/apimachinery/pkg/util/sets"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	"github.com/aws/karpenter-core/pkg/test"
+	"github.com/aws/karpenter/pkg/apis/v1beta1"
+	"github.com/aws/karpenter/test/pkg/environment/aws"
+)
+
+var _ = Describe("SecurityGroups", func() {
+	It("should use the security-group-id selector", func() {
+		securityGroups := env.GetSecurityGroups(map[string]string{"karpenter.sh/discovery": env.ClusterName})
+		Expect(len(securityGroups)).To(BeNumerically(">", 1))
+		nodeClass.Spec.SecurityGroupSelectorTerms = lo.Map(securityGroups, func(sg aws.SecurityGroup, _ int) v1beta1.SecurityGroupSelectorTerm {
+			return v1beta1.SecurityGroupSelectorTerm{
+				ID: lo.FromPtr(sg.GroupId),
+			}
+		})
+		pod := test.Pod()
+
+		env.ExpectCreated(pod, nodeClass, nodePool)
+		env.EventuallyExpectHealthy(pod)
+		env.ExpectCreatedNodeCount("==", 1)
+
+		env.ExpectInstance(pod.Spec.NodeName).To(HaveField("SecurityGroups", ConsistOf(&securityGroups[0].GroupIdentifier, &securityGroups[1].GroupIdentifier)))
+	})
+
+	It("should use the security group selector with multiple tag values", func() {
+		securityGroups := env.GetSecurityGroups(map[string]string{"karpenter.sh/discovery": env.ClusterName})
+		Expect(len(securityGroups)).To(BeNumerically(">", 1))
+		first := securityGroups[0]
+		last := securityGroups[len(securityGroups)-1]
+
+		nodeClass.Spec.SecurityGroupSelectorTerms = []v1beta1.SecurityGroupSelectorTerm{
+			{
+				Tags: map[string]string{"Name": lo.FromPtr(lo.FindOrElse(first.Tags, &ec2.Tag{}, func(tag *ec2.Tag) bool { return lo.FromPtr(tag.Key) == "Name" }).Value)},
+			},
+			{
+				Tags: map[string]string{"Name": lo.FromPtr(lo.FindOrElse(last.Tags, &ec2.Tag{}, func(tag *ec2.Tag) bool { return lo.FromPtr(tag.Key) == "Name" }).Value)},
+			},
+		}
+		pod := test.Pod()
+
+		env.ExpectCreated(pod, nodeClass, nodePool)
+		env.EventuallyExpectHealthy(pod)
+		env.ExpectCreatedNodeCount("==", 1)
+
+		env.ExpectInstance(pod.Spec.NodeName).To(HaveField("SecurityGroups", ConsistOf(&first.GroupIdentifier, &last.GroupIdentifier)))
+	})
+
+	It("should update the EC2NodeClass status security groups", func() {
+		env.ExpectCreated(nodeClass)
+		EventuallyExpectSecurityGroups(env, nodeClass)
+	})
+})
+
+func EventuallyExpectSecurityGroups(env *aws.Environment, nodeClass *v1beta1.EC2NodeClass) {
+	securityGroups := env.GetSecurityGroups(map[string]string{"karpenter.sh/discovery": env.ClusterName})
+	Expect(securityGroups).ToNot(HaveLen(0))
+
+	ids := sets.New(lo.Map(securityGroups, func(s aws.SecurityGroup, _ int) string {
+		return lo.FromPtr(s.GroupId)
+	})...)
+	Eventually(func(g Gomega) {
+		temp := &v1beta1.EC2NodeClass{}
+		g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClass), temp)).To(Succeed())
+		// sets.Set.Equal returns a bool, so it must be wrapped in a matcher;
+		// a bare g.Expect(...) without .To(...) asserts nothing.
+		g.Expect(sets.New(lo.Map(temp.Status.SecurityGroups, func(s v1beta1.SecurityGroup, _ int) string {
+			return s.ID
+		})...).Equal(ids)).To(BeTrue())
+	}).WithTimeout(10 * time.Second).Should(Succeed())
+}
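[Editor's note: the original `EventuallyExpectSecurityGroups` called `g.Expect(...)` on the boolean result of `sets.Set.Equal` without a matcher, which never fails; the replacement above adds `.To(BeTrue())`. A minimal self-contained sketch of the corrected pattern — the sample IDs and the `testingT` stand-in are invented for illustration:]

```go
package main

import (
	"fmt"

	"github.com/onsi/gomega"
	"k8s.io/apimachinery/pkg/util/sets"
)

// testingT is a tiny stand-in for *testing.T so the sketch runs as main;
// it satisfies gomega's GomegaTestingT interface (Helper + Fatalf).
type testingT struct{}

func (t *testingT) Helper() {}
func (t *testingT) Fatalf(format string, args ...interface{}) {
	panic(fmt.Sprintf(format, args...))
}

func main() {
	g := gomega.NewWithT(&testingT{})
	want := sets.New("sg-111", "sg-222")
	got := sets.New("sg-222", "sg-111")
	// Equal returns a plain bool, so it must be asserted explicitly;
	// g.Expect(got.Equal(want)) alone would silently pass.
	g.Expect(got.Equal(want)).To(gomega.BeTrue())
	fmt.Println("sets match")
}
```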
diff --git a/test/suites/beta/integration/storage_test.go b/test/suites/beta/integration/storage_test.go
new file mode 100644
index 000000000000..af9c38b658cc
--- /dev/null
+++ b/test/suites/beta/integration/storage_test.go
@@ -0,0 +1,119 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package integration_test
+
+import (
+	"fmt"
+
+	"github.com/aws/aws-sdk-go/aws"
+	appsv1 "k8s.io/api/apps/v1"
+	v1 "k8s.io/api/core/v1"
+	storagev1 "k8s.io/api/storage/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+
+	. "github.com/onsi/ginkgo/v2"
+
+	"github.com/aws/karpenter-core/pkg/test"
+)
+
+// This test requires the EBS CSI driver to be installed
+var _ = Describe("Dynamic PVC", func() {
+	It("should run a pod with a dynamic persistent volume", func() {
+		// Ensure that the EBS driver is installed, or we can't run the test.
+		var ds appsv1.DaemonSet
+		if err := env.Client.Get(env.Context, client.ObjectKey{
+			Namespace: "kube-system",
+			Name:      "ebs-csi-node",
+		}, &ds); err != nil {
+			if errors.IsNotFound(err) {
+				Skip(fmt.Sprintf("skipping dynamic PVC test due to missing EBS driver %s", err))
+			} else {
+				Fail(fmt.Sprintf("determining EBS driver status, %s", err))
+			}
+		}
+		storageClassName := "ebs-sc-test"
+		bindMode := storagev1.VolumeBindingWaitForFirstConsumer
+		sc := test.StorageClass(test.StorageClassOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: storageClassName,
+			},
+			Provisioner:       aws.String("ebs.csi.aws.com"),
+			VolumeBindingMode: &bindMode,
+		})
+
+		pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "ebs-claim",
+			},
+			StorageClassName: aws.String(storageClassName),
+			Resources:        v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("5Gi")}},
+		})
+
+		pod := test.Pod(test.PodOptions{
+			PersistentVolumeClaims: []string{pvc.Name},
+		})
+
+		env.ExpectCreated(nodeClass, nodePool, sc, pvc, pod)
+		env.EventuallyExpectHealthy(pod)
+		env.ExpectCreatedNodeCount("==", 1)
+		env.ExpectDeleted(pod)
+	})
+})
+
+var _ = Describe("Static PVC", func() {
+	It("should run a pod with a static persistent volume", func() {
+		storageClassName := "nfs-test"
+		bindMode := storagev1.VolumeBindingWaitForFirstConsumer
+		sc := test.StorageClass(test.StorageClassOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: storageClassName,
+			},
+			VolumeBindingMode: &bindMode,
+		})
+
+		pv := test.PersistentVolume(test.PersistentVolumeOptions{
+			ObjectMeta:       metav1.ObjectMeta{Name: "nfs-test-volume"},
+			StorageClassName: "nfs-test",
+		})
+
+		// the server here doesn't need to actually exist for the pod to start running
+		pv.Spec.NFS = &v1.NFSVolumeSource{
+			Server: "fake.server",
+			Path:   "/some/path",
+		}
+		pv.Spec.CSI = nil
+
+		pvc := test.PersistentVolumeClaim(test.PersistentVolumeClaimOptions{
+			ObjectMeta: metav1.ObjectMeta{
+				Name: "nfs-claim",
+			},
+			StorageClassName: aws.String(storageClassName),
+			VolumeName:       pv.Name,
+			Resources:        v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceStorage: resource.MustParse("5Gi")}},
+		})
+
+		pod := test.Pod(test.PodOptions{
+			PersistentVolumeClaims: []string{pvc.Name},
+		})
+
+		env.ExpectCreated(nodeClass, nodePool, sc, pv, pvc, pod)
+		env.EventuallyExpectHealthy(pod)
+		env.ExpectCreatedNodeCount("==", 1)
+		env.ExpectDeleted(pod)
+	})
+})
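[Editor's note: the EBS-driver probe at the top of the dynamic PVC spec is a generally useful guard. As a sketch, assuming access to a controller-runtime client and Ginkgo, it could be generalized into a reusable helper — the `skipUnlessDaemonSetExists` name is invented here:]

```go
package integrationutil

import (
	"context"
	"fmt"

	"github.com/onsi/ginkgo/v2"
	appsv1 "k8s.io/api/apps/v1"
	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// skipUnlessDaemonSetExists skips the current Ginkgo spec when the named
// DaemonSet (e.g. the EBS CSI node driver) is absent, and fails the spec on
// any other lookup error.
func skipUnlessDaemonSetExists(ctx context.Context, c client.Client, namespace, name string) {
	var ds appsv1.DaemonSet
	if err := c.Get(ctx, client.ObjectKey{Namespace: namespace, Name: name}, &ds); err != nil {
		if apierrors.IsNotFound(err) {
			ginkgo.Skip(fmt.Sprintf("skipping test due to missing DaemonSet %s/%s", namespace, name))
		}
		ginkgo.Fail(fmt.Sprintf("determining DaemonSet %s/%s status, %s", namespace, name, err))
	}
}
```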
"github.com/onsi/gomega" + "github.com/onsi/gomega/types" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var _ = Describe("Subnets", func() { + It("should use the subnet-id selector", func() { + subnets := env.GetSubnets(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(len(subnets)).ToNot(Equal(0)) + shuffledAZs := lo.Shuffle(lo.Keys(subnets)) + firstSubnet := subnets[shuffledAZs[0]][0] + + nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{ + { + ID: firstSubnet, + }, + } + pod := test.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("SubnetId", HaveValue(Equal(firstSubnet)))) + }) + It("should use resource based naming as node names", func() { + subnets := env.GetSubnets(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(len(subnets)).ToNot(Equal(0)) + + allSubnets := lo.Flatten(lo.Values(subnets)) + + ExpectResourceBasedNamingEnabled(allSubnets...) + DeferCleanup(func() { + ExpectResourceBasedNamingDisabled(allSubnets...) + }) + pod := test.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + ExceptNodeNameToContainInstanceID(pod.Spec.NodeName) + }) + It("should use the subnet tag selector with multiple tag values", func() { + // Get all the subnets for the cluster + subnets := env.GetSubnetNameAndIds(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(len(subnets)).To(BeNumerically(">", 1)) + firstSubnet := subnets[0] + lastSubnet := subnets[len(subnets)-1] + + nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"Name": firstSubnet.Name}, + }, + { + Tags: map[string]string{"Name": lastSubnet.Name}, + }, + } + pod := test.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("SubnetId", HaveValue(BeElementOf(firstSubnet.ID, lastSubnet.ID)))) + }) + + It("should use a subnet within the AZ requested", func() { + subnets := env.GetSubnets(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(len(subnets)).ToNot(Equal(0)) + shuffledAZs := lo.Shuffle(lo.Keys(subnets)) + + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ + { + Key: v1.LabelZoneFailureDomainStable, + Operator: "In", + Values: []string{shuffledAZs[0]}, + }, + }...) 
+ pod := test.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + env.ExpectInstance(pod.Spec.NodeName).To(HaveField("SubnetId", Or( + lo.Map(subnets[shuffledAZs[0]], func(subnetID string, _ int) types.GomegaMatcher { return HaveValue(Equal(subnetID)) })..., + ))) + }) + + It("should have the NodeClass status for subnets", func() { + env.ExpectCreated(nodeClass) + EventuallyExpectSubnets(env, nodeClass) + }) +}) + +func ExpectResourceBasedNamingEnabled(subnetIDs ...string) { + for subnetID := range subnetIDs { + _, err := env.EC2API.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{ + EnableResourceNameDnsARecordOnLaunch: &ec2.AttributeBooleanValue{ + Value: lo.ToPtr(true), + }, + SubnetId: lo.ToPtr(subnetIDs[subnetID]), + }) + Expect(err).To(BeNil()) + _, err = env.EC2API.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{ + PrivateDnsHostnameTypeOnLaunch: lo.ToPtr("resource-name"), + SubnetId: lo.ToPtr(subnetIDs[subnetID]), + }) + Expect(err).To(BeNil()) + } +} + +func ExpectResourceBasedNamingDisabled(subnetIDs ...string) { + for subnetID := range subnetIDs { + _, err := env.EC2API.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{ + EnableResourceNameDnsARecordOnLaunch: &ec2.AttributeBooleanValue{ + Value: lo.ToPtr(false), + }, + SubnetId: lo.ToPtr(subnetIDs[subnetID]), + }) + Expect(err).To(BeNil()) + _, err = env.EC2API.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{ + PrivateDnsHostnameTypeOnLaunch: lo.ToPtr("ip-name"), + SubnetId: lo.ToPtr(subnetIDs[subnetID]), + }) + Expect(err).To(BeNil()) + } +} + +func ExceptNodeNameToContainInstanceID(nodeName string) { + instance := env.GetInstance(nodeName) + Expect(nodeName).To(Not(Equal(lo.FromPtr(instance.InstanceId)))) + ContainSubstring(nodeName, lo.FromPtr(instance.InstanceId)) +} + +// SubnetInfo is a simple struct for testing +type SubnetInfo struct { + Name string + ID string +} + +func EventuallyExpectSubnets(env *aws.Environment, nodeClass *v1beta1.EC2NodeClass) { + subnets := env.GetSubnets(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(subnets).ToNot(HaveLen(0)) + ids := sets.New(lo.Flatten(lo.Values(subnets))...) + + Eventually(func(g Gomega) { + temp := &v1beta1.EC2NodeClass{} + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClass), temp)).To(Succeed()) + g.Expect(sets.New(lo.Map(temp.Status.Subnets, func(s v1beta1.Subnet, _ int) string { + return s.ID + })...).Equal(ids)) + }).WithTimeout(10 * time.Second).Should(Succeed()) +} diff --git a/test/suites/beta/integration/suite_test.go b/test/suites/beta/integration/suite_test.go new file mode 100644 index 000000000000..26d7a8c62fc1 --- /dev/null +++ b/test/suites/beta/integration/suite_test.go @@ -0,0 +1,77 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "fmt" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
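[Editor's note: the original helper name `ExceptNodeNameToContainInstanceID` was a typo for "Expect", and its `ContainSubstring(...)` line built a matcher without asserting it; both are fixed above. Separately, `ExpectResourceBasedNamingEnabled`/`Disabled` differ only in the values they write. A sketch of collapsing them into one toggle — the `setResourceBasedNaming` name and the `ec2iface` parameter are illustrative assumptions, not part of the patch:]

```go
package integrationutil

import (
	"github.com/aws/aws-sdk-go/service/ec2"
	"github.com/aws/aws-sdk-go/service/ec2/ec2iface"
	"github.com/onsi/gomega"
	"github.com/samber/lo"
)

// setResourceBasedNaming flips both subnet attributes that control
// resource-based naming. hostnameType should be "resource-name" when
// enabling and "ip-name" when restoring the default.
func setResourceBasedNaming(api ec2iface.EC2API, enabled bool, hostnameType string, subnetIDs ...string) {
	for _, subnetID := range subnetIDs {
		_, err := api.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{
			EnableResourceNameDnsARecordOnLaunch: &ec2.AttributeBooleanValue{Value: lo.ToPtr(enabled)},
			SubnetId:                             lo.ToPtr(subnetID),
		})
		gomega.Expect(err).To(gomega.BeNil())
		_, err = api.ModifySubnetAttribute(&ec2.ModifySubnetAttributeInput{
			PrivateDnsHostnameTypeOnLaunch: lo.ToPtr(hostnameType),
			SubnetId:                       lo.ToPtr(subnetID),
		})
		gomega.Expect(err).To(gomega.BeNil())
	}
}
```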
"github.com/onsi/gomega" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestIntegration(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Integration") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }, + }, + }) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) diff --git a/test/suites/beta/integration/tags_test.go b/test/suites/beta/integration/tags_test.go new file mode 100644 index 000000000000..afd6024ca5df --- /dev/null +++ b/test/suites/beta/integration/tags_test.go @@ -0,0 +1,110 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + "github.com/aws/aws-sdk-go/service/ec2" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + "time" + + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/providers/instance" + "github.com/aws/karpenter/pkg/test" +) + +var _ = Describe("Tags", func() { + Context("Static Tags", func() { + It("should tag all associated resources", func() { + nodeClass.Spec.Tags = map[string]string{"TestTag": "TestVal"} + pod := coretest.Pod() + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + instance := env.GetInstance(pod.Spec.NodeName) + volumeTags := tagMap(env.GetVolume(instance.BlockDeviceMappings[0].Ebs.VolumeId).Tags) + instanceTags := tagMap(instance.Tags) + + Expect(instanceTags).To(HaveKeyWithValue("TestTag", "TestVal")) + Expect(volumeTags).To(HaveKeyWithValue("TestTag", "TestVal")) + }) + }) + + Context("Tagging Controller", func() { + It("should tag with karpenter.sh/nodeclaim and Name tag", func() { + pod := coretest.Pod() + + env.ExpectCreated(nodePool, nodeClass, pod) + env.EventuallyExpectCreatedNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeName := client.ObjectKeyFromObject(node) + + Eventually(func(g Gomega) { + node = &v1.Node{} + g.Expect(env.Client.Get(env.Context, nodeName, node)).To(Succeed()) + g.Expect(node.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationInstanceTagged, "true")) + }, time.Minute) + + nodeInstance := instance.NewInstance(lo.ToPtr(env.GetInstance(node.Name))) + Expect(nodeInstance.Tags).To(HaveKeyWithValue("Name", node.Name)) + Expect(nodeInstance.Tags).To(HaveKey("karpenter.sh/nodeclaim")) + }) + + It("shouldn't overwrite custom Name tags", func() { + nodeClass = test.EC2NodeClass(*nodeClass, v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{ + Tags: map[string]string{"Name": "custom-name"}, + }}) + nodePool = coretest.NodePool(*nodePool, corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }) + pod := coretest.Pod() + + env.ExpectCreated(nodePool, nodeClass, pod) + env.EventuallyExpectCreatedNodeCount("==", 1) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeName := client.ObjectKeyFromObject(node) + + Eventually(func(g Gomega) { + node = &v1.Node{} + g.Expect(env.Client.Get(env.Context, nodeName, node)).To(Succeed()) + g.Expect(node.Annotations).To(HaveKeyWithValue(v1beta1.AnnotationInstanceTagged, "true")) + }, time.Minute) + + nodeInstance := instance.NewInstance(lo.ToPtr(env.GetInstance(node.Name))) + Expect(nodeInstance.Tags).To(HaveKeyWithValue("Name", "custom-name")) + Expect(nodeInstance.Tags).To(HaveKey("karpenter.sh/nodeclaim")) + }) + }) +}) + +func tagMap(tags []*ec2.Tag) map[string]string { + return lo.SliceToMap(tags, func(tag *ec2.Tag) (string, string) { + return *tag.Key, *tag.Value + }) +} diff --git a/test/suites/beta/integration/termination_test.go b/test/suites/beta/integration/termination_test.go new file mode 100644 index 000000000000..246470f9b1ec --- /dev/null +++ b/test/suites/beta/integration/termination_test.go @@ -0,0 +1,50 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
diff --git a/test/suites/beta/integration/termination_test.go b/test/suites/beta/integration/termination_test.go
new file mode 100644
index 000000000000..246470f9b1ec
--- /dev/null
+++ b/test/suites/beta/integration/termination_test.go
@@ -0,0 +1,50 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package integration_test
+
+import (
+	"time"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+	"github.com/samber/lo"
+
+	"github.com/aws/karpenter-core/pkg/test"
+)
+
+var _ = Describe("Termination", func() {
+	It("should terminate the node and the instance on deletion", func() {
+		pod := test.Pod()
+		env.ExpectCreated(nodeClass, nodePool, pod)
+		env.EventuallyExpectHealthy(pod)
+		env.ExpectCreatedNodeCount("==", 1)
+
+		nodes := env.Monitor.CreatedNodes()
+		instanceID := env.ExpectParsedProviderID(nodes[0].Spec.ProviderID)
+		env.GetInstance(nodes[0].Name)
+
+		// Pod is deleted so that we don't re-provision after node deletion
+		// NOTE: We have to do this right now to deal with a race condition in nodepool ownership
+		// This can be removed once this race is resolved with the NodePool
+		env.ExpectDeleted(pod)
+
+		// Node is deleted and now should be not found
+		env.ExpectDeleted(nodes[0])
+		env.EventuallyExpectNotFound(nodes[0])
+		Eventually(func(g Gomega) {
+			g.Expect(lo.FromPtr(env.GetInstanceByID(instanceID).State.Name)).To(Equal("shutting-down"))
+		}, time.Second*10).Should(Succeed())
+	})
+})
diff --git a/test/suites/beta/integration/testdata/al2_no_mime_userdata_input.sh b/test/suites/beta/integration/testdata/al2_no_mime_userdata_input.sh
new file mode 100644
index 000000000000..37058c604012
--- /dev/null
+++ b/test/suites/beta/integration/testdata/al2_no_mime_userdata_input.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+echo "Running custom user data script"
diff --git a/test/suites/beta/integration/testdata/al2_userdata_input.sh b/test/suites/beta/integration/testdata/al2_userdata_input.sh
new file mode 100644
index 000000000000..afc1580817ae
--- /dev/null
+++ b/test/suites/beta/integration/testdata/al2_userdata_input.sh
@@ -0,0 +1,10 @@
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+--BOUNDARY
+Content-Type: text/x-shellscript; charset="us-ascii"
+
+#!/bin/bash
+echo "Running custom user data script"
+
+--BOUNDARY--
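[Editor's note: the `al2_userdata_input.sh` fixture above hand-writes the MIME multi-part envelope that EC2 user data expects when mixing shell scripts with other payloads. As a sketch of how such a fixture could be generated rather than hand-maintained — the fixed `BOUNDARY` string mirrors the fixture, everything else here is illustrative:]

```go
package main

import "fmt"

// buildMIMEUserData wraps a shell script in the multi-part envelope used by
// EC2 user data, matching the shape of al2_userdata_input.sh above.
func buildMIMEUserData(script string) string {
	const boundary = "BOUNDARY"
	return fmt.Sprintf(`MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="%[1]s"

--%[1]s
Content-Type: text/x-shellscript; charset="us-ascii"

%[2]s
--%[1]s--
`, boundary, script)
}

func main() {
	fmt.Print(buildMIMEUserData("#!/bin/bash\necho \"Running custom user data script\"\n"))
}
```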
diff --git a/test/suites/beta/integration/testdata/amd_driver_input.sh b/test/suites/beta/integration/testdata/amd_driver_input.sh
new file mode 100644
index 000000000000..58a10e211987
--- /dev/null
+++ b/test/suites/beta/integration/testdata/amd_driver_input.sh
@@ -0,0 +1,46 @@
+MIME-Version: 1.0
+Content-Type: multipart/mixed; boundary="BOUNDARY"
+
+--BOUNDARY
+Content-Type: text/x-shellscript; charset="us-ascii"
+
+#!/bin/bash
+cd
+sudo amazon-linux-extras install epel -y
+sudo yum update -y
+
+# Create a script to install the AMD Radeon GPU driver
+cat << EOF > /tmp/amd-install.sh
+#!/bin/bash
+export PATH=/usr/local/bin:$PATH
+aws s3 cp --recursive s3://ec2-amd-linux-drivers/latest/ . --no-sign-request
+tar -xf amdgpu-pro-*rhel*.tar.xz
+cd amdgpu-pro-20.20-1184451-rhel-7.8
+./amdgpu-pro-install -y --opencl=pal,legacy
+systemctl disable amd-install.service
+reboot
+EOF
+sudo chmod +x /tmp/amd-install.sh
+
+# Create a service that will run on system reboot
+cat << EOF > /etc/systemd/system/amd-install.service
+[Unit]
+Description=install amd drivers
+
+[Service]
+ExecStart=/bin/bash /tmp/amd-install.sh
+
+[Install]
+WantedBy=multi-user.target
+EOF
+sudo systemctl enable amd-install.service
+
+# Run the EKS bootstrap script and then reboot
+exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1
+/etc/eks/bootstrap.sh '%s' --apiserver-endpoint '%s' --b64-cluster-ca '%s' \
+--use-max-pods false \
+--container-runtime containerd \
+--kubelet-extra-args '--node-labels=karpenter.sh/nodepool=%s,testing/cluster=unspecified'
+
+reboot
+--BOUNDARY--
\ No newline at end of file
diff --git a/test/suites/beta/integration/testdata/br_userdata_input.sh b/test/suites/beta/integration/testdata/br_userdata_input.sh
new file mode 100644
index 000000000000..c157a6769852
--- /dev/null
+++ b/test/suites/beta/integration/testdata/br_userdata_input.sh
@@ -0,0 +1,4 @@
+[settings.kubernetes]
+kube-api-qps = 30
+[settings.kubernetes.node-taints]
+"node.cilium.io/agent-not-ready" = ["true:NoExecute"]
diff --git a/test/suites/beta/integration/testdata/windows_userdata_input.ps1 b/test/suites/beta/integration/testdata/windows_userdata_input.ps1
new file mode 100644
index 000000000000..fcdb159f7ce7
--- /dev/null
+++ b/test/suites/beta/integration/testdata/windows_userdata_input.ps1
@@ -0,0 +1 @@
+Write-Host "Running custom user data script"
\ No newline at end of file
diff --git a/test/suites/beta/integration/validation_test.go b/test/suites/beta/integration/validation_test.go
new file mode 100644
index 000000000000..593fc98f00f6
--- /dev/null
+++ b/test/suites/beta/integration/validation_test.go
@@ -0,0 +1,185 @@
+/*
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package integration_test
+
+import (
+	"fmt"
+	"time"
+
+	"github.com/samber/lo"
+	v1 "k8s.io/api/core/v1"
+	"knative.dev/pkg/ptr"
+
+	. "github.com/onsi/ginkgo/v2"
+	. "github.com/onsi/gomega"
+
+	corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
+	"github.com/aws/karpenter/pkg/apis/v1beta1"
+)
+
+var _ = Describe("Validation", func() {
+	Context("NodePool", func() {
+		It("should error when a restricted label is used in labels (karpenter.sh/nodepool)", func() {
+			nodePool.Spec.Template.Labels = map[string]string{
+				corev1beta1.NodePoolLabelKey: "my-custom-nodepool",
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when a restricted label is used in labels (kubernetes.io/custom-label)", func() {
+			nodePool.Spec.Template.Labels = map[string]string{
+				"kubernetes.io/custom-label": "custom-value",
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should allow a restricted label exception to be used in labels (node-restriction.kubernetes.io/custom-label)", func() {
+			nodePool.Spec.Template.Labels = map[string]string{
+				v1.LabelNamespaceNodeRestriction + "/custom-label": "custom-value",
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).To(Succeed())
+		})
+		It("should error when a requirement references a restricted label (karpenter.sh/nodepool)", func() {
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      corev1beta1.NodePoolLabelKey,
+					Operator: v1.NodeSelectorOpIn,
+					Values:   []string{"default"},
+				},
+			}...)
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when a requirement uses In but has no values", func() {
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      v1.LabelInstanceTypeStable,
+					Operator: v1.NodeSelectorOpIn,
+					Values:   []string{},
+				},
+			}...)
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when a requirement uses an unknown operator", func() {
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      corev1beta1.CapacityTypeLabelKey,
+					Operator: "within",
+					Values:   []string{corev1beta1.CapacityTypeSpot},
+				},
+			}...)
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when Gt is used with multiple integer values", func() {
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      v1beta1.LabelInstanceMemory,
+					Operator: v1.NodeSelectorOpGt,
+					Values:   []string{"1000000", "2000000"},
+				},
+			}...)
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when Lt is used with multiple integer values", func() {
+			nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
+				{
+					Key:      v1beta1.LabelInstanceMemory,
+					Operator: v1.NodeSelectorOpLt,
+					Values:   []string{"1000000", "2000000"},
+				},
+			}...)
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when consolidateAfter is negative", func() {
+			nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenEmpty
+			nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(-time.Second)}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error when ConsolidationPolicy=WhenUnderutilized is used with consolidateAfter", func() {
+			nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized
+			nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Minute)}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error if imageGCHighThresholdPercent is less than imageGCLowThresholdPercent", func() {
+			nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{
+				ImageGCHighThresholdPercent: ptr.Int32(10),
+				ImageGCLowThresholdPercent:  ptr.Int32(60),
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+		It("should error if imageGCHighThresholdPercent or imageGCLowThresholdPercent is negative", func() {
+			nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{
+				ImageGCHighThresholdPercent: ptr.Int32(-10),
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+			nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{
+				ImageGCLowThresholdPercent: ptr.Int32(-10),
+			}
+			Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
+		})
+	})
+	Context("EC2NodeClass", func() {
+		It("should error when amiSelectorTerms are not defined for amiFamily Custom", func() {
+			nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+		It("should fail for poorly formatted AMI ids", func() {
+			nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{
+				{
+					ID: "must-start-with-ami",
+				},
+			}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+		It("should succeed when tags don't contain restricted keys", func() {
+			nodeClass.Spec.Tags = map[string]string{"karpenter.sh/custom-key": "custom-value", "kubernetes.io/role/key": "custom-value"}
+			Expect(env.Client.Create(env.Context, nodeClass)).To(Succeed())
+		})
+		It("should error when tags contains a restricted key", func() {
+			nodeClass.Spec.Tags = map[string]string{"karpenter.sh/nodepool": "custom-value"}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+
+			nodeClass.Spec.Tags = map[string]string{"karpenter.sh/managed-by": env.ClusterName}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+
+			nodeClass.Spec.Tags = map[string]string{fmt.Sprintf("kubernetes.io/cluster/%s", env.ClusterName): "owned"}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+		It("should fail when securityGroupSelectorTerms has id and other filters", func() {
+			nodeClass.Spec.SecurityGroupSelectorTerms = []v1beta1.SecurityGroupSelectorTerm{
+				{
+					Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+					ID:   "sg-12345",
+				},
+			}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+		It("should fail when subnetSelectorTerms has id and other filters", func() {
+			nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{
+				{
+					Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+					ID:   "subnet-12345",
+				},
+			}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+		It("should fail when amiSelectorTerms has id and other filters", func() {
+			nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{
+				{
+					Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+					ID:   "ami-12345",
+				},
+			}
+			Expect(env.Client.Create(env.Context, nodeClass)).ToNot(Succeed())
+		})
+	})
+})
From 3dcfc9c8ba2281ccdb2ac5bea10f8d3174edb100 Mon Sep 17 00:00:00 2001
From: Nick Tran <10810510+njtran@users.noreply.github.com>
Date: Fri, 20 Oct 2023 17:47:40 -0700
Subject: [PATCH 13/47] test: add in consolidation v1beta1 tests (#4878)

---
 .github/workflows/e2e-matrix.yaml            |   2 +-
 .github/workflows/e2e.yaml                   |   1 +
 test/suites/beta/consolidation/suite_test.go | 368 +++++++++++++++++++
 3 files changed, 370 insertions(+), 1 deletion(-)
 create mode 100644 test/suites/beta/consolidation/suite_test.go

diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml
index 4d2b1bda843b..a5c633fae5f0 100644
--- a/.github/workflows/e2e-matrix.yaml
+++ b/.github/workflows/e2e-matrix.yaml
@@ -49,7 +49,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        suite: [Beta/Integration, Beta/Drift, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6]
+        suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6]
     uses: ./.github/workflows/e2e.yaml
     with:
       suite: ${{ matrix.suite }}
diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 689786ca9250..fde21f6bcec9 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -17,6 +17,7 @@ on:
         options:
           - Beta/Integration
           - Beta/Drift
+          - Beta/Consolidation
           - Alpha/Integration
           - Alpha/Machine
           - Alpha/Consolidation
"github.com/onsi/gomega" +) + +var env *environmentaws.Environment + +func TestConsolidation(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = environmentaws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/Consolidation") +} + +var nodeClass *v1beta1.EC2NodeClass + +var _ = BeforeEach(func() { + nodeClass = awstest.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{{Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}}}, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{{Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}}}, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + env.BeforeEach() +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Beta/Consolidation", func() { + It("should consolidate nodes (delete)", Label(debug.NoWatch), Label(debug.NoEvents), func() { + nodePool := test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Disruption: corev1beta1.Disruption{ + ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, + // Disable Consolidation until we're ready + ConsolidateAfter: &corev1beta1.NillableDuration{}, + }, + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + // we don't replace spot nodes, so this forces us to only delete nodes + Values: []string{corev1beta1.CapacityTypeSpot}, + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"medium", "large", "xlarge"}, + }, + { + Key: v1beta1.LabelInstanceFamily, + Operator: v1.NodeSelectorOpNotIn, + // remove some cheap burstable and the odd c1 instance types so we have + // more control over what gets provisioned + Values: []string{"t2", "t3", "c1", "t3a", "t4g"}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }) + + var numPods int32 = 100 + dep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1")}, + }, + }, + }) + + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(nodePool, nodeClass, dep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + // reduce the number of pods by 60% + dep.Spec.Replicas = aws.Int32(40) + env.ExpectUpdated(dep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + + // Enable consolidation as WhenUnderutilized doesn't allow a consolidateAfter value + nodePool.Spec.Disruption.ConsolidateAfter = nil + env.ExpectUpdated(nodePool) + + // With consolidation enabled, we now must delete nodes + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.6) + + env.ExpectDeleted(dep) + }) + It("should consolidate on-demand nodes (replace)", func() { + nodePool := test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Disruption: corev1beta1.Disruption{ + ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, + // Disable Consolidation until we're ready + ConsolidateAfter: 
&corev1beta1.NillableDuration{}, + }, + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + // we don't replace spot nodes, so this forces us to only delete nodes + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"large", "2xlarge"}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }) + + var numPods int32 = 3 + largeDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "large-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "large-app", + }, + }, + }, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("4")}, + }, + }, + }) + smallDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "small-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "small-app", + }, + }, + }, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}, + }, + }, + }) + + selector := labels.SelectorFromSet(largeDep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(nodePool, nodeClass, largeDep, smallDep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + + // 3 nodes due to the anti-affinity rules + env.ExpectCreatedNodeCount("==", 3) + + // scaling down the large deployment leaves only small pods on each node + largeDep.Spec.Replicas = aws.Int32(0) + env.ExpectUpdated(largeDep) + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, "<", 0.5) + + nodePool.Spec.Disruption.ConsolidateAfter = nil + env.ExpectUpdated(nodePool) + + // With consolidation enabled, we now must replace each node in turn to consolidate due to the anti-affinity + // rules on the smaller deployment. 
The 2xl nodes should go to a large + env.EventuallyExpectAvgUtilization(v1.ResourceCPU, ">", 0.8) + + var nodes v1.NodeList + Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) + numLargeNodes := 0 + numOtherNodes := 0 + for _, n := range nodes.Items { + // only count the nodes created by the provisoiner + if n.Labels[corev1beta1.NodePoolLabelKey] != nodePool.Name { + continue + } + if strings.HasSuffix(n.Labels[v1.LabelInstanceTypeStable], ".large") { + numLargeNodes++ + } else { + numOtherNodes++ + } + } + + // all of the 2xlarge nodes should have been replaced with large instance types + Expect(numLargeNodes).To(Equal(3)) + // and we should have no other nodes + Expect(numOtherNodes).To(Equal(0)) + + env.ExpectDeleted(largeDep, smallDep) + }) + It("should consolidate on-demand nodes to spot (replace)", func() { + nodePool := test.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Disruption: corev1beta1.Disruption{ + ConsolidationPolicy: corev1beta1.ConsolidationPolicyWhenUnderutilized, + // Disable Consolidation until we're ready + ConsolidateAfter: &corev1beta1.NillableDuration{}, + }, + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + // we don't replace spot nodes, so this forces us to only delete nodes + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"large"}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name}, + }, + }, + }, + }) + + var numPods int32 = 2 + smallDep := test.Deployment(test.DeploymentOptions{ + Replicas: numPods, + PodOptions: test.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "small-app"}, + }, + TopologySpreadConstraints: []v1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: v1.LabelHostname, + WhenUnsatisfiable: v1.DoNotSchedule, + LabelSelector: &metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app": "small-app", + }, + }, + }, + }, + ResourceRequirements: v1.ResourceRequirements{ + Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}, + }, + }, + }) + + selector := labels.SelectorFromSet(smallDep.Spec.Selector.MatchLabels) + env.ExpectCreatedNodeCount("==", 0) + env.ExpectCreated(nodePool, nodeClass, smallDep) + + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + env.ExpectCreatedNodeCount("==", int(numPods)) + + // Enable spot capacity type after the on-demand node is provisioned + // Expect the node to consolidate to a spot instance as it will be a cheaper + // instance than on-demand + nodePool.Spec.Disruption.ConsolidateAfter = nil + nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand, corev1beta1.CapacityTypeSpot}, + }, + { + Key: v1beta1.LabelInstanceSize, + Operator: v1.NodeSelectorOpIn, + Values: []string{"large"}, + }, + } + env.ExpectUpdated(nodePool) + + // Eventually expect the on-demand nodes to be consolidated into + // spot nodes after some time + Eventually(func(g Gomega) { + var nodes v1.NodeList + Expect(env.Client.List(env.Context, &nodes)).To(Succeed()) + var spotNodes []*v1.Node + var otherNodes []*v1.Node + for i, n := range nodes.Items { + // only count the nodes created by the nodePool + if 
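[Editor's note: both replace-style consolidation specs end by bucketing nodes on a label value. A sketch of a generic partition helper that could serve both — the `partitionNodesByLabel` name and the `main` harness are invented for illustration:]

```go
package main

import (
	"fmt"

	v1 "k8s.io/api/core/v1"
)

// partitionNodesByLabel splits nodes into those whose label matches value and
// the rest, mirroring the spot/on-demand and large/other bucketing above.
func partitionNodesByLabel(nodes []v1.Node, key, value string) (matched, other []*v1.Node) {
	for i := range nodes {
		if nodes[i].Labels[key] == value {
			matched = append(matched, &nodes[i])
		} else {
			other = append(other, &nodes[i])
		}
	}
	return matched, other
}

func main() {
	nodes := []v1.Node{} // would come from env.Client.List in the specs
	spot, other := partitionNodesByLabel(nodes, "karpenter.sh/capacity-type", "spot")
	fmt.Println(len(spot), len(other))
}
```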
From 3d6bd3f40336a531494e1538d4bcae1f2a243fdb Mon Sep 17 00:00:00 2001
From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com>
Date: Fri, 20 Oct 2023 22:09:44 -0700
Subject: [PATCH 14/47] chore(deps): bump the go-deps group with 1 update (#4879)

Signed-off-by: dependabot[bot]
Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
---
 go.mod | 2 +-
 go.sum | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/go.mod b/go.mod
index da744a5d60d3..69208f082e58 100644
--- a/go.mod
+++ b/go.mod
@@ -6,7 +6,7 @@ require (
 	github.com/Pallinder/go-randomdata v1.2.0
 	github.com/PuerkitoBio/goquery v1.8.1
 	github.com/avast/retry-go v3.0.0+incompatible
-	github.com/aws/aws-sdk-go v1.46.0
+	github.com/aws/aws-sdk-go v1.46.1
 	github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604
 	github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c
 	github.com/go-logr/zapr v1.2.4
diff --git a/go.sum b/go.sum
index 4e834d92c3f0..bf105d6435f5 100644
--- a/go.sum
+++ b/go.sum
@@ -55,8 +55,8 @@ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6
 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY=
 github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0=
 github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY=
-github.com/aws/aws-sdk-go v1.46.0 h1:Igh7W8P+sA6mXJ9yhreOSweefLapcqekhxQlY1llxcM=
-github.com/aws/aws-sdk-go v1.46.0/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
+github.com/aws/aws-sdk-go v1.46.1 h1:U26quvBWFZMQuultLw5tloW4GnmWaChEwMZNq8uYatw=
+github.com/aws/aws-sdk-go v1.46.1/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI=
 github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604 h1:eQFElFqH3K64na70WZBh6FUFonVRKhtyUptWtpO/JdI=
 github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE=
 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M=
From ab8454bde1bdc5130ff740fb1b82aeed9e06e26d Mon Sep 17 00:00:00 2001
From: Christian Melendez
Date: Sat, 21 Oct 2023 07:38:20 +0200
Subject: [PATCH 15/47] docs: Add the Karpenter Blueprints and Karpenter
 Tutorial as additional resources in the Getting Started page (#4854)

---
 website/content/en/docs/getting-started/_index.md    | 2 ++
 website/content/en/preview/getting-started/_index.md | 2 ++
 website/content/en/v0.28/getting-started/_index.md   | 2 ++
 website/content/en/v0.29/getting-started/_index.md   | 2 ++
 website/content/en/v0.30/getting-started/_index.md   | 2 ++
 website/content/en/v0.31/getting-started/_index.md   | 2 ++
 6 files changed, 12 insertions(+)

diff --git a/website/content/en/docs/getting-started/_index.md b/website/content/en/docs/getting-started/_index.md
index 77e7a529548f..c159f68f5e48 100644
--- a/website/content/en/docs/getting-started/_index.md
+++ b/website/content/en/docs/getting-started/_index.md
@@ -24,3 +24,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
diff --git a/website/content/en/preview/getting-started/_index.md b/website/content/en/preview/getting-started/_index.md
index 12fd2a9f9fd2..a70f6bbc422b 100644
--- a/website/content/en/preview/getting-started/_index.md
+++ b/website/content/en/preview/getting-started/_index.md
@@ -22,3 +22,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
diff --git a/website/content/en/v0.28/getting-started/_index.md b/website/content/en/v0.28/getting-started/_index.md
index 77e7a529548f..2108eea40683 100644
--- a/website/content/en/v0.28/getting-started/_index.md
+++ b/website/content/en/v0.28/getting-started/_index.md
@@ -24,3 +24,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
\ No newline at end of file
diff --git a/website/content/en/v0.29/getting-started/_index.md b/website/content/en/v0.29/getting-started/_index.md
index 77e7a529548f..c159f68f5e48 100644
--- a/website/content/en/v0.29/getting-started/_index.md
+++ b/website/content/en/v0.29/getting-started/_index.md
@@ -24,3 +24,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
diff --git a/website/content/en/v0.30/getting-started/_index.md b/website/content/en/v0.30/getting-started/_index.md
index 77e7a529548f..c159f68f5e48 100644
--- a/website/content/en/v0.30/getting-started/_index.md
+++ b/website/content/en/v0.30/getting-started/_index.md
@@ -24,3 +24,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
diff --git a/website/content/en/v0.31/getting-started/_index.md b/website/content/en/v0.31/getting-started/_index.md
index 77e7a529548f..c159f68f5e48 100644
--- a/website/content/en/v0.31/getting-started/_index.md
+++ b/website/content/en/v0.31/getting-started/_index.md
@@ -24,3 +24,5 @@ Learn more about Karpenter and how to get started below.
 * [EC2 Spot Workshop for Karpenter](https://ec2spotworkshops.com/karpenter.html)
 * [EKS Karpenter Workshop](https://www.eksworkshop.com/docs/autoscaling/compute/karpenter/)
 * [Advanced EKS Immersion Karpenter Workshop](https://catalog.workshops.aws/eks-advanced/karpenter/)
+* [Karpenter Blueprints](https://github.com/aws-samples/karpenter-blueprints)
+* [Tutorial: Run Kubernetes Clusters for Less with Amazon EC2 Spot and Karpenter](https://community.aws/tutorials/run-kubernetes-clusters-for-less-with-amazon-ec2-spot-and-karpenter#step-6-optional-simulate-spot-interruption)
From 72d7e666123203f5a883cf9bc4e2357960db058b Mon Sep 17 00:00:00 2001
From: Myles Williams
Date: Fri, 20 Oct 2023 23:40:05 -0600
Subject: [PATCH 16/47] docs: Add section regarding c1.medium and m1.small
 swap provisioning failures. (#4858)

Co-authored-by: Jonathan Innis
---
 website/content/en/docs/troubleshooting.md    | 11 +++++++++++
 website/content/en/preview/troubleshooting.md | 11 +++++++++++
 website/content/en/v0.27/troubleshooting.md   | 11 +++++++++++
 website/content/en/v0.28/troubleshooting.md   | 11 +++++++++++
 website/content/en/v0.29/troubleshooting.md   | 11 +++++++++++
 website/content/en/v0.30/troubleshooting.md   | 11 +++++++++++
 website/content/en/v0.31/troubleshooting.md   | 11 +++++++++++
 7 files changed, 77 insertions(+)

diff --git a/website/content/en/docs/troubleshooting.md b/website/content/en/docs/troubleshooting.md
index d75b3d527d77..28952034ec9d 100644
--- a/website/content/en/docs/troubleshooting.md
+++ b/website/content/en/docs/troubleshooting.md
@@ -179,6 +179,17 @@ approach, and now it's much more restrictive.
## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. diff --git a/website/content/en/preview/troubleshooting.md b/website/content/en/preview/troubleshooting.md index bc50aa8693cc..ae3b1f47bf88 100644 --- a/website/content/en/preview/troubleshooting.md +++ b/website/content/en/preview/troubleshooting.md @@ -179,6 +179,17 @@ approach, and now it's much more restrictive. ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. diff --git a/website/content/en/v0.27/troubleshooting.md b/website/content/en/v0.27/troubleshooting.md index 9b2a09507bb5..81c1b5b0b239 100644 --- a/website/content/en/v0.27/troubleshooting.md +++ b/website/content/en/v0.27/troubleshooting.md @@ -194,6 +194,17 @@ If you see this issue happens while using the`extraObjects` key from the values ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes.
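As a concrete illustration of the "disable swap" mitigation described in the section above, swap can be turned off in the instance's userData before the kubelet starts. The following is a minimal sketch only, assuming an AL2-based AMI that bootstraps through `/etc/eks/bootstrap.sh` (the same MIME userData shape the testdata files later in this series use); the cluster name, endpoint, and CA bundle are placeholders, not values from any patch here:

```bash
MIME-Version: 1.0
Content-Type: multipart/mixed; boundary="BOUNDARY"

--BOUNDARY
Content-Type: text/x-shellscript; charset="us-ascii"

#!/bin/bash
# Turn off any active swap (e.g. the instance-store swap volumes that
# c1.medium and m1.small configure) so the kubelet can start, and comment
# out swap entries in /etc/fstab so the setting survives a reboot.
swapoff -a
sed -i '/\bswap\b/ s/^/#/' /etc/fstab
# Placeholder cluster details; substitute real values.
/etc/eks/bootstrap.sh 'my-cluster' --apiserver-endpoint 'https://example.eks.amazonaws.com' --b64-cluster-ca 'BASE64_ENCODED_CA'

--BOUNDARY--
```

Alternatively, as the Solutions text suggests, these instance types can simply be kept out of the Provisioner requirements in favor of larger instance types.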
diff --git a/website/content/en/v0.28/troubleshooting.md b/website/content/en/v0.28/troubleshooting.md index b4c1a8ef4b60..867a43e8d1fb 100644 --- a/website/content/en/v0.28/troubleshooting.md +++ b/website/content/en/v0.28/troubleshooting.md @@ -179,6 +179,17 @@ approach, and now it's much more restrictive. ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. diff --git a/website/content/en/v0.29/troubleshooting.md b/website/content/en/v0.29/troubleshooting.md index 6cc1a290b4a9..1bfc0b1a543c 100644 --- a/website/content/en/v0.29/troubleshooting.md +++ b/website/content/en/v0.29/troubleshooting.md @@ -179,6 +179,17 @@ approach, and now it's much more restrictive. ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. diff --git a/website/content/en/v0.30/troubleshooting.md b/website/content/en/v0.30/troubleshooting.md index d75b3d527d77..28952034ec9d 100644 --- a/website/content/en/v0.30/troubleshooting.md +++ b/website/content/en/v0.30/troubleshooting.md @@ -179,6 +179,17 @@ approach, and now it's much more restrictive. ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." 
+``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. diff --git a/website/content/en/v0.31/troubleshooting.md b/website/content/en/v0.31/troubleshooting.md index d75b3d527d77..28952034ec9d 100644 --- a/website/content/en/v0.31/troubleshooting.md +++ b/website/content/en/v0.31/troubleshooting.md @@ -179,6 +179,17 @@ approach, and now it's much more restrictive. ## Provisioning +### Instances with swap volumes fail to register with control plane + +Some instance types (c1.medium and m1.small) are given a limited amount of memory (see [Instance Store swap volumes](https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instance-store-swap-volumes.html)). They are subsequently configured to use a swap volume, which will cause the kubelet to fail on launch. The following error can be seen in the systemd logs: + +```bash +"command failed" err="failed to run Kubelet: running with swap on is not supported, please disable swap!..." +``` + +##### Solutions +Disabling swap will allow the kubelet to join the cluster successfully; however, users should be mindful of performance and consider adjusting the Provisioner requirements to use larger instance types. + ### DaemonSets can result in deployment failures For Karpenter versions 0.5.3 and earlier, DaemonSets were not properly considered when provisioning nodes. From af6387d77657ba0cfc32b51f1ac0df826869738c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?P=C3=A5l-Magnus=20Sl=C3=A5tto?= Date: Sun, 22 Oct 2023 22:25:43 +0200 Subject: [PATCH 17/47] docs: Add contribution info (#4882) --- .github/pull-request-template.md | 2 ++ README.md | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/pull-request-template.md b/.github/pull-request-template.md index 8245555f898f..82c72c32b446 100644 --- a/.github/pull-request-template.md +++ b/.github/pull-request-template.md @@ -7,6 +7,8 @@ docs: <-- Documentation changes that do not impact code test: <-- Test changes that do not impact behavior ci: <-- Changes that affect test or rollout automation !${type}: <-- Include ! if your change includes a backwards incompatible change. + +Please review the Karpenter contribution docs at https://karpenter.sh/docs/contributing/ before submitting your pull request. --> Fixes #N/A diff --git a/README.md b/README.md index 543ea6c48aca..3d394c8a789f 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ Karpenter improves the efficiency and cost of running workloads on Kubernetes cl * **Provisioning** nodes that meet the requirements of the pods * **Removing** the nodes when the nodes are no longer needed -Come discuss Karpenter in the [#karpenter](https://kubernetes.slack.com/archives/C02SFFZSA2K) channel, in the [Kubernetes slack](https://slack.k8s.io/) or join the [Karpenter working group](https://karpenter.sh/docs/contributing/working-group/) bi-weekly calls. +Come discuss Karpenter in the [#karpenter](https://kubernetes.slack.com/archives/C02SFFZSA2K) channel, in the [Kubernetes slack](https://slack.k8s.io/) or join the [Karpenter working group](https://karpenter.sh/docs/contributing/working-group/) bi-weekly calls. If you want to contribute to the Karpenter project, please refer to the [Karpenter contribution docs](https://karpenter.sh/docs/contributing/).
Check out the [Docs](https://karpenter.sh/docs/) to learn more. From 6c508720bcca9be64078f897c857e6a5e54ea375 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Sun, 22 Oct 2023 23:12:43 -0700 Subject: [PATCH 18/47] chore: Restrict owner key in `amiSelectorTerms` from being set with tags (#4885) --- .../karpenter.k8s.aws_ec2nodeclasses.yaml | 6 ++++-- pkg/apis/v1beta1/ec2nodeclass.go | 3 ++- pkg/apis/v1beta1/ec2nodeclass_validation.go | 2 ++ .../ec2nodeclass_validation_cel_test.go | 21 ++++++++++++++++++- .../ec2nodeclass_validation_webhook_test.go | 20 ++++++++++++++++++ .../karpenter-convert/pkg/convert/convert.go | 4 ---- 6 files changed, 48 insertions(+), 8 deletions(-) diff --git a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml index 8382d076f248..8045ddb48973 100644 --- a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -90,8 +90,10 @@ spec: rule: self.all(x, has(x.tags) || has(x.id) || has(x.name)) - message: '''id'' is mutually exclusive, cannot be set with a combination of other fields in amiSelectorTerms' - rule: '!self.all(x, has(x.id) && (has(x.tags) || has(x.name)) || - has(x.owner))' + rule: '!self.all(x, has(x.id) && (has(x.tags) || has(x.name) || + has(x.owner)))' + - message: '''owner'' cannot be set with ''tags''' + rule: '!self.all(x, has(x.owner) && has(x.tags))' blockDeviceMappings: description: BlockDeviceMappings to be applied to provisioned nodes. items: diff --git a/pkg/apis/v1beta1/ec2nodeclass.go b/pkg/apis/v1beta1/ec2nodeclass.go index 82e4bd8f884c..fd3dbebd0094 100644 --- a/pkg/apis/v1beta1/ec2nodeclass.go +++ b/pkg/apis/v1beta1/ec2nodeclass.go @@ -43,7 +43,8 @@ type EC2NodeClassSpec struct { SecurityGroupSelectorTerms []SecurityGroupSelectorTerm `json:"securityGroupSelectorTerms" hash:"ignore"` // AMISelectorTerms is a list of or ami selector terms. The terms are ORed. 
// +kubebuilder:validation:XValidation:message="expected at least one, got none, ['tags', 'id', 'name']",rule="self.all(x, has(x.tags) || has(x.id) || has(x.name))" - // +kubebuilder:validation:XValidation:message="'id' is mutually exclusive, cannot be set with a combination of other fields in amiSelectorTerms",rule="!self.all(x, has(x.id) && (has(x.tags) || has(x.name)) || has(x.owner))" + // +kubebuilder:validation:XValidation:message="'id' is mutually exclusive, cannot be set with a combination of other fields in amiSelectorTerms",rule="!self.all(x, has(x.id) && (has(x.tags) || has(x.name) || has(x.owner)))" + // +kubebuilder:validation:XValidation:message="'owner' cannot be set with 'tags'",rule="!self.all(x, has(x.owner) && has(x.tags))" // +kubebuilder:validation:MaxItems:=30 // +optional AMISelectorTerms []AMISelectorTerm `json:"amiSelectorTerms,omitempty" hash:"ignore"` diff --git a/pkg/apis/v1beta1/ec2nodeclass_validation.go b/pkg/apis/v1beta1/ec2nodeclass_validation.go index ae7a96e8fbdf..af35320cda58 100644 --- a/pkg/apis/v1beta1/ec2nodeclass_validation.go +++ b/pkg/apis/v1beta1/ec2nodeclass_validation.go @@ -133,6 +133,8 @@ func (in *AMISelectorTerm) validate() (errs *apis.FieldError) { errs = errs.Also(apis.ErrGeneric("expect at least one, got none", "tags", "id", "name")) } else if in.ID != "" && (len(in.Tags) > 0 || in.Name != "" || in.Owner != "") { errs = errs.Also(apis.ErrGeneric(`"id" is mutually exclusive, cannot be set with a combination of other fields in`)) + } else if in.Owner != "" && len(in.Tags) > 0 { + errs = errs.Also(apis.ErrGeneric(`"owner" cannot be set with "tags" in`)) } return errs } diff --git a/pkg/apis/v1beta1/ec2nodeclass_validation_cel_test.go b/pkg/apis/v1beta1/ec2nodeclass_validation_cel_test.go index a850b5545dfa..4b5aa4e77cd9 100644 --- a/pkg/apis/v1beta1/ec2nodeclass_validation_cel_test.go +++ b/pkg/apis/v1beta1/ec2nodeclass_validation_cel_test.go @@ -35,7 +35,6 @@ var _ = Describe("CEL/Validation", func() { var nc *v1beta1.EC2NodeClass BeforeEach(func() { - env.Version.Minor() if env.Version.Minor() < 25 { Skip("CEL Validation is for 1.25>") } @@ -344,6 +343,15 @@ var _ = Describe("CEL/Validation", func() { } Expect(env.Client.Create(ctx, nc)).To(Succeed()) }) + It("should succeed with a valid ami selector on name and owner", func() { + nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Name: "testname", + Owner: "testowner", + }, + } + Expect(env.Client.Create(ctx, nc)).To(Succeed()) + }) It("should fail when a ami selector term has no values", func() { nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ {}, @@ -378,6 +386,17 @@ var _ = Describe("CEL/Validation", func() { } Expect(env.Client.Create(ctx, nc)).ToNot(Succeed()) }) + It("should fail when an ami selector term has an owner key with tags", func() { + nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Owner: "testowner", + Tags: map[string]string{ + "test": "testvalue", + }, + }, + } + Expect(env.Client.Create(ctx, nc)).ToNot(Succeed()) + }) It("should fail when the last ami selector is invalid", func() { nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ { diff --git a/pkg/apis/v1beta1/ec2nodeclass_validation_webhook_test.go b/pkg/apis/v1beta1/ec2nodeclass_validation_webhook_test.go index 1a7f53836dd8..b4ffc8daf9ca 100644 --- a/pkg/apis/v1beta1/ec2nodeclass_validation_webhook_test.go +++ b/pkg/apis/v1beta1/ec2nodeclass_validation_webhook_test.go @@ -355,6 +355,15 @@ var _ = Describe("Webhook/Validation", func() { } Expect(nc.Validate(ctx)).To(Succeed()) }) + 
It("should succeed with a valid ami selector on name and owner", func() { + nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Name: "testname", + Owner: "testowner", + }, + } + Expect(nc.Validate(ctx)).To(Succeed()) + }) It("should fail when a ami selector term has no values", func() { nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ {}, @@ -389,6 +398,17 @@ var _ = Describe("Webhook/Validation", func() { } Expect(nc.Validate(ctx)).ToNot(Succeed()) }) + It("should fail when an ami selector term has an owner key with tags", func() { + nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + Owner: "testowner", + Tags: map[string]string{ + "test": "testvalue", + }, + }, + } + Expect(nc.Validate(ctx)).ToNot(Succeed()) + }) It("should fail when the last ami selector is invalid", func() { nc.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ { diff --git a/tools/karpenter-convert/pkg/convert/convert.go b/tools/karpenter-convert/pkg/convert/convert.go index 53fc8fd07b11..37b377bf3b81 100644 --- a/tools/karpenter-convert/pkg/convert/convert.go +++ b/tools/karpenter-convert/pkg/convert/convert.go @@ -167,7 +167,6 @@ func (o *Context) RunConvert() error { func dropFields(buffer bytes.Buffer) string { output := buffer.String() output = strings.Replace(output, "status: {}\n", "", -1) - output = strings.Replace(output, " creationTimestamp: null\n", "", -1) output = strings.Replace(output, " creationTimestamp: null\n", "", -1) output = strings.Replace(output, " resources: {}\n", "", -1) @@ -240,9 +239,6 @@ func convertProvisioner(resource runtime.Object, o *Context) runtime.Object { Finalizers: coreprovisioner.Finalizers, } - // Reset timestamp if present - nodepool.Spec.Template.CreationTimestamp = metav1.Time{} - // Cleanup the status provided in input nodepool.Status = corev1beta1.NodePoolStatus{} From 69a9bfb124bca678f54f22c7ceb29d8f7f2d1b1a Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 06:49:34 +0000 Subject: [PATCH 19/47] chore(deps): bump the go-deps group with 1 update (#4886) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 69208f082e58..2fc5ecb3a964 100644 --- a/go.mod +++ b/go.mod @@ -13,7 +13,7 @@ require ( github.com/imdario/mergo v0.3.16 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.13.0 - github.com/onsi/gomega v1.28.0 + github.com/onsi/gomega v1.28.1 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pelletier/go-toml/v2 v2.1.0 github.com/prometheus/client_golang v1.17.0 diff --git a/go.sum b/go.sum index bf105d6435f5..c893c4082f6a 100644 --- a/go.sum +++ b/go.sum @@ -279,8 +279,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo/v2 v2.13.0 h1:0jY9lJquiL8fcf3M4LAXN5aMlS/b2BV86HFFPCPMgE4= github.com/onsi/ginkgo/v2 v2.13.0/go.mod h1:TE309ZR8s5FsKKpuB1YAQYBzCaAfUgatB/xlT/ETL/o= -github.com/onsi/gomega v1.28.0 h1:i2rg/p9n/UqIDAMFUJ6qIUUMcsqOuUHgbpbu235Vr1c= -github.com/onsi/gomega v1.28.0/go.mod h1:A1H2JE76sI14WIP57LMKj7FVfCHx3g3BcZVjJG8bjX8= +github.com/onsi/gomega v1.28.1 h1:MijcGUbfYuznzK/5R4CPNoUP/9Xvuo20sXfEm6XxoTA= +github.com/onsi/gomega v1.28.1/go.mod h1:9sxs+SwGrKI0+PWe4Fxa9tFQQBG5xSsSbMXOI8PPpoQ= 
github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= github.com/pelletier/go-toml/v2 v2.1.0 h1:FnwAJ4oYMvbT/34k9zzHuZNrhlz48GB3/s6at6/MHO4= From 41025d2f475c2ff67765fc1ad2838e2578bb8691 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 09:26:31 -0700 Subject: [PATCH 20/47] test: Add NodeClaim testing for E2E for v1beta1 (#4891) --- .github/workflows/e2e-matrix.yaml | 2 +- .../beta/nodeclaim/garbage_collection_test.go | 149 ++++++++ test/suites/beta/nodeclaim/nodeclaim_test.go | 354 ++++++++++++++++++ test/suites/beta/nodeclaim/suite_test.go | 77 ++++ .../al2_userdata_custom_labels_input.sh | 14 + .../nodeclaim/testdata/al2_userdata_input.sh | 14 + 6 files changed, 609 insertions(+), 1 deletion(-) create mode 100644 test/suites/beta/nodeclaim/garbage_collection_test.go create mode 100644 test/suites/beta/nodeclaim/nodeclaim_test.go create mode 100644 test/suites/beta/nodeclaim/suite_test.go create mode 100644 test/suites/beta/nodeclaim/testdata/al2_userdata_custom_labels_input.sh create mode 100644 test/suites/beta/nodeclaim/testdata/al2_userdata_input.sh diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index a5c633fae5f0..a82e5cfa2b2f 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/test/suites/beta/nodeclaim/garbage_collection_test.go b/test/suites/beta/nodeclaim/garbage_collection_test.go new file mode 100644 index 000000000000..703b4edd505b --- /dev/null +++ b/test/suites/beta/nodeclaim/garbage_collection_test.go @@ -0,0 +1,149 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclaim_test + +import ( + "encoding/base64" + "fmt" + "os" + "time" + + "github.com/aws/aws-sdk-go/aws" + "github.com/aws/aws-sdk-go/service/ec2" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/settings" + awserrors "github.com/aws/karpenter/pkg/errors" + "github.com/aws/karpenter/pkg/utils" + environmentaws "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var _ = Describe("NodeClaimGarbageCollection", func() { + var customAMI string + var instanceInput *ec2.RunInstancesInput + + BeforeEach(func() { + securityGroups := env.GetSecurityGroups(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + subnets := env.GetSubnetNameAndIds(map[string]string{"karpenter.sh/discovery": env.ClusterName}) + Expect(securityGroups).ToNot(HaveLen(0)) + Expect(subnets).ToNot(HaveLen(0)) + + customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + instanceInput = &ec2.RunInstancesInput{ + InstanceType: aws.String("c5.large"), + IamInstanceProfile: &ec2.IamInstanceProfileSpecification{ + Name: aws.String(settings.FromContext(env.Context).DefaultInstanceProfile), + }, + SecurityGroupIds: lo.Map(securityGroups, func(s environmentaws.SecurityGroup, _ int) *string { + return s.GroupIdentifier.GroupId + }), + SubnetId: aws.String(subnets[0].ID), + BlockDeviceMappings: []*ec2.BlockDeviceMapping{ + { + DeviceName: aws.String("/dev/xvda"), + Ebs: &ec2.EbsBlockDevice{ + Encrypted: aws.Bool(true), + DeleteOnTermination: aws.Bool(true), + VolumeType: aws.String(ec2.VolumeTypeGp3), + VolumeSize: aws.Int64(20), + }, + }, + }, + ImageId: aws.String(customAMI), // EKS AL2-based AMI + TagSpecifications: []*ec2.TagSpecification{ + { + ResourceType: aws.String(ec2.ResourceTypeInstance), + Tags: []*ec2.Tag{ + { + Key: aws.String(fmt.Sprintf("kubernetes.io/cluster/%s", env.ClusterName)), + Value: aws.String("owned"), + }, + { + Key: aws.String(corev1beta1.NodePoolLabelKey), + Value: aws.String(nodePool.Name), + }, + }, + }, + }, + MinCount: aws.Int64(1), + MaxCount: aws.Int64(1), + } + }) + It("should succeed to garbage collect an Instance that was launched by a NodeClaim but has no Instance mapping", func() { + // Update the userData for the instance input with the correct NodePool + rawContent, err := os.ReadFile("testdata/al2_userdata_input.sh") + Expect(err).ToNot(HaveOccurred()) + instanceInput.UserData = lo.ToPtr(base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf(string(rawContent), env.ClusterName, + env.ClusterEndpoint, env.ExpectCABundle(), nodePool.Name)))) + + // Create an instance manually to mock Karpenter launching an instance + out := env.ExpectRunInstances(instanceInput) + Expect(out.Instances).To(HaveLen(1)) + + // Always ensure that we cleanup the instance + DeferCleanup(func() { + _, err := env.EC2API.TerminateInstances(&ec2.TerminateInstancesInput{ + InstanceIds: []*string{out.Instances[0].InstanceId}, + }) + if awserrors.IsNotFound(err) { + return + } + Expect(err).ToNot(HaveOccurred()) + }) + + // Wait for the node to register with the cluster + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + + // Update the tags to add the karpenter.sh/managed-by tag + _, err = env.EC2API.CreateTagsWithContext(env.Context, &ec2.CreateTagsInput{ + Resources: []*string{out.Instances[0].InstanceId}, + Tags: []*ec2.Tag{ + { + Key: aws.String(corev1beta1.ManagedByAnnotationKey), + Value: aws.String(env.ClusterName), + }, + }, + }) + Expect(err).ToNot(HaveOccurred()) + + // Eventually expect the node and the 
instance to be removed (shutting-down) + env.EventuallyExpectNotFound(node) + Eventually(func(g Gomega) { + g.Expect(lo.FromPtr(env.GetInstanceByID(aws.StringValue(out.Instances[0].InstanceId)).State.Name)).To(Equal("shutting-down")) + }, time.Second*10).Should(Succeed()) + }) + It("should succeed to garbage collect an Instance that was deleted without the cluster's knowledge", func() { + // Disable the interruption queue for the garbage collection test + env.ExpectSettingsOverridden(v1.EnvVar{Name: "INTERRUPTION_QUEUE", Value: ""}) + + pod := coretest.Pod() + env.ExpectCreated(nodeClass, nodePool, pod) + env.EventuallyExpectHealthy(pod) + node := env.ExpectCreatedNodeCount("==", 1)[0] + + _, err := env.EC2API.TerminateInstances(&ec2.TerminateInstancesInput{ + InstanceIds: aws.StringSlice([]string{lo.Must(utils.ParseInstanceID(node.Spec.ProviderID))}), + }) + Expect(err).ToNot(HaveOccurred()) + + // The garbage collection mechanism should eventually delete this NodeClaim and Node + env.EventuallyExpectNotFound(node) + }) +}) diff --git a/test/suites/beta/nodeclaim/nodeclaim_test.go b/test/suites/beta/nodeclaim/nodeclaim_test.go new file mode 100644 index 000000000000..d9511f4c30a6 --- /dev/null +++ b/test/suites/beta/nodeclaim/nodeclaim_test.go @@ -0,0 +1,354 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclaim_test + +import ( + "encoding/base64" + "fmt" + "os" + "time" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "sigs.k8s.io/controller-runtime/pkg/client" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter-core/pkg/utils/resources" + "github.com/aws/karpenter/pkg/apis/v1beta1" +) + +var _ = Describe("StandaloneNodeClaim", func() { + It("should create a standard NodeClaim within the 'c' instance family", func() { + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeClaim = env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + Expect(node.Labels).To(HaveKeyWithValue(v1beta1.LabelInstanceCategory, "c")) + env.EventuallyExpectNodeClaimsReady(nodeClaim) + }) + It("should create a standard NodeClaim based on resource requests", func() { + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Resources: corev1beta1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("3"), + v1.ResourceMemory: resource.MustParse("64Gi"), + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeClaim = env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + Expect(resources.Fits(nodeClaim.Spec.Resources.Requests, node.Status.Allocatable)) + env.EventuallyExpectNodeClaimsReady(nodeClaim) + }) + It("should create a NodeClaim propagating all the NodeClaim spec details", func() { + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + "custom-annotation": "custom-value", + }, + Labels: map[string]string{ + "custom-label": "custom-value", + }, + }, + Spec: corev1beta1.NodeClaimSpec{ + Taints: []v1.Taint{ + { + Key: "custom-taint", + Effect: v1.TaintEffectNoSchedule, + Value: "custom-value", + }, + { + Key: "other-custom-taint", + Effect: v1.TaintEffectNoExecute, + Value: "other-custom-value", + }, + }, + Resources: corev1beta1.ResourceRequirements{ + Requests: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("3"), + v1.ResourceMemory: resource.MustParse("16Gi"), + }, + }, + Kubelet: &corev1beta1.KubeletConfiguration{ + MaxPods: lo.ToPtr[int32](110), + PodsPerCore: lo.ToPtr[int32](10), + SystemReserved: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("200Mi"), + v1.ResourceEphemeralStorage: resource.MustParse("1Gi"), + }, + KubeReserved: v1.ResourceList{ + v1.ResourceCPU: resource.MustParse("200m"), + v1.ResourceMemory: resource.MustParse("200Mi"), + v1.ResourceEphemeralStorage: resource.MustParse("1Gi"), + }, + EvictionHard: map[string]string{ + "memory.available": "5%", + "nodefs.available": "5%", + "nodefs.inodesFree": "5%", + "imagefs.available": "5%", + "imagefs.inodesFree": "5%", + "pid.available": "3%", + }, + 
EvictionSoft: map[string]string{ + "memory.available": "10%", + "nodefs.available": "10%", + "nodefs.inodesFree": "10%", + "imagefs.available": "10%", + "imagefs.inodesFree": "10%", + "pid.available": "6%", + }, + EvictionSoftGracePeriod: map[string]metav1.Duration{ + "memory.available": {Duration: time.Minute * 2}, + "nodefs.available": {Duration: time.Minute * 2}, + "nodefs.inodesFree": {Duration: time.Minute * 2}, + "imagefs.available": {Duration: time.Minute * 2}, + "imagefs.inodesFree": {Duration: time.Minute * 2}, + "pid.available": {Duration: time.Minute * 2}, + }, + EvictionMaxPodGracePeriod: lo.ToPtr[int32](120), + ImageGCHighThresholdPercent: lo.ToPtr[int32](50), + ImageGCLowThresholdPercent: lo.ToPtr[int32](10), + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + Expect(node.Annotations).To(HaveKeyWithValue("custom-annotation", "custom-value")) + Expect(node.Labels).To(HaveKeyWithValue("custom-label", "custom-value")) + Expect(node.Spec.Taints).To(ContainElements( + v1.Taint{ + Key: "custom-taint", + Effect: v1.TaintEffectNoSchedule, + Value: "custom-value", + }, + v1.Taint{ + Key: "other-custom-taint", + Effect: v1.TaintEffectNoExecute, + Value: "other-custom-value", + }, + )) + Expect(node.OwnerReferences).To(ContainElement( + metav1.OwnerReference{ + APIVersion: corev1beta1.SchemeGroupVersion.String(), + Kind: "NodeClaim", + Name: nodeClaim.Name, + UID: nodeClaim.UID, + BlockOwnerDeletion: lo.ToPtr(true), + }, + )) + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nodeClaim) + }) + It("should remove the cloudProvider NodeClaim when the cluster NodeClaim is deleted", func() { + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeClaim = env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + + instanceID := env.ExpectParsedProviderID(node.Spec.ProviderID) + env.GetInstance(node.Name) + + // Node is deleted and now should be not found + env.ExpectDeleted(nodeClaim) + env.EventuallyExpectNotFound(nodeClaim, node) + + Eventually(func(g Gomega) { + g.Expect(lo.FromPtr(env.GetInstanceByID(instanceID).State.Name)).To(Equal("shutting-down")) + }, time.Second*10).Should(Succeed()) + }) + It("should delete a NodeClaim from the node termination finalizer", func() { + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + nodeClaim = 
env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + + instanceID := env.ExpectParsedProviderID(node.Spec.ProviderID) + env.GetInstance(node.Name) + + // Delete the node and expect both the node and nodeClaim to be gone as well as the instance to be shutting-down + env.ExpectDeleted(node) + env.EventuallyExpectNotFound(nodeClaim, node) + + Eventually(func(g Gomega) { + g.Expect(lo.FromPtr(env.GetInstanceByID(instanceID).State.Name)).To(Equal("shutting-down")) + }, time.Second*10).Should(Succeed()) + }) + It("should create a NodeClaim with custom labels passed through the userData", func() { + customAMI := env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + // Read the base userData for the custom AL2 AMI + rawContent, err := os.ReadFile("testdata/al2_userdata_custom_labels_input.sh") + Expect(err).ToNot(HaveOccurred()) + + // Create userData that adds custom labels through the --kubelet-extra-args + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + nodeClass.Spec.UserData = lo.ToPtr(base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf(string(rawContent), env.ClusterName, + env.ClusterEndpoint, env.ExpectCABundle())))) + + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c"}, + }, + { + Key: v1.LabelArchStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"amd64"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + }, + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + env.ExpectCreated(nodeClass, nodeClaim) + node := env.EventuallyExpectInitializedNodeCount("==", 1)[0] + Expect(node.Labels).To(HaveKeyWithValue("custom-label", "custom-value")) + Expect(node.Labels).To(HaveKeyWithValue("custom-label2", "custom-value2")) + + env.EventuallyExpectCreatedNodeClaimCount("==", 1) + env.EventuallyExpectNodeClaimsReady(nodeClaim) + }) + It("should delete a NodeClaim after the registration timeout when the node doesn't register", func() { + customAMI := env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + // Read the base userData for the custom AL2 AMI + rawContent, err := os.ReadFile("testdata/al2_userdata_input.sh") + Expect(err).ToNot(HaveOccurred()) + + // Point the nodeClass at a custom AL2 AMI that bootstraps with this userData + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyCustom + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + + // Giving a bad clusterName and clusterEndpoint to the userData so the node can never register + nodeClass.Spec.UserData = lo.ToPtr(base64.StdEncoding.EncodeToString([]byte(fmt.Sprintf(string(rawContent), "badName", "badEndpoint", env.ExpectCABundle())))) + + nodeClaim := test.NodeClaim(corev1beta1.NodeClaim{ + Spec: corev1beta1.NodeClaimSpec{ + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.LabelInstanceCategory, + Operator: v1.NodeSelectorOpIn, + Values: []string{"c"}, + }, + { + Key: v1.LabelArchStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"amd64"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }, + }, + NodeClassRef: 
&corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }) + + env.ExpectCreated(nodeClass, nodeClaim) + nodeClaim = env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + + // Expect that the nodeClaim eventually launches and has false Registration/Initialization + Eventually(func(g Gomega) { + temp := &corev1beta1.NodeClaim{} + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(nodeClaim), temp)).To(Succeed()) + g.Expect(temp.StatusConditions().GetCondition(corev1beta1.Launched).IsTrue()).To(BeTrue()) + g.Expect(temp.StatusConditions().GetCondition(corev1beta1.Registered).IsFalse()).To(BeTrue()) + g.Expect(temp.StatusConditions().GetCondition(corev1beta1.Initialized).IsFalse()).To(BeTrue()) + }).Should(Succeed()) + + // Expect that the nodeClaim is eventually de-provisioned due to the registration timeout + env.EventuallyExpectNotFoundAssertion(nodeClaim).WithTimeout(time.Minute * 20).Should(Succeed()) + }) +}) diff --git a/test/suites/beta/nodeclaim/suite_test.go b/test/suites/beta/nodeclaim/suite_test.go new file mode 100644 index 000000000000..2d8b16c45dc5 --- /dev/null +++ b/test/suites/beta/nodeclaim/suite_test.go @@ -0,0 +1,77 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodeclaim_test + +import ( + "fmt" + "testing" + + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestNodeClaim(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/NodeClaim") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }, + }, + }) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) diff --git a/test/suites/beta/nodeclaim/testdata/al2_userdata_custom_labels_input.sh b/test/suites/beta/nodeclaim/testdata/al2_userdata_custom_labels_input.sh new file mode 100644 index 000000000000..fc1d8a3853f8 --- /dev/null +++ b/test/suites/beta/nodeclaim/testdata/al2_userdata_custom_labels_input.sh @@ -0,0 +1,14 @@ +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="BOUNDARY" + +--BOUNDARY +Content-Type: text/x-shellscript; charset="us-ascii" + +#!/bin/bash +exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 +/etc/eks/bootstrap.sh '%s' --apiserver-endpoint '%s' --b64-cluster-ca '%s' \ +--use-max-pods false \ +--container-runtime containerd \ +--kubelet-extra-args '--node-labels=testing/cluster=unspecified,custom-label=custom-value,custom-label2=custom-value2' + +--BOUNDARY-- diff --git a/test/suites/beta/nodeclaim/testdata/al2_userdata_input.sh b/test/suites/beta/nodeclaim/testdata/al2_userdata_input.sh new file mode 100644 index 000000000000..9a40bf4562a4 --- /dev/null +++ b/test/suites/beta/nodeclaim/testdata/al2_userdata_input.sh @@ -0,0 +1,14 @@ +MIME-Version: 1.0 +Content-Type: multipart/mixed; boundary="BOUNDARY" + +--BOUNDARY +Content-Type: text/x-shellscript; charset="us-ascii" + +#!/bin/bash +exec > >(tee /var/log/user-data.log|logger -t user-data -s 2>/dev/console) 2>&1 +/etc/eks/bootstrap.sh '%s' --apiserver-endpoint '%s' --b64-cluster-ca '%s' \ +--use-max-pods false \ +--container-runtime containerd \ +--kubelet-extra-args '--node-labels=karpenter.sh/nodepool=%s,testing/cluster=unspecified' + +--BOUNDARY-- From 358f151a9a4e85a3ee6f8e71f826427c9fb73728 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 09:26:42 -0700 Subject: [PATCH 21/47] docs: Fix typo of NodePools in `upgrade-guide.md` (#4892) --- website/content/en/preview/upgrading/upgrade-guide.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 5de3c4e81b52..462681831765 100644 --- 
a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -28,7 +28,7 @@ If you get the error `invalid ownership metadata; label validation error:` while In general, you can reapply the CRDs in the `crds` directory of the Karpenter helm chart: ```shell -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodepols.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodepools.yaml kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodeclaims.yaml kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml ``` From 2529674ab1bb0b6f2e7490751aa31b24018e16fd Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 09:41:04 -0700 Subject: [PATCH 22/47] test: Add Expiration testing for E2E for v1beta1 (#4887) --- .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 2 + test/pkg/environment/common/environment.go | 8 + test/pkg/environment/common/expectations.go | 36 +- .../suites/beta/expiration/expiration_test.go | 355 ++++++++++++++++++ 5 files changed, 401 insertions(+), 2 deletions(-) create mode 100644 test/suites/beta/expiration/expiration_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index a82e5cfa2b2f..6e76c78540f3 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index fde21f6bcec9..0da41c069041 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -18,6 +18,8 @@ on: - Beta/Integration - Beta/Drift - Beta/Consolidation + - Beta/Expiration + - Beta/NodeClaim - Alpha/Integration - Alpha/Machine - Alpha/Consolidation diff --git a/test/pkg/environment/common/environment.go b/test/pkg/environment/common/environment.go index 026aa71f8368..bab195b7c677 100644 --- a/test/pkg/environment/common/environment.go +++ b/test/pkg/environment/common/environment.go @@ -37,6 +37,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/client" coreapis "github.com/aws/karpenter-core/pkg/apis" + "github.com/aws/karpenter-core/pkg/apis/v1beta1" "github.com/aws/karpenter-core/pkg/operator" "github.com/aws/karpenter-core/pkg/operator/injection" "github.com/aws/karpenter/pkg/apis" @@ -116,6 +117,13 @@ func NewClient(ctx context.Context, config *rest.Config) client.Client { node := o.(*v1.Node) return []string{strconv.FormatBool(node.Spec.Unschedulable)} })) + lo.Must0(cache.IndexField(ctx, &v1.Node{}, "spec.taints[*].karpenter.sh/disruption", func(o client.Object) []string { + node := o.(*v1.Node) + t, _ := lo.Find(node.Spec.Taints, func(t v1.Taint) bool { + return t.Key == v1beta1.DisruptionTaintKey + }) + return 
[]string{t.Value} + })) c := lo.Must(client.New(config, client.Options{Scheme: scheme, Cache: &client.CacheOptions{Reader: cache}})) diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go index 8f3844da19b8..d0a9905cb610 100644 --- a/test/pkg/environment/common/expectations.go +++ b/test/pkg/environment/common/expectations.go @@ -460,7 +460,7 @@ func (env *Environment) ConsistentlyExpectMachineCount(comparator string, count return lo.ToSlicePtr(machineList.Items) } -func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, count int) []*v1.Node { +func (env *Environment) EventuallyExpectCordonedNodeCountLegacy(comparator string, count int) []*v1.Node { GinkgoHelper() By(fmt.Sprintf("waiting for cordoned nodes to be %s to %d", comparator, count)) nodeList := &v1.NodeList{} @@ -472,6 +472,40 @@ func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, cou return lo.ToSlicePtr(nodeList.Items) } +func (env *Environment) EventuallyExpectNodesUncordonedLegacyWithTimeout(timeout time.Duration, nodes ...*v1.Node) { + GinkgoHelper() + By(fmt.Sprintf("waiting for %d nodes to be uncordoned", len(nodes))) + nodeList := &v1.NodeList{} + Eventually(func(g Gomega) { + g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.unschedulable": "true"})).To(Succeed()) + cordonedNodeNames := lo.Map(nodeList.Items, func(n v1.Node, _ int) string { return n.Name }) + g.Expect(cordonedNodeNames).ToNot(ContainElements(lo.Map(nodes, func(n *v1.Node, _ int) interface{} { return n.Name })...)) + }).WithTimeout(timeout).Should(Succeed()) +} + +func (env *Environment) EventuallyExpectCordonedNodeCount(comparator string, count int) []*v1.Node { + GinkgoHelper() + By(fmt.Sprintf("waiting for cordoned nodes to be %s to %d", comparator, count)) + nodeList := &v1.NodeList{} + Eventually(func(g Gomega) { + g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.taints[*].karpenter.sh/disruption": "disrupting"})).To(Succeed()) + g.Expect(len(nodeList.Items)).To(BeNumerically(comparator, count), + fmt.Sprintf("expected %d cordoned nodes, had %d (%v)", count, len(nodeList.Items), NodeNames(lo.ToSlicePtr(nodeList.Items)))) + }).Should(Succeed()) + return lo.ToSlicePtr(nodeList.Items) +} + +func (env *Environment) EventuallyExpectNodesUncordonedWithTimeout(timeout time.Duration, nodes ...*v1.Node) { + GinkgoHelper() + By(fmt.Sprintf("waiting for %d nodes to be uncordoned", len(nodes))) + nodeList := &v1.NodeList{} + Eventually(func(g Gomega) { + g.Expect(env.Client.List(env, nodeList, client.MatchingFields{"spec.taints[*].karpenter.sh/disruption": "disrupting"})).To(Succeed()) + cordonedNodeNames := lo.Map(nodeList.Items, func(n v1.Node, _ int) string { return n.Name }) + g.Expect(cordonedNodeNames).ToNot(ContainElements(lo.Map(nodes, func(n *v1.Node, _ int) interface{} { return n.Name })...)) + }).WithTimeout(timeout).Should(Succeed()) +} + func (env *Environment) EventuallyExpectNodeCount(comparator string, count int) []*v1.Node { GinkgoHelper() By(fmt.Sprintf("waiting for nodes to be %s to %d", comparator, count)) diff --git a/test/suites/beta/expiration/expiration_test.go b/test/suites/beta/expiration/expiration_test.go new file mode 100644 index 000000000000..224d12827e8f --- /dev/null +++ b/test/suites/beta/expiration/expiration_test.go @@ -0,0 +1,355 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package expiration_test + +import ( + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/sets" + "sigs.k8s.io/controller-runtime/pkg/client" + + "github.com/aws/aws-sdk-go/service/ssm" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment/aws" + + coretest "github.com/aws/karpenter-core/pkg/test" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestExpiration(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/Expiration") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }, + Disruption: corev1beta1.Disruption{ + ExpireAfter: corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)}, + }, + }, + }) +}) + +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Expiration", func() { + It("should expire the node after the expiration is reached", func() { + var numPods int32 = 1 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: numPods, + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Annotations: map[string]string{ + corev1beta1.DoNotDisruptAnnotationKey: "true", + }, + Labels: map[string]string{"app": "large-app"}, + }, + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + env.ExpectCreated(nodeClass, nodePool, dep) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectCreatedNodeCount("==", 1)[0] + env.EventuallyExpectHealthyPodCount(selector, int(numPods)) + env.Monitor.Reset() // Reset the monitor so that we can expect a single node to be spun up after expiration + + // Expect that the NodeClaim will get an expired status condition + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + 
g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+ }).Should(Succeed())
+
+ // Remove the do-not-disrupt annotation so that the Nodes are now deprovisionable
+ for _, pod := range env.ExpectPodsMatchingSelector(selector) {
+ delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey)
+ env.ExpectUpdated(pod)
+ }
+
+ // Eventually the node will be set as unschedulable, which means it's actively being deprovisioned
+ Eventually(func(g Gomega) {
+ n := &v1.Node{}
+ g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed())
+ _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
+ return corev1beta1.IsDisruptingTaint(t)
+ })
+ g.Expect(ok).To(BeTrue())
+ }).Should(Succeed())
+
+ // Set the expireAfter to "Never" to make sure the new node isn't deleted
+ // This is CRITICAL since it prevents nodes that are immediately spun up from immediately being expired and
+ // racing at the end of the E2E test, leaking node resources into subsequent tests
+ nodePool.Spec.Disruption.ExpireAfter.Duration = nil
+ env.ExpectUpdated(nodePool)
+
+ // After the deletion timestamp is set and all pods are drained,
+ // the node should be gone
+ env.EventuallyExpectNotFound(nodeClaim, node)
+
+ env.EventuallyExpectCreatedNodeClaimCount("==", 1)
+ env.EventuallyExpectCreatedNodeCount("==", 1)
+ env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+ })
+ It("should replace expired node with a single node and schedule all pods", func() {
+ var numPods int32 = 5
+ // Set up a PDB that allows at most 1 pod to be unavailable at a time
+ minAvailable := intstr.FromInt32(numPods - 1)
+ pdb := coretest.PodDisruptionBudget(coretest.PDBOptions{
+ Labels: map[string]string{
+ "app": "large-app",
+ },
+ MinAvailable: &minAvailable,
+ })
+ dep := coretest.Deployment(coretest.DeploymentOptions{
+ Replicas: numPods,
+ PodOptions: coretest.PodOptions{
+ ObjectMeta: metav1.ObjectMeta{
+ Annotations: map[string]string{
+ corev1beta1.DoNotDisruptAnnotationKey: "true",
+ },
+ Labels: map[string]string{"app": "large-app"},
+ },
+ },
+ })
+ selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+ env.ExpectCreated(nodeClass, nodePool, pdb, dep)
+
+ nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+ node := env.EventuallyExpectCreatedNodeCount("==", 1)[0]
+ env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+ env.Monitor.Reset() // Reset the monitor so that we can expect a single node to be spun up after expiration
+
+ // Set the expireAfter value to get the node deleted
+ nodePool.Spec.Disruption.ExpireAfter.Duration = lo.ToPtr(time.Minute)
+ env.ExpectUpdated(nodePool)
+
+ // Expect that the NodeClaim will get an expired status condition
+ Eventually(func(g Gomega) {
+ g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed())
+ g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+ }).Should(Succeed())
+
+ // Remove the do-not-disrupt annotation so that the Nodes are now deprovisionable
+ for _, pod := range env.ExpectPodsMatchingSelector(selector) {
+ delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey)
+ env.ExpectUpdated(pod)
+ }
+
+ // Eventually the node will be set as unschedulable, which means it's actively being deprovisioned
+ Eventually(func(g Gomega) {
+ n := &v1.Node{}
+ g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed())
+ _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
+ return corev1beta1.IsDisruptingTaint(t)
+ })
+ g.Expect(ok).To(BeTrue())
+ }).Should(Succeed())
+
+ // Set the expireAfter to "Never" to make sure the new node isn't deleted
+ // This is CRITICAL since it prevents nodes that are immediately spun up from immediately being expired and
+ // racing at the end of the E2E test, leaking node resources into subsequent tests
+ nodePool.Spec.Disruption.ExpireAfter.Duration = nil
+ env.ExpectUpdated(nodePool)
+
+ // After the deletion timestamp is set and all pods are drained,
+ // the node should be gone
+ env.EventuallyExpectNotFound(nodeClaim, node)
+
+ env.EventuallyExpectCreatedNodeClaimCount("==", 1)
+ env.EventuallyExpectCreatedNodeCount("==", 1)
+ env.EventuallyExpectHealthyPodCount(selector, int(numPods))
+ })
+ Context("Expiration Failure", func() {
+ It("should not continue to expire if a node never registers", func() {
+ // Launch a new NodeClaim
+ var numPods int32 = 2
+ dep := coretest.Deployment(coretest.DeploymentOptions{
+ Replicas: 2,
+ PodOptions: coretest.PodOptions{
+ ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+ PodAntiRequirements: []v1.PodAffinityTerm{{
+ TopologyKey: v1.LabelHostname,
+ LabelSelector: &metav1.LabelSelector{
+ MatchLabels: map[string]string{"app": "inflate"},
+ }},
+ },
+ },
+ })
+ env.ExpectCreated(dep, nodeClass, nodePool)
+
+ startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+ env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+ // Set a configuration that will not register a NodeClaim
+ parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{
+ Name: lo.ToPtr("/aws/service/ami-amazon-linux-latest/amzn-ami-hvm-x86_64-ebs"),
+ })
+ Expect(err).ToNot(HaveOccurred())
+ nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{
+ {
+ ID: *parameter.Parameter.Value,
+ },
+ }
+ env.ExpectCreatedOrUpdated(nodeClass)
+
+ // Should see that the NodeClaim has expired
+ Eventually(func(g Gomega) {
+ for _, nc := range startingNodeClaimState {
+ g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nc), nc)).To(Succeed())
+ g.Expect(nc.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+ }
+ }).Should(Succeed())
+
+ // Expect nodes to get cordoned
+ cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+ // Expire should fail and the original node should be uncordoned
+ // TODO: reduce timeouts when deprovisioning waits are factored out
+ env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...)
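+ // Since these replacements can never register, Karpenter should also clean up their NodeClaims on its own
+ // (assumed here to happen via the registration liveness timeout)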
+
+
+ // The NodeClaims that never register will be removed
+ Eventually(func(g Gomega) {
+ nodeClaims := &corev1beta1.NodeClaimList{}
+ g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+ g.Expect(len(nodeClaims.Items)).To(BeNumerically("==", int(numPods)))
+ }).WithTimeout(6 * time.Minute).Should(Succeed())
+
+ // Expect all the NodeClaims that existed on the initial provisioning loop are not removed
+ Consistently(func(g Gomega) {
+ nodeClaims := &corev1beta1.NodeClaimList{}
+ g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed())
+
+ startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(nc *corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+ nodeClaimUIDs := lo.Map(nodeClaims.Items, func(nc corev1beta1.NodeClaim, _ int) types.UID { return nc.UID })
+ g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue())
+ }, "2m").Should(Succeed())
+ })
+ It("should not continue to expire if a node registers but never becomes initialized", func() {
+ // Set a configuration that will keep a NodeClaim from becoming initialized
+ nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com/taint", Effect: v1.TaintEffectPreferNoSchedule}}
+
+ // Launch a new NodeClaim
+ var numPods int32 = 2
+ dep := coretest.Deployment(coretest.DeploymentOptions{
+ Replicas: 2,
+ PodOptions: coretest.PodOptions{
+ ObjectMeta: metav1.ObjectMeta{Labels: map[string]string{"app": "inflate"}},
+ PodAntiRequirements: []v1.PodAffinityTerm{{
+ TopologyKey: v1.LabelHostname,
+ LabelSelector: &metav1.LabelSelector{
+ MatchLabels: map[string]string{"app": "inflate"},
+ }},
+ },
+ },
+ })
+ env.ExpectCreated(dep, nodeClass, nodePool)
+
+ startingNodeClaimState := env.EventuallyExpectCreatedNodeClaimCount("==", int(numPods))
+ nodes := env.EventuallyExpectCreatedNodeCount("==", int(numPods))
+
+ // Remove the startup taints from these nodes to initialize them
+ Eventually(func(g Gomega) {
+ for _, node := range nodes {
+ g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).To(Succeed())
+ stored := node.DeepCopy()
+ node.Spec.Taints = lo.Reject(node.Spec.Taints, func(t v1.Taint, _ int) bool { return t.Key == "example.com/taint" })
+ g.Expect(env.Client.Patch(env.Context, node, client.MergeFrom(stored))).To(Succeed())
+ }
+ }).Should(Succeed())
+
+ // Should see that the NodeClaim has expired
+ Eventually(func(g Gomega) {
+ for _, nc := range startingNodeClaimState {
+ g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nc), nc)).To(Succeed())
+ g.Expect(nc.StatusConditions().GetCondition(corev1beta1.Expired).IsTrue()).To(BeTrue())
+ }
+ }).Should(Succeed())
+
+ // Expect nodes to be cordoned
+ cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+
+ // Expire should fail and the original node should be uncordoned and no NodeClaims should be removed
+ // TODO: reduce timeouts when deprovisioning waits are factored out
+ env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...)
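+ // Unlike the unregistered case above, these NodeClaims did register, so they should not be garbage
+ // collected; only the in-flight disruption is expected to roll back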
+ + // Expect that the new NodeClaim/Node is kept around after the un-cordon + nodeList := &v1.NodeList{} + Expect(env.Client.List(env, nodeList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeList.Items).To(HaveLen(int(numPods) + 1)) + + nodeClaimList := &corev1beta1.NodeClaimList{} + Expect(env.Client.List(env, nodeClaimList, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + Expect(nodeClaimList.Items).To(HaveLen(int(numPods) + 1)) + + // Expect all the NodeClaims that existed on the initial provisioning loop are not removed + Consistently(func(g Gomega) { + nodeClaims := &corev1beta1.NodeClaimList{} + g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + + startingNodeClaimUIDs := lo.Map(startingNodeClaimState, func(nc *corev1beta1.NodeClaim, _ int) types.UID { return nc.UID }) + nodeClaimUIDs := lo.Map(nodeClaims.Items, func(nc corev1beta1.NodeClaim, _ int) types.UID { return nc.UID }) + g.Expect(sets.New(nodeClaimUIDs...).IsSuperset(sets.New(startingNodeClaimUIDs...))).To(BeTrue()) + }, "2m").Should(Succeed()) + }) + }) +}) From 85ede37530fa0a8ce9f2d9b9b405e65972d1059c Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 10:40:53 -0700 Subject: [PATCH 23/47] test: Add Chaos testing for E2E for v1beta1 (#4888) --- .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 1 + test/suites/beta/chaos/suite_test.go | 249 +++++++++++++++++++++++++++ 3 files changed, 251 insertions(+), 1 deletion(-) create mode 100644 test/suites/beta/chaos/suite_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index 6e76c78540f3..8bbebb3148d9 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Chaos, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 0da41c069041..55cda1920501 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -18,6 +18,7 @@ on: - Beta/Integration - Beta/Drift - Beta/Consolidation + - Beta/Chaos - Beta/Expiration - Beta/NodeClaim - Alpha/Integration diff --git a/test/suites/beta/chaos/suite_test.go b/test/suites/beta/chaos/suite_test.go new file mode 100644 index 000000000000..6674568a0ee5 --- /dev/null +++ b/test/suites/beta/chaos/suite_test.go @@ -0,0 +1,249 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package chaos_test + +import ( + "context" + "fmt" + "sync/atomic" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/informers" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + controllerruntime "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/manager" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + nodeutils "github.com/aws/karpenter-core/pkg/utils/node" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/debug" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestChaos(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/Chaos") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }, + }, + }) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Chaos", func() { + Describe("Runaway Scale-Up", func() { + It("should not produce a runaway scale-up when consolidation is enabled", Label(debug.NoWatch), Label(debug.NoEvents), func() { + ctx, cancel := context.WithCancel(env.Context) + defer cancel() + + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{ + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeSpot}, + }) + nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized + nodePool.Spec.Disruption.ConsolidateAfter = nil + + numPods := 1 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: lo.ToPtr[int64](0), + }, + }) + // Start a controller that adds taints to nodes after creation + Expect(startTaintAdder(ctx, env.Config)).To(Succeed()) + startNodeCountMonitor(ctx, env.Client) + + // Create a deployment with a single pod + env.ExpectCreated(nodeClass, nodePool, dep) + + // Expect that we never get over a high number of nodes + Consistently(func(g Gomega) { + list := &v1.NodeList{} + g.Expect(env.Client.List(env.Context, list, 
client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + g.Expect(len(list.Items)).To(BeNumerically("<", 35)) + }, time.Minute*5).Should(Succeed()) + }) + It("should not produce a runaway scale-up when emptiness is enabled", Label(debug.NoWatch), Label(debug.NoEvents), func() { + ctx, cancel := context.WithCancel(env.Context) + defer cancel() + + nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenEmpty + nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{Duration: lo.ToPtr(30 * time.Second)} + numPods := 1 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: lo.ToPtr[int64](0), + }, + }) + // Start a controller that adds taints to nodes after creation + Expect(startTaintAdder(ctx, env.Config)).To(Succeed()) + startNodeCountMonitor(ctx, env.Client) + + // Create a deployment with a single pod + env.ExpectCreated(nodeClass, nodePool, dep) + + // Expect that we never get over a high number of nodes + Consistently(func(g Gomega) { + list := &v1.NodeList{} + g.Expect(env.Client.List(env.Context, list, client.HasLabels{coretest.DiscoveryLabel})).To(Succeed()) + g.Expect(len(list.Items)).To(BeNumerically("<", 35)) + }, time.Minute*5).Should(Succeed()) + }) + }) +}) + +type taintAdder struct { + kubeClient client.Client +} + +func (t *taintAdder) Reconcile(ctx context.Context, req reconcile.Request) (reconcile.Result, error) { + node := &v1.Node{} + if err := t.kubeClient.Get(ctx, req.NamespacedName, node); err != nil { + return reconcile.Result{}, client.IgnoreNotFound(err) + } + mergeFrom := client.MergeFrom(node.DeepCopy()) + taint := v1.Taint{ + Key: "test", + Value: "true", + Effect: v1.TaintEffectNoExecute, + } + if !lo.Contains(node.Spec.Taints, taint) { + node.Spec.Taints = append(node.Spec.Taints, taint) + if err := t.kubeClient.Patch(ctx, node, mergeFrom); err != nil { + return reconcile.Result{}, err + } + } + return reconcile.Result{}, nil +} + +func (t *taintAdder) Builder(mgr manager.Manager) *controllerruntime.Builder { + return controllerruntime.NewControllerManagedBy(mgr). + For(&v1.Node{}). 
+ WithEventFilter(predicate.NewPredicateFuncs(func(obj client.Object) bool { + node := obj.(*v1.Node) + if _, ok := node.Labels[coretest.DiscoveryLabel]; !ok { + return false + } + return true + })) +} + +func startTaintAdder(ctx context.Context, config *rest.Config) error { + mgr, err := controllerruntime.NewManager(config, controllerruntime.Options{}) + if err != nil { + return err + } + adder := &taintAdder{kubeClient: mgr.GetClient()} + if err = adder.Builder(mgr).Complete(adder); err != nil { + return err + } + go func() { + Expect(mgr.Start(ctx)).To(Succeed()) + }() + return nil +} + +func startNodeCountMonitor(ctx context.Context, kubeClient client.Client) { + createdNodes := atomic.Int64{} + deletedNodes := atomic.Int64{} + + factory := informers.NewSharedInformerFactoryWithOptions(env.KubeClient, time.Second*30, + informers.WithTweakListOptions(func(l *metav1.ListOptions) { l.LabelSelector = corev1beta1.NodePoolLabelKey })) + nodeInformer := factory.Core().V1().Nodes().Informer() + _ = lo.Must(nodeInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(_ interface{}) { + createdNodes.Add(1) + }, + DeleteFunc: func(_ interface{}) { + deletedNodes.Add(1) + }, + })) + factory.Start(ctx.Done()) + go func() { + for { + list := &v1.NodeList{} + if err := kubeClient.List(ctx, list, client.HasLabels{coretest.DiscoveryLabel}); err == nil { + readyCount := lo.CountBy(list.Items, func(n v1.Node) bool { + return nodeutils.GetCondition(&n, v1.NodeReady).Status == v1.ConditionTrue + }) + fmt.Printf("[NODE COUNT] CURRENT: %d | READY: %d | CREATED: %d | DELETED: %d\n", len(list.Items), readyCount, createdNodes.Load(), deletedNodes.Load()) + } + select { + case <-ctx.Done(): + return + case <-time.After(time.Second * 5): + } + } + }() +} From 2b795f2e29bb796ffef73ce4aa9bc7eda0c0be47 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 11:00:53 -0700 Subject: [PATCH 24/47] test: Add Interruption testing for E2E for v1beta1 (#4889) --- .github/workflows/e2e-matrix.yaml | 2 +- .github/workflows/e2e.yaml | 1 + test/suites/beta/interruption/suite_test.go | 250 ++++++++++++++++++++ 3 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 test/suites/beta/interruption/suite_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index 8bbebb3148d9..33af1b767a3b 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,7 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Chaos, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Interruption, Beta/Chaos, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 55cda1920501..56088257d93b 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -18,6 +18,7 @@ on: - Beta/Integration - Beta/Drift - Beta/Consolidation + - Beta/Interruption - Beta/Chaos - Beta/Expiration - Beta/NodeClaim diff --git a/test/suites/beta/interruption/suite_test.go b/test/suites/beta/interruption/suite_test.go new file mode 100644 
index 000000000000..06e4a308de77 --- /dev/null +++ b/test/suites/beta/interruption/suite_test.go @@ -0,0 +1,250 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package interruption_test + +import ( + "fmt" + "testing" + "time" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/uuid" + "knative.dev/pkg/ptr" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/controllers/interruption/messages" + "github.com/aws/karpenter/pkg/controllers/interruption/messages/scheduledchange" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/pkg/utils" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestInterruption(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/Interruption") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + env.ExpectQueueExists() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + }, + }, + }, + }) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("Interruption", Label("AWS"), func() { + It("should terminate the spot instance and spin-up a new node on spot interruption warning", func() { + By("Creating a single healthy node with a healthy deployment") + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{ + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeSpot}, + }) + numPods := 1 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: ptr.Int64(0), + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(nodeClass, nodePool, dep) + + env.EventuallyExpectHealthyPodCount(selector, 
numPods)
+ env.ExpectCreatedNodeCount("==", 1)
+
+ node := env.Monitor.CreatedNodes()[0]
+ instanceID, err := utils.ParseInstanceID(node.Spec.ProviderID)
+ Expect(err).ToNot(HaveOccurred())
+
+ By("interrupting the spot instance")
+ exp := env.ExpectSpotInterruptionExperiment(instanceID)
+ DeferCleanup(func() {
+ env.ExpectExperimentTemplateDeleted(*exp.ExperimentTemplateId)
+ })
+
+ // We expect Karpenter to drain and delete the node before the two-minute spot interruption window elapses
+ By("waiting to receive the interruption and terminate the node")
+ env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Second * 110).Should(Succeed())
+ env.EventuallyExpectHealthyPodCount(selector, 1)
+ })
+ It("should terminate the node at the API server when the EC2 instance is stopped", func() {
+ By("Creating a single healthy node with a healthy deployment")
+ nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+ Key: corev1beta1.CapacityTypeLabelKey,
+ Operator: v1.NodeSelectorOpIn,
+ Values: []string{corev1beta1.CapacityTypeOnDemand},
+ })
+ numPods := 1
+ dep := coretest.Deployment(coretest.DeploymentOptions{
+ Replicas: int32(numPods),
+ PodOptions: coretest.PodOptions{
+ ObjectMeta: metav1.ObjectMeta{
+ Labels: map[string]string{"app": "my-app"},
+ },
+ TerminationGracePeriodSeconds: ptr.Int64(0),
+ },
+ })
+ selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+
+ env.ExpectCreated(nodeClass, nodePool, dep)
+
+ env.EventuallyExpectHealthyPodCount(selector, numPods)
+ env.ExpectCreatedNodeCount("==", 1)
+
+ node := env.Monitor.CreatedNodes()[0]
+
+ By("Stopping the EC2 instance without the EKS cluster's knowledge")
+ env.ExpectInstanceStopped(node.Name) // Make a call to the EC2 API to stop the instance
+ env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Minute).Should(Succeed()) // shorten the timeout since we should react faster
+ env.EventuallyExpectHealthyPodCount(selector, 1)
+ })
+ It("should terminate the node at the API server when the EC2 instance is terminated", func() {
+ By("Creating a single healthy node with a healthy deployment")
+ nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+ Key: corev1beta1.CapacityTypeLabelKey,
+ Operator: v1.NodeSelectorOpIn,
+ Values: []string{corev1beta1.CapacityTypeOnDemand},
+ })
+ numPods := 1
+ dep := coretest.Deployment(coretest.DeploymentOptions{
+ Replicas: int32(numPods),
+ PodOptions: coretest.PodOptions{
+ ObjectMeta: metav1.ObjectMeta{
+ Labels: map[string]string{"app": "my-app"},
+ },
+ TerminationGracePeriodSeconds: ptr.Int64(0),
+ },
+ })
+ selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+
+ env.ExpectCreated(nodeClass, nodePool, dep)
+
+ env.EventuallyExpectHealthyPodCount(selector, numPods)
+ env.ExpectCreatedNodeCount("==", 1)
+
+ node := env.Monitor.CreatedNodes()[0]
+
+ By("Terminating the EC2 instance without the EKS cluster's knowledge")
+ env.ExpectInstanceTerminated(node.Name) // Make a call to the EC2 API to terminate the instance
+ env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Minute).Should(Succeed()) // shorten the timeout since we should react faster
+ env.EventuallyExpectHealthyPodCount(selector, 1)
+ })
+ It("should terminate the node when receiving a scheduled change health event", func() {
+ By("Creating a single healthy node with a healthy deployment")
+ nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, 
v1.NodeSelectorRequirement{ + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{corev1beta1.CapacityTypeOnDemand}, + }) + numPods := 1 + dep := coretest.Deployment(coretest.DeploymentOptions{ + Replicas: int32(numPods), + PodOptions: coretest.PodOptions{ + ObjectMeta: metav1.ObjectMeta{ + Labels: map[string]string{"app": "my-app"}, + }, + TerminationGracePeriodSeconds: ptr.Int64(0), + }, + }) + selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels) + + env.ExpectCreated(nodeClass, nodePool, dep) + + env.EventuallyExpectHealthyPodCount(selector, numPods) + env.ExpectCreatedNodeCount("==", 1) + + node := env.Monitor.CreatedNodes()[0] + instanceID, err := utils.ParseInstanceID(node.Spec.ProviderID) + Expect(err).ToNot(HaveOccurred()) + + By("Creating a scheduled change health event in the SQS message queue") + env.ExpectMessagesCreated(scheduledChangeMessage(env.Region, "000000000000", instanceID)) + env.EventuallyExpectNotFoundAssertion(node).WithTimeout(time.Minute).Should(Succeed()) // shorten the timeout since we should react faster + env.EventuallyExpectHealthyPodCount(selector, 1) + }) +}) + +func scheduledChangeMessage(region, accountID, involvedInstanceID string) scheduledchange.Message { + return scheduledchange.Message{ + Metadata: messages.Metadata{ + Version: "0", + Account: accountID, + DetailType: "AWS Health Event", + ID: string(uuid.NewUUID()), + Region: region, + Resources: []string{ + fmt.Sprintf("arn:aws:ec2:%s:instance/%s", region, involvedInstanceID), + }, + Source: "aws.health", + Time: time.Now(), + }, + Detail: scheduledchange.Detail{ + Service: "EC2", + EventTypeCategory: "scheduledChange", + AffectedEntities: []scheduledchange.AffectedEntity{ + { + EntityValue: involvedInstanceID, + }, + }, + }, + } +} From c716531926e8f582b5623a42af828394d7bc3beb Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 11:59:07 -0700 Subject: [PATCH 25/47] test: Add IPv6 testing for E2E for v1beta1 (#4890) --- .github/workflows/e2e-matrix.yaml | 20 ++++- .github/workflows/e2e.yaml | 7 +- test/suites/beta/ipv6/suite_test.go | 119 ++++++++++++++++++++++++++++ 3 files changed, 142 insertions(+), 4 deletions(-) create mode 100644 test/suites/beta/ipv6/suite_test.go diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index 33af1b767a3b..ecf19ccfb9d7 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -49,7 +49,25 @@ jobs: strategy: fail-fast: false matrix: - suite: [Beta/Integration, Beta/Drift, Beta/Consolidation, Beta/Interruption, Beta/Chaos, Beta/Expiration, Beta/NodeClaim, Alpha/Integration, Alpha/Machine, Alpha/Consolidation, Alpha/Utilization, Alpha/Interruption, Alpha/Drift, Alpha/Expiration, Alpha/Chaos, Alpha/IPv6] + suite: + - Beta/Integration + - Beta/NodeClaim + - Beta/Consolidation + - Beta/Interruption + - Beta/Drift + - Beta/Expiration + - Beta/Chaos + - Beta/IPv6 + - Alpha/Integration + - Alpha/Machine + - Alpha/Consolidation + - Alpha/Utilization + - Alpha/Interruption + - Alpha/Drift + - Alpha/Expiration + - Alpha/Chaos + - Alpha/IPv6 + - Alpha/Scale uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 56088257d93b..47b373afbd0b 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -16,12 +16,13 @@ on: required: true options: - Beta/Integration - - Beta/Drift + - Beta/NodeClaim - Beta/Consolidation - Beta/Interruption - - 
Beta/Chaos + - Beta/Drift - Beta/Expiration - - Beta/NodeClaim + - Beta/Chaos + - Beta/IPv6 - Alpha/Integration - Alpha/Machine - Alpha/Consolidation diff --git a/test/suites/beta/ipv6/suite_test.go b/test/suites/beta/ipv6/suite_test.go new file mode 100644 index 000000000000..36507ba5f82c --- /dev/null +++ b/test/suites/beta/ipv6/suite_test.go @@ -0,0 +1,119 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package ipv6_test + +import ( + "fmt" + "net" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + "github.com/samber/lo" + v1 "k8s.io/api/core/v1" + + corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1" + coretest "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/pkg/apis/v1beta1" + "github.com/aws/karpenter/pkg/test" + "github.com/aws/karpenter/test/pkg/environment/aws" +) + +var env *aws.Environment +var nodeClass *v1beta1.EC2NodeClass +var nodePool *corev1beta1.NodePool + +func TestIPv6(t *testing.T) { + RegisterFailHandler(Fail) + BeforeSuite(func() { + env = aws.NewEnvironment(t) + }) + AfterSuite(func() { + env.Stop() + }) + RunSpecs(t, "Beta/IPv6") +} + +var _ = BeforeEach(func() { + env.BeforeEach() + nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{ + Spec: v1beta1.EC2NodeClassSpec{ + AMIFamily: &v1beta1.AMIFamilyAL2, + SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{ + { + Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}, + }, + }, + Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName), + }, + }) + nodePool = coretest.NodePool(corev1beta1.NodePool{ + Spec: corev1beta1.NodePoolSpec{ + Template: corev1beta1.NodeClaimTemplate{ + Spec: corev1beta1.NodeClaimSpec{ + NodeClassRef: &corev1beta1.NodeClassReference{ + Name: nodeClass.Name, + }, + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"t3a.small"}, + }, + { + Key: corev1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpIn, + Values: []string{"on-demand"}, + }, + }, + }, + }, + }, + }) +}) +var _ = AfterEach(func() { env.Cleanup() }) +var _ = AfterEach(func() { env.AfterEach() }) + +var _ = Describe("IPv6", func() { + It("should provision an IPv6 node by discovering kube-dns IPv6", func() { + pod := coretest.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + node := env.GetNode(pod.Spec.NodeName) + internalIPv6Addrs := lo.Filter(node.Status.Addresses, func(addr v1.NodeAddress, _ int) bool { + return addr.Type == v1.NodeInternalIP && net.ParseIP(addr.Address).To4() == nil + }) + Expect(internalIPv6Addrs).To(HaveLen(1)) + }) + It("should provision an IPv6 node by discovering kubeletConfig kube-dns IP", func() { + clusterDNSAddr := env.ExpectIPv6ClusterDNS() + nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{ClusterDNS: 
[]string{clusterDNSAddr}} + pod := coretest.Pod() + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + node := env.GetNode(pod.Spec.NodeName) + internalIPv6Addrs := lo.Filter(node.Status.Addresses, func(addr v1.NodeAddress, _ int) bool { + return addr.Type == v1.NodeInternalIP && net.ParseIP(addr.Address).To4() == nil + }) + Expect(internalIPv6Addrs).To(HaveLen(1)) + }) +}) From 67ead12802239a2929fcece1c248a727d764f26f Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 14:42:44 -0700 Subject: [PATCH 26/47] test: Fix E2E test permissions and Upgrade testing (#4880) --- .github/workflows/e2e-upgrade.yaml | 2 +- test/cloudformation/iam_cloudformation.yaml | 197 +++--------------- test/pkg/environment/aws/environment.go | 4 + test/pkg/environment/aws/expectations.go | 9 + test/pkg/environment/common/monitor.go | 4 +- test/suites/alpha/drift/suite_test.go | 18 +- .../alpha/expiration/expiration_test.go | 5 +- test/suites/alpha/integration/ami_test.go | 2 +- .../integration/extended_resources_test.go | 3 +- .../alpha/integration/kubelet_config_test.go | 4 +- .../utilization_test.go} | 23 +- .../alpha/machine/garbage_collection_test.go | 2 - .../suites/alpha/scale/deprovisioning_test.go | 1 - test/suites/beta/drift/suite_test.go | 15 +- test/suites/beta/integration/ami_test.go | 13 +- test/suites/beta/integration/cni_test.go | 1 - .../suites/beta/integration/emptiness_test.go | 1 - .../integration/extended_resources_test.go | 4 +- .../beta/integration/instance_profile_test.go | 7 +- .../beta/integration/kubelet_config_test.go | 6 +- .../beta/integration/scheduling_test.go | 3 +- .../beta/integration/utilization_test.go | 43 ++++ 22 files changed, 125 insertions(+), 242 deletions(-) rename test/suites/alpha/{utilization/suite_test.go => integration/utilization_test.go} (81%) create mode 100644 test/suites/beta/integration/utilization_test.go diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index a7f1dff781ac..1e418d872335 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ b/.github/workflows/e2e-upgrade.yaml @@ -152,7 +152,7 @@ jobs: - name: run the Upgrade test suite run: | aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} - CLUSTER_NAME=${{ env.CLUSTER_NAME }} INTERRUPTION_QUEUE=${{ env.CLUSTER_NAME }} CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ env.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" TEST_SUITE="Integration" make e2etests + CLUSTER_NAME=${{ env.CLUSTER_NAME }} INTERRUPTION_QUEUE=${{ env.CLUSTER_NAME }} CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ env.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" TEST_SUITE="Beta/Integration" make e2etests - name: notify slack of success or failure uses: ./.github/actions/e2e/slack/notify if: (success() || failure()) && github.event_name != 'workflow_run' && github.event_name != 'conformance' diff --git a/test/cloudformation/iam_cloudformation.yaml b/test/cloudformation/iam_cloudformation.yaml index 2263997cfbe2..1d6d3a3d518e 100644 --- a/test/cloudformation/iam_cloudformation.yaml +++ b/test/cloudformation/iam_cloudformation.yaml @@ -270,116 +270,7 @@ Resources: Statement: - Effect: Allow Action: - # Tag Permissions - - ec2:DescribeTags - # Internet Gateway Permissions - - ec2:DescribeEgressOnlyInternetGateways - - ec2:DescribeInternetGateways - # Elastic IP Permissions - - ec2:DescribeAddresses - # Instance Permissions - - ec2:DescribeInstanceTypeOfferings - - 
ec2:DescribeInstanceTypes - - ec2:DescribeInstances - - ec2:DescribeKeyPairs - # Launch Template Permissions - - ec2:DescribeLaunchTemplateVersions - - ec2:DescribeLaunchTemplates - # NAT Gateway Permissions - - ec2:DescribeNatGateways - # Network Interface Permissions - - ec2:DescribeNetworkInterfaces - # Route Table Permissions - - ec2:DescribeRouteTables - # Security Group Permissions - - ec2:DescribeSecurityGroups - # Subnet Permissions - - ec2:DescribeAvailabilityZones - - ec2:DescribeSubnets - # Volume Permissions - - ec2:DescribeVolumes - - ec2:DescribeVolumesModifications - - ec2:DescribeSnapshots - # Network ACL Permissions - - ec2:DescribeNetworkAcls - # VPC Permissions - - ec2:DescribeVpcs - # Image Permissions - - ec2:DescribeImages - # Tag Permissions - - ec2:CreateTags - - ec2:DeleteTags - # Internet Gateway Permissions - - ec2:CreateEgressOnlyInternetGateway - - ec2:DeleteEgressOnlyInternetGateway - # Elastic IP Permissions - - ec2:AllocateAddress - - ec2:ReleaseAddress - # Instance Permissions - - ec2:ModifyInstanceAttribute - - ec2:DescribeInstanceAttribute - - ec2:RunInstances - - ec2:StopInstances - - ec2:TerminateInstances - - ec2:AttachNetworkInterface - - ec2:ModifyNetworkInterfaceAttribute - - ec2:DetachNetworkInterface - # Internet Gateway Permissions - - ec2:AttachInternetGateway - - ec2:CreateInternetGateway - - ec2:DeleteInternetGateway - - ec2:DetachInternetGateway - # Launch Template Permissions - - ec2:CreateLaunchTemplate - - ec2:DeleteLaunchTemplate - # Fleet Permissions - - ec2:CreateFleet - # NAT Gateway Permissions - - ec2:CreateNatGateway - - ec2:DeleteNatGateway - # Network Interface Permissions - - ec2:AssignPrivateIpAddresses - - ec2:UnassignPrivateIpAddresses - - ec2:AssignIpv6Addresses - - ec2:UnassignIpv6Addresses - - ec2:AttachNetworkInterface - - ec2:DetachNetworkInterface - - ec2:CreateNetworkInterface - - ec2:ModifyNetworkInterfaceAttribute - - ec2:DeleteNetworkInterface - - ec2:CreateNetworkInterfacePermission - # Route Table Permissions - - ec2:CreateRoute - - ec2:CreateRouteTable - - ec2:DeleteRoute - - ec2:DeleteRouteTable - - ec2:AssociateRouteTable - - ec2:DisassociateRouteTable - # Security Group Permissions - - ec2:AuthorizeSecurityGroupIngress - - ec2:CreateSecurityGroup - - ec2:DeleteSecurityGroup - - ec2:RevokeSecurityGroupIngress - # Subnet Permissions - - ec2:CreateSubnet - - ec2:DeleteSubnet - - ec2:ModifySubnetAttribute - # Volume Permissions - - ec2:CreateSnapshot - - ec2:DeleteSnapshot - - ec2:CreateVolume - - ec2:DeleteVolume - - ec2:AttachVolume - - ec2:ModifyVolume - - ec2:DetachVolume - # VPC Permissions - - ec2:AssociateVpcCidrBlock - - ec2:DisassociateVpcCidrBlock - - ec2:CreateVpc - - ec2:DeleteVpc - - ec2:DescribeVpcAttribute - - ec2:ModifyVpcAttribute - - ec2:RunInstances + - ec2:* # Read-Only Permissions to pull ECR images needed by the NodeInstanceRole - ecr:GetAuthorizationToken - ecr:BatchCheckLayerAvailability @@ -397,75 +288,22 @@ Resources: - autoscaling:DescribeAutoScalingGroups - autoscaling:UpdateAutoScalingGroup # EKS ServiceRole permissions needed to handle LoadBalancer - - elasticloadbalancing:AddTags - - elasticloadbalancing:ApplySecurityGroupsToLoadBalancer - - elasticloadbalancing:AttachLoadBalancerToSubnets - - elasticloadbalancing:ConfigureHealthCheck - - elasticloadbalancing:CreateListener - - elasticloadbalancing:CreateLoadBalancer - - elasticloadbalancing:CreateLoadBalancerListeners - - elasticloadbalancing:CreateLoadBalancerPolicy - - elasticloadbalancing:CreateTargetGroup - - 
elasticloadbalancing:DeleteListener - - elasticloadbalancing:DeleteLoadBalancer - - elasticloadbalancing:DeleteLoadBalancerListeners - - elasticloadbalancing:DeleteTargetGroup - - elasticloadbalancing:DeregisterInstancesFromLoadBalancer - - elasticloadbalancing:DeregisterTargets - - elasticloadbalancing:DescribeListeners - - elasticloadbalancing:DescribeLoadBalancerAttributes - - elasticloadbalancing:DescribeLoadBalancerPolicies - - elasticloadbalancing:DescribeLoadBalancers - - elasticloadbalancing:DescribeTargetGroupAttributes - - elasticloadbalancing:DescribeTargetGroups - - elasticloadbalancing:DescribeTargetHealth - - elasticloadbalancing:DetachLoadBalancerFromSubnets - - elasticloadbalancing:ModifyListener - - elasticloadbalancing:ModifyLoadBalancerAttributes - - elasticloadbalancing:ModifyTargetGroup - - elasticloadbalancing:ModifyTargetGroupAttributes - - elasticloadbalancing:RegisterInstancesWithLoadBalancer - - elasticloadbalancing:RegisterTargets - - elasticloadbalancing:SetLoadBalancerPoliciesForBackendServer - - elasticloadbalancing:SetLoadBalancerPoliciesOfListener + - elasticloadbalancing:* - kms:CreateGrant - kms:GenerateDataKeyWithoutPlaintext - kms:DescribeKey # SSM Permissions for AmazonSSMManagedInstanceCore policy applied to the NodeInstanceRole - - ssm:DescribeAssociation - - ssm:GetDeployablePatchSnapshotForInstance - - ssm:GetDocument - - ssm:DescribeDocument - - ssm:GetManifest - - ssm:GetParameter - - ssm:GetParameters - - ssm:ListAssociations - - ssm:ListInstanceAssociations - - ssm:PutInventory - - ssm:PutComplianceItems - - ssm:PutConfigurePackageResult - - ssm:UpdateAssociationStatus - - ssm:UpdateInstanceAssociationStatus - - ssm:UpdateInstanceInformation + - ssm:* # SSM Permissions for AmazonSSMManagedInstanceCore policy applied to the NodeInstanceRole - - ssmmessages:CreateControlChannel - - ssmmessages:CreateDataChannel - - ssmmessages:OpenControlChannel - - ssmmessages:OpenDataChannel + - ssmmessages:* # SSM Permissions for AmazonSSMManagedInstanceCore policy applied to the NodeInstanceRole - - ec2messages:AcknowledgeMessage - - ec2messages:DeleteMessage - - ec2messages:FailMessage - - ec2messages:GetEndpoint - - ec2messages:GetMessages - - ec2messages:SendReply + - ec2messages:* - sqs:DeleteMessage - sqs:GetQueueAttributes - sqs:GetQueueUrl - sqs:SendMessage - sqs:ReceiveMessage - pricing:GetProducts - - ec2:DescribeSpotPriceHistory - eks:DescribeCluster Resource: "*" - Effect: Allow @@ -473,6 +311,31 @@ Resources: Resource: - !Sub "arn:${AWS::Partition}:iam::${AWS::AccountId}:role/KarpenterNodeRole-*" - !GetAtt FISInterruptionRole.Arn + - Effect: Allow + Action: iam:CreateInstanceProfile + Resource: "*" + Condition: + StringLike: + aws:RequestTag/karpenter.k8s.aws/ec2nodeclass: "*" + - Effect: Allow + Action: iam:TagInstanceProfile + Resource: "*" + Condition: + StringLike: + aws:RequestTag/karpenter.k8s.aws/ec2nodeclass: "*" + aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass: "*" + - Effect: Allow + Action: + - iam:AddRoleToInstanceProfile + - iam:RemoveRoleFromInstanceProfile + - iam:DeleteInstanceProfile + Resource: "*" + Condition: + StringLike: + aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass: "*" + - Effect: Allow + Action: iam:GetInstanceProfile + Resource: "*" - Effect: Allow Action: - aps:RemoteWrite diff --git a/test/pkg/environment/aws/environment.go b/test/pkg/environment/aws/environment.go index 9fc111b9759c..f55bcbdbcaca 100644 --- a/test/pkg/environment/aws/environment.go +++ b/test/pkg/environment/aws/environment.go @@ -42,6 +42,10 @@ 
import (

const WindowsDefaultImage = "mcr.microsoft.com/oss/kubernetes/pause:3.9"

+// ExcludedInstanceFamilies denotes instance families that have issues during resource registration due to compatibility
+// issues with versions of the VPC Resource Controller
+var ExcludedInstanceFamilies = []string{"m7a", "r7a", "c7a", "r7i"}
+
type Environment struct {
*common.Environment
Region string
diff --git a/test/pkg/environment/aws/expectations.go b/test/pkg/environment/aws/expectations.go
index 6d910e99eb85..a91eafa50719 100644
--- a/test/pkg/environment/aws/expectations.go
+++ b/test/pkg/environment/aws/expectations.go
@@ -27,12 +27,15 @@ import (
"github.com/aws/aws-sdk-go/service/iam"
"github.com/aws/aws-sdk-go/service/ssm"
"github.com/aws/aws-sdk-go/service/sts"
+ "github.com/mitchellh/hashstructure/v2"
. "github.com/onsi/ginkgo/v2" //nolint:revive,stylecheck
. "github.com/onsi/gomega" //nolint:revive,stylecheck
"github.com/samber/lo"
"go.uber.org/multierr"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
+
+ "github.com/aws/karpenter/pkg/apis/v1beta1"
)

// Spot Interruption experiment details partially copied from
@@ -121,6 +124,12 @@ func (env *Environment) ExpectInstanceProfileExists(profileName string) iam.Inst
return lo.FromPtr(out.InstanceProfile)
}

+// GetInstanceProfileName gets the string for the profile name based on the cluster name, region, and NodeClass name.
+// The length of this string can never exceed the maximum instance profile name limit of 128 characters.
+func (env *Environment) GetInstanceProfileName(nodeClass *v1beta1.EC2NodeClass) string {
+ return fmt.Sprintf("%s_%d", env.ClusterName, lo.Must(hashstructure.Hash(fmt.Sprintf("%s%s", env.Region, nodeClass.Name), hashstructure.FormatV2, nil)))
+}
+
func (env *Environment) GetInstance(nodeName string) ec2.Instance {
node := env.Environment.GetNode(nodeName)
return env.GetInstanceByID(env.ExpectParsedProviderID(node.Spec.ProviderID))
diff --git a/test/pkg/environment/common/monitor.go b/test/pkg/environment/common/monitor.go
index 61a5e4a78ee5..dcb432dea6b1 100644
--- a/test/pkg/environment/common/monitor.go
+++ b/test/pkg/environment/common/monitor.go
@@ -29,6 +29,7 @@ import (
"github.com/samber/lo"

"github.com/aws/karpenter-core/pkg/apis/v1alpha5"
+ "github.com/aws/karpenter-core/pkg/apis/v1beta1"
"github.com/aws/karpenter-core/pkg/utils/resources"
)

@@ -223,7 +224,8 @@ func (m *Monitor) nodeUtilization(resource v1.ResourceName) []float64 {
for nodeName, requests := range st.nodeRequests {
allocatable := st.nodes[nodeName].Status.Allocatable[resource]
// skip any nodes we didn't launch
- if _, ok := st.nodes[nodeName].Labels[v1alpha5.ProvisionerNameLabelKey]; !ok {
+ if st.nodes[nodeName].Labels[v1alpha5.ProvisionerNameLabelKey] == "" &&
+ st.nodes[nodeName].Labels[v1beta1.NodePoolLabelKey] == "" {
continue
}
if allocatable.IsZero() {
diff --git a/test/suites/alpha/drift/suite_test.go b/test/suites/alpha/drift/suite_test.go
index d7fae9c7acb0..3fce8ca0e58a 100644
--- a/test/suites/alpha/drift/suite_test.go
+++ b/test/suites/alpha/drift/suite_test.go
@@ -88,7 +88,6 @@ var _ = Describe("Drift", Label("AWS"), func() {
},
})
env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"})
- env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"})
})
It("should deprovision nodes that have drifted due to AMIs", func() {
// choose an old static image
@@ -122,7 +121,6 @@ var _ = Describe("Drift", Label("AWS"), func() {
})
It("should not deprovision nodes that have drifted without the featureGate enabled", func() {
env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"})
- env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=false"})
// choose an old static image
parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{
Name: awssdk.String("/aws/service/eks/optimized-ami/1.23/amazon-linux-2/amazon-eks-node-1.23-v20230322/image_id"),
@@ -377,14 +375,11 @@ var _ = Describe("Drift", Label("AWS"), func() {
}).Should(Succeed())

// Expect nodes To get cordoned
- cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+ cordonedNodes := env.EventuallyExpectCordonedNodeCountLegacy("==", 1)

// Drift should fail and the original node should be uncordoned
// TODO: reduce timeouts when deprovisioning waits are factored out
- Eventually(func(g Gomega) {
- g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0]))
- g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse())
- }).WithTimeout(11 * time.Minute).Should(Succeed())
+ env.EventuallyExpectNodesUncordonedLegacyWithTimeout(11*time.Minute, cordonedNodes...)

Eventually(func(g Gomega) {
machines := &v1alpha5.MachineList{}
@@ -435,14 +430,11 @@ var _ = Describe("Drift", Label("AWS"), func() {
}).Should(Succeed())

// Expect nodes To be cordoned
- cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1)
+ cordonedNodes := env.EventuallyExpectCordonedNodeCountLegacy("==", 1)

// Drift should fail and original node should be uncordoned
- // TODO: reduce timeouts when deprovisioning waits are factored outr
- Eventually(func(g Gomega) {
- g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0]))
- g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse())
- }).WithTimeout(12 * time.Minute).Should(Succeed())
+ // TODO: reduce timeouts when deprovisioning waits are factored out
+ env.EventuallyExpectNodesUncordonedLegacyWithTimeout(11*time.Minute, cordonedNodes...)
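+ // The replacement machine registered but never initialized, so the drift disruption should roll back
+ // rather than complete, keeping the original machine in place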
// Expect that the new machine/node is kept around after the un-cordon nodeList := &v1.NodeList{} diff --git a/test/suites/alpha/expiration/expiration_test.go b/test/suites/alpha/expiration/expiration_test.go index 024a49a81df4..15621c59d700 100644 --- a/test/suites/alpha/expiration/expiration_test.go +++ b/test/suites/alpha/expiration/expiration_test.go @@ -74,7 +74,6 @@ var _ = Describe("Expiration", func() { TTLSecondsUntilExpired: ptr.Int64(30), }) env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"}) - env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) }) It("should expire the node after the TTLSecondsUntilExpired is reached", func() { var numPods int32 = 1 @@ -238,7 +237,7 @@ var _ = Describe("Expiration", func() { }).Should(Succeed()) // Expect nodes To get cordoned - cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1) + cordonedNodes := env.EventuallyExpectCordonedNodeCountLegacy("==", 1) // Expire should fail and the original node should be uncordoned // TODO: reduce timeouts when deprovisioning waits are factored out @@ -306,7 +305,7 @@ var _ = Describe("Expiration", func() { }).Should(Succeed()) // Expect nodes To be cordoned - cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1) + cordonedNodes := env.EventuallyExpectCordonedNodeCountLegacy("==", 1) // Expire should fail and original node should be uncordoned and no machines should be removed // TODO: reduce timeouts when deprovisioning waits are factored out diff --git a/test/suites/alpha/integration/ami_test.go b/test/suites/alpha/integration/ami_test.go index 44625235af31..40126578322c 100644 --- a/test/suites/alpha/integration/ami_test.go +++ b/test/suites/alpha/integration/ami_test.go @@ -372,7 +372,7 @@ var _ = Describe("AMI", func() { { Key: v1alpha1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: awsenv.ExcludedInstanceFamilies, }, { Key: v1alpha1.LabelInstanceCategory, diff --git a/test/suites/alpha/integration/extended_resources_test.go b/test/suites/alpha/integration/extended_resources_test.go index 98de604d8e4b..e53adbbb576d 100644 --- a/test/suites/alpha/integration/extended_resources_test.go +++ b/test/suites/alpha/integration/extended_resources_test.go @@ -31,6 +31,7 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/test" "github.com/aws/karpenter/pkg/apis/v1alpha1" + awsenv "github.com/aws/karpenter/test/pkg/environment/aws" awstest "github.com/aws/karpenter/pkg/test" ) @@ -135,7 +136,7 @@ var _ = Describe("Extended Resources", func() { { Key: v1alpha1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a"}, + Values: awsenv.ExcludedInstanceFamilies, }, }, }) diff --git a/test/suites/alpha/integration/kubelet_config_test.go b/test/suites/alpha/integration/kubelet_config_test.go index 5f9afec6a7bc..1e023001214b 100644 --- a/test/suites/alpha/integration/kubelet_config_test.go +++ b/test/suites/alpha/integration/kubelet_config_test.go @@ -107,7 +107,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { v1.NodeSelectorRequirement{ Key: v1alpha1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a"}, + Values: aws.ExcludedInstanceFamilies, }) pod := test.Pod(test.PodOptions{ NodeSelector: map[string]string{ @@ -145,7 +145,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { v1.NodeSelectorRequirement{ Key: 
v1alpha1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: aws.ExcludedInstanceFamilies, }, v1.NodeSelectorRequirement{ Key: v1alpha1.LabelInstanceCategory, diff --git a/test/suites/alpha/utilization/suite_test.go b/test/suites/alpha/integration/utilization_test.go similarity index 81% rename from test/suites/alpha/utilization/suite_test.go rename to test/suites/alpha/integration/utilization_test.go index bfaf8524a1f1..135e66a40268 100644 --- a/test/suites/alpha/utilization/suite_test.go +++ b/test/suites/alpha/integration/utilization_test.go @@ -12,13 +12,10 @@ See the License for the specific language governing permissions and limitations under the License. */ -package utilization_test +package integration_test import ( - "testing" - . "github.com/onsi/ginkgo/v2" - . "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" "k8s.io/apimachinery/pkg/labels" @@ -29,26 +26,8 @@ import ( "github.com/aws/karpenter/test/pkg/debug" awstest "github.com/aws/karpenter/pkg/test" - "github.com/aws/karpenter/test/pkg/environment/aws" ) -var env *aws.Environment - -func TestUtilization(t *testing.T) { - RegisterFailHandler(Fail) - BeforeSuite(func() { - env = aws.NewEnvironment(t) - }) - AfterSuite(func() { - env.Stop() - }) - RunSpecs(t, "Alpha/Utilization") -} - -var _ = BeforeEach(func() { env.BeforeEach() }) -var _ = AfterEach(func() { env.Cleanup() }) -var _ = AfterEach(func() { env.AfterEach() }) - var _ = Describe("Utilization", Label(debug.NoWatch), Label(debug.NoEvents), func() { It("should provision one pod per node", func() { provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ diff --git a/test/suites/alpha/machine/garbage_collection_test.go b/test/suites/alpha/machine/garbage_collection_test.go index c4b2b0112b23..3d9b140d2c15 100644 --- a/test/suites/alpha/machine/garbage_collection_test.go +++ b/test/suites/alpha/machine/garbage_collection_test.go @@ -25,7 +25,6 @@ import ( . "github.com/onsi/ginkgo/v2" . 
"github.com/onsi/gomega" "github.com/samber/lo" - v1 "k8s.io/api/core/v1" "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/test" @@ -136,7 +135,6 @@ var _ = Describe("NodeClaimGarbageCollection", func() { It("should succeed to garbage collect a Machine that was deleted without the cluster's knowledge", func() { // Disable the interruption queue for the garbage collection test env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.interruptionQueueName": ""}) - env.ExpectSettingsOverridden(v1.EnvVar{Name: "INTERRUPTION_QUEUE", Value: ""}) provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, diff --git a/test/suites/alpha/scale/deprovisioning_test.go b/test/suites/alpha/scale/deprovisioning_test.go index f80ebcb6b9bd..482176c098b3 100644 --- a/test/suites/alpha/scale/deprovisioning_test.go +++ b/test/suites/alpha/scale/deprovisioning_test.go @@ -82,7 +82,6 @@ var _ = Describe("Deprovisioning", Label(debug.NoWatch), Label(debug.NoEvents), BeforeEach(func() { env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"}) - env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) nodeTemplate = awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, diff --git a/test/suites/beta/drift/suite_test.go b/test/suites/beta/drift/suite_test.go index 0d26313f62ae..b1872f270dec 100644 --- a/test/suites/beta/drift/suite_test.go +++ b/test/suites/beta/drift/suite_test.go @@ -103,7 +103,6 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { }, }, }) - env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"}) env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=true"}) }) It("should disrupt nodes that have drifted due to AMIs", func() { @@ -137,7 +136,6 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { env.EventuallyExpectNotFound(pod, nodeClaim, node) }) It("should not disrupt nodes that have drifted without the featureGate enabled", func() { - env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"}) env.ExpectSettingsOverridden(v1.EnvVar{Name: "FEATURE_GATES", Value: "Drift=false"}) // choose an old static image parameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ @@ -433,11 +431,9 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { // Drift should fail and the original node should be uncordoned // TODO: reduce timeouts when disruption waits are factored out - Eventually(func(g Gomega) { - g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0])) - g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse()) - }).WithTimeout(11 * time.Minute).Should(Succeed()) + env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...) 
+ // We give another 6 minutes here to handle the deletion at the 15m registration timeout Eventually(func(g Gomega) { nodeClaims := &corev1beta1.NodeClaimList{} g.Expect(env.Client.List(env, nodeClaims, client.HasLabels{test.DiscoveryLabel})).To(Succeed()) @@ -490,11 +486,8 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { cordonedNodes := env.EventuallyExpectCordonedNodeCount("==", 1) // Drift should fail and original node should be uncordoned - // TODO: reduce timeouts when disruption waits are factored outr - Eventually(func(g Gomega) { - g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(cordonedNodes[0]), cordonedNodes[0])) - g.Expect(cordonedNodes[0].Spec.Unschedulable).To(BeFalse()) - }).WithTimeout(12 * time.Minute).Should(Succeed()) + // TODO: reduce timeouts when disruption waits are factored out + env.EventuallyExpectNodesUncordonedWithTimeout(11*time.Minute, cordonedNodes...) // Expect that the new nodeClaim/node is kept around after the un-cordon nodeList := &v1.NodeList{} diff --git a/test/suites/beta/integration/ami_test.go b/test/suites/beta/integration/ami_test.go index bc2ddc96fab3..070be1f4ff3c 100644 --- a/test/suites/beta/integration/ami_test.go +++ b/test/suites/beta/integration/ami_test.go @@ -41,8 +41,13 @@ var _ = Describe("AMI", func() { customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) }) - It("should use the AMI defined by the AMI Selector", func() { + It("should use the AMI defined by the AMI Selector Terms", func() { pod := coretest.Pod() + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{ + { + ID: customAMI, + }, + } env.ExpectCreated(pod, nodeClass, nodePool) env.EventuallyExpectHealthy(pod) env.ExpectCreatedNodeCount("==", 1) @@ -74,7 +79,7 @@ var _ = Describe("AMI", func() { env.ExpectInstance(pod.Spec.NodeName).To(HaveField("ImageId", HaveValue(Equal(customAMI)))) }) - It("should support ami selector Name but fail with incorrect owners", func() { + It("should support AMI Selector Terms for Name but fail with incorrect owners", func() { output, err := env.EC2API.DescribeImages(&ec2.DescribeImagesInput{ ImageIds: []*string{aws.String(customAMI)}, }) @@ -155,7 +160,7 @@ var _ = Describe("AMI", func() { { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: awsenv.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, @@ -290,7 +295,7 @@ var _ = Describe("AMI", func() { { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: awsenv.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, diff --git a/test/suites/beta/integration/cni_test.go b/test/suites/beta/integration/cni_test.go index eb579bd67152..a0c25211081f 100644 --- a/test/suites/beta/integration/cni_test.go +++ b/test/suites/beta/integration/cni_test.go @@ -52,7 +52,6 @@ var _ = Describe("CNITests", func() { Expect(allocatablePods).To(Equal(eniLimitedPodsFor(node.Labels["node.kubernetes.io/instance-type"]))) }) It("should set maxPods when reservedENIs is set", func() { - env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.reservedENIs": "1"}) env.ExpectSettingsOverridden(corev1.EnvVar{Name: "RESERVED_ENIS", Value: "1"}) pod := test.Pod() env.ExpectCreated(pod, nodeClass, nodePool) diff --git a/test/suites/beta/integration/emptiness_test.go b/test/suites/beta/integration/emptiness_test.go index 9c48e694a9e8..de4dc2ccc3f3 100644 --- 
a/test/suites/beta/integration/emptiness_test.go +++ b/test/suites/beta/integration/emptiness_test.go @@ -52,7 +52,6 @@ var _ = Describe("Emptiness", func() { By("waiting for the nodeclaim emptiness status condition to propagate") Eventually(func(g Gomega) { g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) - g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Empty)).ToNot(BeNil()) g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Empty).IsTrue()).To(BeTrue()) }).Should(Succeed()) diff --git a/test/suites/beta/integration/extended_resources_test.go b/test/suites/beta/integration/extended_resources_test.go index ba4681e4ee64..f8dc328b6fe4 100644 --- a/test/suites/beta/integration/extended_resources_test.go +++ b/test/suites/beta/integration/extended_resources_test.go @@ -30,6 +30,7 @@ import ( "github.com/aws/karpenter-core/pkg/test" "github.com/aws/karpenter/pkg/apis/v1beta1" + awsenv "github.com/aws/karpenter/test/pkg/environment/aws" ) var _ = Describe("Extended Resources", func() { @@ -90,13 +91,12 @@ var _ = Describe("Extended Resources", func() { DeferCleanup(func() { env.ExpectPodENIDisabled() }) - env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.enablePodENI": "true"}) // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472) nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{ { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: awsenv.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, diff --git a/test/suites/beta/integration/instance_profile_test.go b/test/suites/beta/integration/instance_profile_test.go index b40432aae5eb..0d34be68507e 100644 --- a/test/suites/beta/integration/instance_profile_test.go +++ b/test/suites/beta/integration/instance_profile_test.go @@ -23,7 +23,6 @@ import ( coretest "github.com/aws/karpenter-core/pkg/test" awserrors "github.com/aws/karpenter/pkg/errors" - "github.com/aws/karpenter/pkg/providers/instanceprofile" ) var _ = Describe("InstanceProfile Generation", func() { @@ -35,9 +34,9 @@ var _ = Describe("InstanceProfile Generation", func() { instance := env.GetInstance(node.Name) Expect(instance.IamInstanceProfile).ToNot(BeNil()) - Expect(instance.IamInstanceProfile.Arn).To(ContainSubstring(nodeClass.Spec.Role)) + Expect(lo.FromPtr(instance.IamInstanceProfile.Arn)).To(ContainSubstring(nodeClass.Status.InstanceProfile)) - instanceProfile := env.ExpectInstanceProfileExists(instanceprofile.GetProfileName(env.Context, env.Region, nodeClass)) + instanceProfile := env.ExpectInstanceProfileExists(env.GetInstanceProfileName(nodeClass)) Expect(instanceProfile.Roles).To(HaveLen(1)) Expect(lo.FromPtr(instanceProfile.Roles[0].RoleName)).To(Equal(nodeClass.Spec.Role)) }) @@ -50,7 +49,7 @@ var _ = Describe("InstanceProfile Generation", func() { env.ExpectDeleted(nodePool, nodeClass) Eventually(func(g Gomega) { _, err := env.IAMAPI.GetInstanceProfileWithContext(env.Context, &iam.GetInstanceProfileInput{ - InstanceProfileName: aws.String(instanceprofile.GetProfileName(env.Context, env.Region, nodeClass)), + InstanceProfileName: aws.String(env.GetInstanceProfileName(nodeClass)), }) g.Expect(awserrors.IsNotFound(err)).To(BeTrue()) }).Should(Succeed()) diff --git a/test/suites/beta/integration/kubelet_config_test.go b/test/suites/beta/integration/kubelet_config_test.go index 
729c2b1c4576..1a7ac9c23403 100644 --- a/test/suites/beta/integration/kubelet_config_test.go +++ b/test/suites/beta/integration/kubelet_config_test.go @@ -90,7 +90,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: aws.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, @@ -133,7 +133,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: aws.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, @@ -259,7 +259,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() { Values: []string{string(v1.Linux)}, }, }...) - nodePool.Spec.Template.Spec.Kubelet.PodsPerCore = ptr.Int32(1) + nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{PodsPerCore: ptr.Int32(1)} numPods := 6 dep := test.Deployment(test.DeploymentOptions{ Replicas: int32(numPods), diff --git a/test/suites/beta/integration/scheduling_test.go b/test/suites/beta/integration/scheduling_test.go index 985cc06484b5..39013fb6c27b 100644 --- a/test/suites/beta/integration/scheduling_test.go +++ b/test/suites/beta/integration/scheduling_test.go @@ -69,7 +69,6 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { v1beta1.LabelInstanceCPU: "2", v1beta1.LabelInstanceMemory: "4096", v1beta1.LabelInstanceNetworkBandwidth: "750", - v1beta1.LabelInstancePods: "29", } selectors.Insert(lo.Keys(nodeSelector)...) // Add node selector keys to selectors used in testing to ensure we test all labels requirements := lo.MapToSlice(nodeSelector, func(key string, value string) v1.NodeSelectorRequirement { @@ -239,7 +238,7 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() { { Key: v1beta1.LabelInstanceFamily, Operator: v1.NodeSelectorOpNotIn, - Values: []string{"m7a", "r7a", "c7a"}, + Values: aws.ExcludedInstanceFamilies, }, { Key: v1beta1.LabelInstanceCategory, diff --git a/test/suites/beta/integration/utilization_test.go b/test/suites/beta/integration/utilization_test.go new file mode 100644 index 000000000000..a84084cdbe90 --- /dev/null +++ b/test/suites/beta/integration/utilization_test.go @@ -0,0 +1,43 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package integration_test + +import ( + . 
"github.com/onsi/ginkgo/v2" + v1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/api/resource" + "k8s.io/apimachinery/pkg/labels" + + "github.com/aws/karpenter-core/pkg/test" + "github.com/aws/karpenter/test/pkg/debug" +) + +var _ = Describe("Utilization", Label(debug.NoWatch), Label(debug.NoEvents), func() { + It("should provision one pod per node", func() { + nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{ + Key: v1.LabelInstanceTypeStable, + Operator: v1.NodeSelectorOpIn, + Values: []string{"t3a.small"}, + }) + + deployment := test.Deployment(test.DeploymentOptions{ + Replicas: 100, + PodOptions: test.PodOptions{ResourceRequirements: v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}}}}) + + env.ExpectCreated(nodeClass, nodePool, deployment) + env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas)) + env.ExpectCreatedNodeCount("==", int(*deployment.Spec.Replicas)) // One pod per node enforced by instance size + }) +}) From 5d73e220105a6755ae3faa438d1414aa81c4fad9 Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Mon, 23 Oct 2023 15:30:17 -0700 Subject: [PATCH 27/47] fix: create offerings regardless of subnets (#4857) --- pkg/fake/ec2api.go | 6 +- pkg/providers/instancetype/instancetype.go | 113 +++++++++++++-------- 2 files changed, 72 insertions(+), 47 deletions(-) diff --git a/pkg/fake/ec2api.go b/pkg/fake/ec2api.go index 68a4faba4c23..0d6249f3b0a4 100644 --- a/pkg/fake/ec2api.go +++ b/pkg/fake/ec2api.go @@ -483,9 +483,9 @@ func (e *EC2API) DescribeAvailabilityZonesWithContext(context.Context, *ec2.Desc return e.DescribeAvailabilityZonesOutput.Clone(), nil } return &ec2.DescribeAvailabilityZonesOutput{AvailabilityZones: []*ec2.AvailabilityZone{ - {ZoneName: aws.String("test-zone-1a"), ZoneId: aws.String("testzone1a")}, - {ZoneName: aws.String("test-zone-1b"), ZoneId: aws.String("testzone1b")}, - {ZoneName: aws.String("test-zone-1c"), ZoneId: aws.String("testzone1c")}, + {ZoneName: aws.String("test-zone-1a"), ZoneId: aws.String("testzone1a"), ZoneType: aws.String("availability-zone")}, + {ZoneName: aws.String("test-zone-1b"), ZoneId: aws.String("testzone1b"), ZoneType: aws.String("availability-zone")}, + {ZoneName: aws.String("test-zone-1c"), ZoneId: aws.String("testzone1c"), ZoneType: aws.String("availability-zone")}, }}, nil } diff --git a/pkg/providers/instancetype/instancetype.go b/pkg/providers/instancetype/instancetype.go index 4e780e41fbf4..1af7a87e5781 100644 --- a/pkg/providers/instancetype/instancetype.go +++ b/pkg/providers/instancetype/instancetype.go @@ -20,6 +20,7 @@ import ( "net/http" "sync" "sync/atomic" + "time" "github.com/prometheus/client_golang/prometheus" @@ -44,8 +45,9 @@ import ( ) const ( - InstanceTypesCacheKey = "types" - InstanceTypeZonesCacheKeyPrefix = "zones:" + InstanceTypesCacheKey = "types" + InstanceTypeOfferingsCacheKey = "offerings" + AvailabilityZonesCacheKey = "zones" ) type Provider struct { @@ -66,6 +68,8 @@ type Provider struct { cm *pretty.ChangeMonitor // instanceTypesSeqNum is a monotonically increasing change counter used to avoid the expensive hashing operation on instance types instanceTypesSeqNum uint64 + // instanceTypeOfferingsSeqNum is a monotonically increasing change counter used to avoid the expensive hashing operation on instance types + instanceTypeOfferingsSeqNum uint64 } func NewProvider(region string, 
cache *cache.Cache, ec2api ec2iface.EC2API, subnetProvider *subnet.Provider, @@ -88,25 +92,35 @@ func (p *Provider) List(ctx context.Context, kc *corev1beta1.KubeletConfiguratio if err != nil { return nil, err } - // Get Viable EC2 Purchase offerings - instanceTypeZones, err := p.getInstanceTypeZones(ctx, nodeClass) + // Get InstanceTypeOfferings from EC2 + instanceTypeOfferings, err := p.getInstanceTypeOfferings(ctx) if err != nil { return nil, err } + // Get AvailabilityZones from EC2 + availabilityZones, err := p.getAvailabilityZones(ctx) + if err != nil { + return nil, err + } + // Constrain AZs from subnets + subnets, err := p.subnetProvider.List(ctx, nodeClass) + if err != nil { + return nil, err + } + subnetZones := sets.New[string](lo.Map(subnets, func(s *ec2.Subnet, _ int) string { + return aws.StringValue(s.AvailabilityZone) + })...) // Compute fully initialized instance types hash key - instanceTypeZonesHash, _ := hashstructure.Hash(instanceTypeZones, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}) + subnetHash, _ := hashstructure.Hash(subnets, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}) kcHash, _ := hashstructure.Hash(kc, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}) - key := fmt.Sprintf("%d-%d-%s-%016x-%016x", p.instanceTypesSeqNum, p.unavailableOfferings.SeqNum, nodeClass.UID, instanceTypeZonesHash, kcHash) + key := fmt.Sprintf("%d-%d-%d-%s-%016x-%016x", p.instanceTypesSeqNum, p.instanceTypeOfferingsSeqNum, p.unavailableOfferings.SeqNum, nodeClass.UID, subnetHash, kcHash) if item, ok := p.cache.Get(key); ok { return item.([]*cloudprovider.InstanceType), nil } - // Reject any instance types that don't have any offerings due to zone - result := lo.Reject(lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType { - return NewInstanceType(ctx, i, kc, p.region, nodeClass, p.createOfferings(ctx, i, instanceTypeZones[aws.StringValue(i.InstanceType)])) - }), func(i *cloudprovider.InstanceType, _ int) bool { - return len(i.Offerings) == 0 + result := lo.Map(instanceTypes, func(i *ec2.InstanceTypeInfo, _ int) *cloudprovider.InstanceType { + return NewInstanceType(ctx, i, kc, p.region, nodeClass, p.createOfferings(ctx, i, instanceTypeOfferings[aws.StringValue(i.InstanceType)], availabilityZones, subnetZones)) }) for _, instanceType := range instanceTypes { InstanceTypeVCPU.With(prometheus.Labels{ @@ -127,27 +141,27 @@ func (p *Provider) LivenessProbe(req *http.Request) error { return p.pricingProvider.LivenessProbe(req) } -func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.InstanceTypeInfo, zones sets.Set[string]) []cloudprovider.Offering { +func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.InstanceTypeInfo, instanceTypeZones, availabilityZones, subnetZones sets.Set[string]) []cloudprovider.Offering { var offerings []cloudprovider.Offering - for zone := range zones { + for az := range availabilityZones { // while usage classes should be a distinct set, there's no guarantee of that for capacityType := range sets.NewString(aws.StringValueSlice(instanceType.SupportedUsageClasses)...) 
{ // exclude any offerings that have recently seen an insufficient capacity error from EC2 - isUnavailable := p.unavailableOfferings.IsUnavailable(*instanceType.InstanceType, zone, capacityType) + isUnavailable := p.unavailableOfferings.IsUnavailable(*instanceType.InstanceType, az, capacityType) var price float64 var ok bool switch capacityType { case ec2.UsageClassTypeSpot: - price, ok = p.pricingProvider.SpotPrice(*instanceType.InstanceType, zone) + price, ok = p.pricingProvider.SpotPrice(*instanceType.InstanceType, az) case ec2.UsageClassTypeOnDemand: price, ok = p.pricingProvider.OnDemandPrice(*instanceType.InstanceType) default: logging.FromContext(ctx).Errorf("Received unknown capacity type %s for instance type %s", capacityType, *instanceType.InstanceType) continue } - available := !isUnavailable && ok + available := !isUnavailable && ok && instanceTypeZones.Has(az) && subnetZones.Has(az) offerings = append(offerings, cloudprovider.Offering{ - Zone: zone, + Zone: az, CapacityType: capacityType, Price: price, Available: available, @@ -157,56 +171,67 @@ func (p *Provider) createOfferings(ctx context.Context, instanceType *ec2.Instan return offerings } -func (p *Provider) getInstanceTypeZones(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) (map[string]sets.Set[string], error) { +func (p *Provider) getAvailabilityZones(ctx context.Context) (sets.Set[string], error) { // DO NOT REMOVE THIS LOCK ---------------------------------------------------------------------------- - // We lock here so that multiple callers to getInstanceTypeZones do not result in cache misses and multiple + // We lock here so that multiple callers to getAvailabilityZones do not result in cache misses and multiple // calls to EC2 when we could have just made one call. // TODO @joinnis: This can be made more efficient by holding a Read lock and only obtaining the Write if not in cache p.mu.Lock() defer p.mu.Unlock() + if cached, ok := p.cache.Get(AvailabilityZonesCacheKey); ok { + return cached.(sets.Set[string]), nil + } - subnetSelectorHash, err := hashstructure.Hash(nodeClass.Spec.SubnetSelectorTerms, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}) + // Get zones from EC2 + instanceTypeZones := sets.Set[string]{} + output, err := p.ec2api.DescribeAvailabilityZonesWithContext(ctx, &ec2.DescribeAvailabilityZonesInput{}) if err != nil { - return nil, fmt.Errorf("failed to hash the subnet selector: %w", err) + return nil, fmt.Errorf("describing availability zones, %w", err) } - cacheKey := fmt.Sprintf("%s%016x", InstanceTypeZonesCacheKeyPrefix, subnetSelectorHash) - if cached, ok := p.cache.Get(cacheKey); ok { - return cached.(map[string]sets.Set[string]), nil + for i := range output.AvailabilityZones { + zone := output.AvailabilityZones[i] + if aws.StringValue(zone.ZoneType) == "availability-zone" { + instanceTypeZones.Insert(aws.StringValue(zone.ZoneName)) + } } - - // Constrain AZs from subnets - subnets, err := p.subnetProvider.List(ctx, nodeClass) - if err != nil { - return nil, err + if p.cm.HasChanged("zones", instanceTypeZones) { + logging.FromContext(ctx).With("zones", instanceTypeZones.UnsortedList()).Debugf("discovered availability zones") } - if len(subnets) == 0 { - return nil, nil + p.cache.Set(AvailabilityZonesCacheKey, instanceTypeZones, 24*time.Hour) + return instanceTypeZones, nil +} + +func (p *Provider) getInstanceTypeOfferings(ctx context.Context) (map[string]sets.Set[string], error) { + // DO NOT REMOVE THIS LOCK 
---------------------------------------------------------------------------- + // We lock here so that multiple callers to getInstanceTypeOfferings do not result in cache misses and multiple + // calls to EC2 when we could have just made one call. + // TODO @joinnis: This can be made more efficient by holding a Read lock and only obtaining the Write if not in cache + p.mu.Lock() + defer p.mu.Unlock() + if cached, ok := p.cache.Get(InstanceTypeOfferingsCacheKey); ok { + return cached.(map[string]sets.Set[string]), nil } - zones := sets.NewString(lo.Map(subnets, func(subnet *ec2.Subnet, _ int) string { - return aws.StringValue(subnet.AvailabilityZone) - })...) // Get offerings from EC2 - instanceTypeZones := map[string]sets.Set[string]{} + instanceTypeOfferings := map[string]sets.Set[string]{} if err := p.ec2api.DescribeInstanceTypeOfferingsPagesWithContext(ctx, &ec2.DescribeInstanceTypeOfferingsInput{LocationType: aws.String("availability-zone")}, func(output *ec2.DescribeInstanceTypeOfferingsOutput, lastPage bool) bool { for _, offering := range output.InstanceTypeOfferings { - if zones.Has(aws.StringValue(offering.Location)) { - if _, ok := instanceTypeZones[aws.StringValue(offering.InstanceType)]; !ok { - instanceTypeZones[aws.StringValue(offering.InstanceType)] = sets.New[string]() - } - instanceTypeZones[aws.StringValue(offering.InstanceType)].Insert(aws.StringValue(offering.Location)) + if _, ok := instanceTypeOfferings[aws.StringValue(offering.InstanceType)]; !ok { + instanceTypeOfferings[aws.StringValue(offering.InstanceType)] = sets.New[string]() } + instanceTypeOfferings[aws.StringValue(offering.InstanceType)].Insert(aws.StringValue(offering.Location)) } return true }); err != nil { return nil, fmt.Errorf("describing instance type zone offerings, %w", err) } - if p.cm.HasChanged("zonal-offerings", nodeClass.Spec.SubnetSelectorTerms) { - logging.FromContext(ctx).With("zones", zones.List(), "instance-type-count", len(instanceTypeZones), "node-template", nodeClass.Name).Debugf("discovered offerings for instance types") + if p.cm.HasChanged("instance-type-count", len(instanceTypeOfferings)) { + logging.FromContext(ctx).With("instance-type-count", len(instanceTypeOfferings)).Debugf("discovered offerings for instance types") } - p.cache.SetDefault(cacheKey, instanceTypeZones) - return instanceTypeZones, nil + atomic.AddUint64(&p.instanceTypeOfferingsSeqNum, 1) + p.cache.SetDefault(InstanceTypeOfferingsCacheKey, instanceTypeOfferings) + return instanceTypeOfferings, nil } // GetInstanceTypes retrieves all instance types from the ec2 DescribeInstanceTypes API using some opinionated filters From a71a5e63ed8d8917c8a45659d6661a807eaf648f Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Mon, 23 Oct 2023 15:36:26 -0700 Subject: [PATCH 28/47] test: use legacy taint logic for disruption (#4896) Co-authored-by: njtran From ed4dd5d1ac34f6b4952795d746fd1e3f88acf578 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 17:07:15 -0700 Subject: [PATCH 29/47] chore: Bump `aws/karpenter-core` to latest (#4897) --- charts/karpenter/values.yaml | 2 ++ go.mod | 6 +++--- go.sum | 12 ++++++------ .../content/en/preview/upgrading/upgrade-guide.md | 2 +- .../en/preview/upgrading/v1beta1-reference.md | 2 ++ 5 files changed, 14 insertions(+), 10 deletions(-) diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index 1e5dd3051fca..59f56038abed 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -69,6 +69,8 @@ 
affinity: - matchExpressions: - key: karpenter.sh/provisioner-name operator: DoesNotExist + - key: karpenter.sh/nodepool + operator: DoesNotExist podAntiAffinity: requiredDuringSchedulingIgnoredDuringExecution: - topologyKey: "kubernetes.io/hostname" diff --git a/go.mod b/go.mod index 2fc5ecb3a964..aaaafd25fa19 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.46.1 - github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604 + github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 @@ -106,9 +106,9 @@ require ( gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/cloud-provider v0.28.2 // indirect + k8s.io/cloud-provider v0.28.3 // indirect k8s.io/component-base v0.28.3 // indirect - k8s.io/csi-translation-lib v0.28.2 // indirect + k8s.io/csi-translation-lib v0.28.3 // indirect k8s.io/klog/v2 v2.100.1 // indirect k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect diff --git a/go.sum b/go.sum index c893c4082f6a..771e141e9210 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.46.1 h1:U26quvBWFZMQuultLw5tloW4GnmWaChEwMZNq8uYatw= github.com/aws/aws-sdk-go v1.46.1/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604 h1:eQFElFqH3K64na70WZBh6FUFonVRKhtyUptWtpO/JdI= -github.com/aws/karpenter-core v0.31.1-0.20231020234031-e0623869f604/go.mod h1:rb3kp/3cj38tACF6udfpmIvKoQMwirSVoHNlrd66LyE= +github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 h1:BUBIrmrIOdM0I48BSC5EpgKbkJo/nkOpaOEy56+dKh4= +github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= @@ -760,12 +760,12 @@ k8s.io/apimachinery v0.28.3 h1:B1wYx8txOaCQG0HmYF6nbpU8dg6HvA06x5tEffvOe7A= k8s.io/apimachinery v0.28.3/go.mod h1:uQTKmIqs+rAYaq+DFaoD2X7pcjLOqbQX2AOiO0nIpb8= k8s.io/client-go v0.28.3 h1:2OqNb72ZuTZPKCl+4gTKvqao0AMOl9f3o2ijbAj3LI4= k8s.io/client-go v0.28.3/go.mod h1:LTykbBp9gsA7SwqirlCXBWtK0guzfhpoW4qSm7i9dxo= -k8s.io/cloud-provider v0.28.2 h1:9qsYm86hm4bnPgZbl9LE29Zfgjuq3NZR2dgtPioJ40s= -k8s.io/cloud-provider v0.28.2/go.mod h1:40fqf6MtgYho5Eu4gkyLgh5abxU/QKTMTIwBxt4ILyU= +k8s.io/cloud-provider v0.28.3 h1:9u+JjA3zIn0nqLOOa8tWnprFkffguSAhfBvo8p7LhBQ= +k8s.io/cloud-provider v0.28.3/go.mod h1:shAJxdrKu+SwwGUhkodxByPjaH8KBFZqXo6jU1F0ehI= k8s.io/component-base v0.28.3 h1:rDy68eHKxq/80RiMb2Ld/tbH8uAE75JdCqJyi6lXMzI= k8s.io/component-base v0.28.3/go.mod h1:fDJ6vpVNSk6cRo5wmDa6eKIG7UlIQkaFmZN2fYgIUD8= -k8s.io/csi-translation-lib v0.28.2 h1:63MIOXUn5bet2Mw7G+A7zFmLzQ/vzBrjvNYIlXYh/n0= -k8s.io/csi-translation-lib v0.28.2/go.mod 
h1:14Lusc0J0vnlRNXA/T7GlZcou4XFTRHC071jsz+SHvQ= +k8s.io/csi-translation-lib v0.28.3 h1:7deV+HZjV418AGikSDPW8dyzTpm4K3tNbQUp3KmR7cs= +k8s.io/csi-translation-lib v0.28.3/go.mod h1:zlrYwakCz2yji9/8EaJk+afIKPrYXPNXXLDO8DVuuTk= k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg= k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 h1:aVUu9fTY98ivBPKR9Y5w/AuzbMm96cd3YHRTU83I780= diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 462681831765..5d28a9eb9b96 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -196,7 +196,7 @@ Add `~/go/bin` to your $PATH, if you have not already done so. - Add the following taint to the old Provisioner: `karpenter.sh/legacy=true:NoSchedule` - For all the nodes owned by the Provisioner, delete one at a time as follows: `kubectl delete node ` -13. Update workload labels: Old alpha labels (`karpenter.sh/do-not-consolidate` and `karpenter.sh/do-not-evict`) are deprecated, but will not be dropped until Karpenter v1. However, you can begin updating those labels at any time with `karpenter.sh/do-not-disrupt`. +13. Update workload labels: Old alpha labels (`karpenter.sh/do-not-consolidate` and `karpenter.sh/do-not-evict`) are deprecated, but will not be dropped until Karpenter v1. However, you can begin updating those labels at any time with `karpenter.sh/do-not-disrupt`. Any pods that specified a `karpenter.sh/provisioner-name:DoesNotExist` requirement also need to add a `karpenter.sh/nodepool:DoesNotExist` requirement to ensure that the pods continue to not schedule to nodes unmanaged by Karpenter while migrating to v1beta1. 14. Check that there are no more Provisioner, AWSNodeTemplate, or Machine resources on your cluster. at which time you can delete the old CRDs. To validate this, run the following command and ensure that there are no outputs to any of them: diff --git a/website/content/en/preview/upgrading/v1beta1-reference.md b/website/content/en/preview/upgrading/v1beta1-reference.md index 8b21dcf530e7..970e1187430f 100644 --- a/website/content/en/preview/upgrading/v1beta1-reference.md +++ b/website/content/en/preview/upgrading/v1beta1-reference.md @@ -30,6 +30,8 @@ Karpenter v1beta1 introduces changes to some common labels, annotations, and sta | **v1alpha5** | **v1beta1** | | karpenter.sh/provisioner-name | karpenter.sh/nodepool | +> **Note**: Previously, you could use the `karpenter.sh/provisioner-name:DoesNotExist` requirement on pods to specify that pods should schedule to nodes unmanaged by Karpenter. With the addition of the `karpenter.sh/nodepool` label key, you now need to specify the `karpenter.sh/nodepool:DoesNotExist` requirement on these pods as well to ensure they don't schedule to nodes provisioned by the new NodePool resources. 
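A minimal sketch of that double requirement using the test helpers seen throughout this repository (illustrative only -- `coretest` is assumed to be karpenter-core's `pkg/test` package; a raw pod manifest would express the same constraint as two `nodeAffinity` `matchExpressions`):

```go
// Opt the pod out of both the old and the new Karpenter label key while
// Provisioners and NodePools coexist during the migration.
pod := coretest.Pod(coretest.PodOptions{
	NodeRequirements: []v1.NodeSelectorRequirement{
		{Key: "karpenter.sh/provisioner-name", Operator: v1.NodeSelectorOpDoesNotExist}, // v1alpha5
		{Key: "karpenter.sh/nodepool", Operator: v1.NodeSelectorOpDoesNotExist},         // v1beta1
	},
})
```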
+ | Karpenter Annotations | | |-------------------------------------|----------------------------------| From dd46c663c26bb577ac921cf4552162046780ade4 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Tue, 24 Oct 2023 00:22:12 +0000 Subject: [PATCH 30/47] chore(deps): bump the go-deps group with 1 update (#4898) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index aaaafd25fa19..08612e100a34 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible - github.com/aws/aws-sdk-go v1.46.1 + github.com/aws/aws-sdk-go v1.46.2 github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 diff --git a/go.sum b/go.sum index 771e141e9210..32dfa0edbeda 100644 --- a/go.sum +++ b/go.sum @@ -55,8 +55,8 @@ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go v1.46.1 h1:U26quvBWFZMQuultLw5tloW4GnmWaChEwMZNq8uYatw= -github.com/aws/aws-sdk-go v1.46.1/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.46.2 h1:XZbOmjtN1VCfEtQq7QNFsbxIqO+bB+bRhiOBjp6AzWc= +github.com/aws/aws-sdk-go v1.46.2/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 h1:BUBIrmrIOdM0I48BSC5EpgKbkJo/nkOpaOEy56+dKh4= github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= From f02ebffe9927c11eaca2c9a9319b905921d130f7 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Mon, 23 Oct 2023 18:20:09 -0700 Subject: [PATCH 31/47] feat: CEL Validation for NodeClaim Requirement (#4895) --- Makefile | 1 + go.mod | 4 +- go.sum | 8 +- hack/validation/requirements.sh | 12 +++ pkg/apis/crds/karpenter.sh_nodeclaims.yaml | 28 +++++++ pkg/apis/crds/karpenter.sh_nodepools.yaml | 30 +++++++ pkg/apis/v1beta1/labels.go | 2 - .../v1beta1/nodepool_validation_cel_test.go | 81 +++++++++++++++++++ pkg/cloudprovider/nodeclaim_test.go | 2 +- pkg/providers/instancetype/nodeclass_test.go | 3 - 10 files changed, 159 insertions(+), 12 deletions(-) create mode 100755 hack/validation/requirements.sh create mode 100644 pkg/apis/v1beta1/nodepool_validation_cel_test.go diff --git a/Makefile b/Makefile index 54f6cac11e08..57bf58f0be9d 100644 --- a/Makefile +++ b/Makefile @@ -124,6 +124,7 @@ verify: tidy download ## Verify code. Includes dependencies, linting, formatting go generate ./... hack/boilerplate.sh cp $(KARPENTER_CORE_DIR)/pkg/apis/crds/* pkg/apis/crds + hack/validation/requirements.sh $(foreach dir,$(MOD_DIRS),cd $(dir) && golangci-lint run $(newline)) @git diff --quiet ||\ { echo "New file modification detected in the Git working tree. 
Please check in before commit."; git --no-pager diff --name-only | uniq | awk '{print " - " $$0}'; \ diff --git a/go.mod b/go.mod index 08612e100a34..513bd5b266fa 100644 --- a/go.mod +++ b/go.mod @@ -6,8 +6,8 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible - github.com/aws/aws-sdk-go v1.46.2 - github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 + github.com/aws/aws-sdk-go v1.46.1 + github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index 32dfa0edbeda..3f949a9272ca 100644 --- a/go.sum +++ b/go.sum @@ -55,10 +55,10 @@ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go v1.46.2 h1:XZbOmjtN1VCfEtQq7QNFsbxIqO+bB+bRhiOBjp6AzWc= -github.com/aws/aws-sdk-go v1.46.2/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13 h1:BUBIrmrIOdM0I48BSC5EpgKbkJo/nkOpaOEy56+dKh4= -github.com/aws/karpenter-core v0.31.1-0.20231023230955-f52b18ab8c13/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= +github.com/aws/aws-sdk-go v1.46.1 h1:U26quvBWFZMQuultLw5tloW4GnmWaChEwMZNq8uYatw= +github.com/aws/aws-sdk-go v1.46.1/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 h1:xhotj4aEGeTH5MWM1vqdtxDQJlkoeAmYF3KeByrFef0= +github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/hack/validation/requirements.sh b/hack/validation/requirements.sh new file mode 100755 index 000000000000..b33f9774163b --- /dev/null +++ b/hack/validation/requirements.sh @@ -0,0 +1,12 @@ +# Requirements Validation + +# Adding validation for nodeclaim + +## checking for restricted labels while filtering out well known labels +yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [ + {"message": "label domain \"karpenter.k8s.aws\" is restricted", "rule": "self in [\"karpenter.k8s.aws/instance-encryption-in-transit-supported\", \"karpenter.k8s.aws/instance-category\", \"karpenter.k8s.aws/instance-hypervisor\", \"karpenter.k8s.aws/instance-family\", \"karpenter.k8s.aws/instance-generation\", \"karpenter.k8s.aws/instance-local-nvme\", \"karpenter.k8s.aws/instance-size\", \"karpenter.k8s.aws/instance-cpu\",\"karpenter.k8s.aws/instance-memory\", \"karpenter.k8s.aws/instance-network-bandwidth\", \"karpenter.k8s.aws/instance-gpu-name\", \"karpenter.k8s.aws/instance-gpu-manufacturer\", \"karpenter.k8s.aws/instance-gpu-count\", \"karpenter.k8s.aws/instance-gpu-memory\", 
\"karpenter.k8s.aws/instance-accelerator-name\", \"karpenter.k8s.aws/instance-accelerator-manufacturer\", \"karpenter.k8s.aws/instance-accelerator-count\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.k8s.aws\")"}]' -i pkg/apis/crds/karpenter.sh_nodeclaims.yaml +# # Adding validation for nodepool + +# ## checking for restricted labels while filtering out well known labels +yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.spec.properties.requirements.items.properties.key.x-kubernetes-validations += [ + {"message": "label domain \"karpenter.k8s.aws\" is restricted", "rule": "self in [\"karpenter.k8s.aws/instance-encryption-in-transit-supported\", \"karpenter.k8s.aws/instance-category\", \"karpenter.k8s.aws/instance-hypervisor\", \"karpenter.k8s.aws/instance-family\", \"karpenter.k8s.aws/instance-generation\", \"karpenter.k8s.aws/instance-local-nvme\", \"karpenter.k8s.aws/instance-size\", \"karpenter.k8s.aws/instance-cpu\",\"karpenter.k8s.aws/instance-memory\", \"karpenter.k8s.aws/instance-network-bandwidth\", \"karpenter.k8s.aws/instance-gpu-name\", \"karpenter.k8s.aws/instance-gpu-manufacturer\", \"karpenter.k8s.aws/instance-gpu-count\", \"karpenter.k8s.aws/instance-gpu-memory\", \"karpenter.k8s.aws/instance-accelerator-name\", \"karpenter.k8s.aws/instance-accelerator-manufacturer\", \"karpenter.k8s.aws/instance-accelerator-count\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.k8s.aws\")"}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 77ddfd3bf6f8..a152f6787a4f 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -182,19 +182,47 @@ spec: key: description: The label key that the selector applies to. 
type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.startsWith("node.kubernetes.io/") || self.startsWith("node-restriction.kubernetes.io/") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.startsWith("kops.k8s.io/") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") operator: description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt values: description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ required: - key - operator type: object + maxItems: 30 type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? 
(x.values.size() == 1 && int(x.values[0]) >= 0) : true)' resources: description: Resources models the resource requirements for the NodeClaim to launch properties: diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 086b9e991f7f..4d93d4ed1bf2 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -218,19 +218,49 @@ spec: key: description: The label key that the selector applies to. type: string + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.startsWith("node.kubernetes.io/") || self.startsWith("node-restriction.kubernetes.io/") || !self.find("^([^/]+)").endsWith("kubernetes.io") + - message: label domain "k8s.io" is restricted + rule: self.startsWith("kops.k8s.io/") || !self.find("^([^/]+)").endsWith("k8s.io") + - message: label domain "karpenter.sh" is restricted + rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") + - message: label "karpenter.sh/nodepool" is restricted + rule: self != "karpenter.sh/nodepool" + - message: label "kubernetes.io/hostname" is restricted + rule: self != "kubernetes.io/hostname" + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") operator: description: Represents a key's relationship to a set of values. Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string + enum: + - In + - NotIn + - Exists + - DoesNotExist + - Gt + - Lt values: description: An array of string values. If the operator is In or NotIn, the values array must be non-empty. If the operator is Exists or DoesNotExist, the values array must be empty. If the operator is Gt or Lt, the values array must have a single element, which will be interpreted as an integer. This array is replaced during a strategic merge patch. items: type: string type: array + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ required: - key - operator type: object + maxItems: 30 type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? 
x.values.size() != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' resources: description: Resources models the resource requirements for the NodeClaim to launch properties: diff --git a/pkg/apis/v1beta1/labels.go b/pkg/apis/v1beta1/labels.go index 84396f3f4d00..21559932fa3c 100644 --- a/pkg/apis/v1beta1/labels.go +++ b/pkg/apis/v1beta1/labels.go @@ -38,7 +38,6 @@ func init() { LabelInstanceCPU, LabelInstanceMemory, LabelInstanceNetworkBandwidth, - LabelInstancePods, LabelInstanceGPUName, LabelInstanceGPUManufacturer, LabelInstanceGPUCount, @@ -105,7 +104,6 @@ var ( LabelInstanceCPU = Group + "/instance-cpu" LabelInstanceMemory = Group + "/instance-memory" LabelInstanceNetworkBandwidth = Group + "/instance-network-bandwidth" - LabelInstancePods = Group + "/instance-pods" LabelInstanceGPUName = Group + "/instance-gpu-name" LabelInstanceGPUManufacturer = Group + "/instance-gpu-manufacturer" LabelInstanceGPUCount = Group + "/instance-gpu-count" diff --git a/pkg/apis/v1beta1/nodepool_validation_cel_test.go b/pkg/apis/v1beta1/nodepool_validation_cel_test.go new file mode 100644 index 000000000000..62603326c4a5 --- /dev/null +++ b/pkg/apis/v1beta1/nodepool_validation_cel_test.go @@ -0,0 +1,81 @@ +/* +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1beta1_test + +import ( + "strings" + + "github.com/Pallinder/go-randomdata" + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" + + "github.com/aws/karpenter-core/pkg/apis/v1beta1" +) + +var _ = Describe("CEL/Validation", func() { + var nodePool *v1beta1.NodePool + + BeforeEach(func() { + if env.Version.Minor() < 25 { + Skip("CEL Validation is for 1.25>") + } + nodePool = &v1beta1.NodePool{ + ObjectMeta: metav1.ObjectMeta{Name: strings.ToLower(randomdata.SillyName())}, + Spec: v1beta1.NodePoolSpec{ + Template: v1beta1.NodeClaimTemplate{ + Spec: v1beta1.NodeClaimSpec{ + NodeClassRef: &v1beta1.NodeClassReference{ + Kind: "NodeClaim", + Name: "default", + }, + Requirements: []v1.NodeSelectorRequirement{ + { + Key: v1beta1.CapacityTypeLabelKey, + Operator: v1.NodeSelectorOpExists, + }, + }, + }, + }, + }, + } + }) + It("should allow restricted domains exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.LabelDomainExceptions { + nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ + {Key: label + "/test", Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() + } + }) + It("should allow well known label exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.WellKnownLabels.Difference(sets.New(v1beta1.NodePoolLabelKey)) { + nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ + {Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() + } + }) +}) diff --git a/pkg/cloudprovider/nodeclaim_test.go b/pkg/cloudprovider/nodeclaim_test.go index 99b0851b4ca1..218d358d1e70 100644 --- a/pkg/cloudprovider/nodeclaim_test.go +++ b/pkg/cloudprovider/nodeclaim_test.go @@ -79,7 +79,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() { { Key: v1.LabelInstanceTypeStable, Operator: v1.NodeSelectorOpIn, - Values: []string{}, + Values: []string{"test-instance-type"}, }, } ExpectApplied(ctx, env.Client, nodePool, nodeClass, nodeClaim) diff --git a/pkg/providers/instancetype/nodeclass_test.go b/pkg/providers/instancetype/nodeclass_test.go index bee4807860ea..563caed3cff2 100644 --- a/pkg/providers/instancetype/nodeclass_test.go +++ b/pkg/providers/instancetype/nodeclass_test.go @@ -116,7 +116,6 @@ var _ = Describe("NodeClass/InstanceTypes", func() { v1beta1.LabelInstanceCPU: "32", v1beta1.LabelInstanceMemory: "131072", v1beta1.LabelInstanceNetworkBandwidth: "50000", - v1beta1.LabelInstancePods: "58", v1beta1.LabelInstanceGPUName: "t4", v1beta1.LabelInstanceGPUManufacturer: "nvidia", v1beta1.LabelInstanceGPUCount: "1", @@ -169,7 +168,6 @@ var _ = Describe("NodeClass/InstanceTypes", func() { v1beta1.LabelInstanceCPU: "32", v1beta1.LabelInstanceMemory: "131072", v1beta1.LabelInstanceNetworkBandwidth: "50000", - v1beta1.LabelInstancePods: "58", v1beta1.LabelInstanceGPUName: "t4", v1beta1.LabelInstanceGPUManufacturer: "nvidia", v1beta1.LabelInstanceGPUCount: "1", @@ -220,7 +218,6 @@ var _ = Describe("NodeClass/InstanceTypes", func() { v1beta1.LabelInstanceCPU: "8", v1beta1.LabelInstanceMemory: "16384", v1beta1.LabelInstanceNetworkBandwidth: "5000", - v1beta1.LabelInstancePods: "38", 
v1beta1.LabelInstanceAcceleratorName: "inferentia", v1beta1.LabelInstanceAcceleratorManufacturer: "aws", v1beta1.LabelInstanceAcceleratorCount: "1", From 5b15ce9d8d9204a9655048a88fad9637091c6492 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Mon, 23 Oct 2023 18:34:23 -0700 Subject: [PATCH 32/47] chore: Add more E2E test fixes for the Beta Suites (#4899) --- .github/workflows/e2e-matrix.yaml | 2 -- .../beta/expiration/{expiration_test.go => suite_test.go} | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) rename test/suites/beta/expiration/{expiration_test.go => suite_test.go} (99%) diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index ecf19ccfb9d7..c9e2948f1d5a 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -61,13 +61,11 @@ jobs: - Alpha/Integration - Alpha/Machine - Alpha/Consolidation - - Alpha/Utilization - Alpha/Interruption - Alpha/Drift - Alpha/Expiration - Alpha/Chaos - Alpha/IPv6 - - Alpha/Scale uses: ./.github/workflows/e2e.yaml with: suite: ${{ matrix.suite }} diff --git a/test/suites/beta/expiration/expiration_test.go b/test/suites/beta/expiration/suite_test.go similarity index 99% rename from test/suites/beta/expiration/expiration_test.go rename to test/suites/beta/expiration/suite_test.go index 224d12827e8f..623feec43ea3 100644 --- a/test/suites/beta/expiration/expiration_test.go +++ b/test/suites/beta/expiration/suite_test.go @@ -128,8 +128,7 @@ var _ = Describe("Expiration", func() { // Eventually the node will be set as unschedulable, which means its actively being deprovisioned Eventually(func(g Gomega) { - n := &v1.Node{} - g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).Should(Succeed()) _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool { return corev1beta1.IsDisruptingTaint(t) }) @@ -197,8 +196,7 @@ var _ = Describe("Expiration", func() { // Eventually the node will be set as unschedulable, which means its actively being deprovisioned Eventually(func(g Gomega) { - n := &v1.Node{} - g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), n)).Should(Succeed()) + g.Expect(env.Client.Get(env.Context, client.ObjectKeyFromObject(node), node)).Should(Succeed()) _, ok := lo.Find(node.Spec.Taints, func(t v1.Taint) bool { return corev1beta1.IsDisruptingTaint(t) }) From 4ca41b877e06a36b06ba81f068b32333cb9aa338 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 23 Oct 2023 18:51:00 -0700 Subject: [PATCH 33/47] chore(deps): bump the go-deps group with 1 update (#4901) Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- go.mod | 2 +- go.sum | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 513bd5b266fa..f28c592a6f18 100644 --- a/go.mod +++ b/go.mod @@ -6,7 +6,7 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible - github.com/aws/aws-sdk-go v1.46.1 + github.com/aws/aws-sdk-go v1.46.2 github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 diff --git a/go.sum b/go.sum index 3f949a9272ca..9967d78da446 100644 --- a/go.sum +++ b/go.sum @@ -55,8 +55,8 @@ github.com/andybalholm/cascadia 
v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go v1.46.1 h1:U26quvBWFZMQuultLw5tloW4GnmWaChEwMZNq8uYatw= -github.com/aws/aws-sdk-go v1.46.1/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= +github.com/aws/aws-sdk-go v1.46.2 h1:XZbOmjtN1VCfEtQq7QNFsbxIqO+bB+bRhiOBjp6AzWc= +github.com/aws/aws-sdk-go v1.46.2/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 h1:xhotj4aEGeTH5MWM1vqdtxDQJlkoeAmYF3KeByrFef0= github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= From 0511f5d57c9e89d51ae1144127404c2b2b47f560 Mon Sep 17 00:00:00 2001 From: Nick Tran <10810510+njtran@users.noreply.github.com> Date: Tue, 24 Oct 2023 13:01:53 -0700 Subject: [PATCH 34/47] chore: fix env vars for v1beta1 controller policy (#4909) --- .../upgrading/v1beta1-controller-policy.json | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/website/content/en/preview/upgrading/v1beta1-controller-policy.json b/website/content/en/preview/upgrading/v1beta1-controller-policy.json index 97b66c6b983d..69e70f4f4ceb 100644 --- a/website/content/en/preview/upgrading/v1beta1-controller-policy.json +++ b/website/content/en/preview/upgrading/v1beta1-controller-policy.json @@ -5,12 +5,12 @@ "Sid": "AllowScopedEC2InstanceActions", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${REGION}::image/*", - "arn:${AWS_PARTITION}:ec2:${REGION}::snapshot/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:spot-instances-request/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:security-group/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:subnet/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}::image/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}::snapshot/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:spot-instances-request/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:security-group/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:subnet/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" ], "Action": [ "ec2:RunInstances", @@ -21,11 +21,11 @@ "Sid": "AllowScopedEC2InstanceActionsWithTags", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${REGION}:*:fleet/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:volume/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:network-interface/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:fleet/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:volume/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:network-interface/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" ], "Action": [ "ec2:RunInstances", @@ -45,11 +45,11 @@ "Sid": "AllowScopedResourceCreationTagging", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${REGION}:*:fleet/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:volume/*", - 
"arn:${AWS_PARTITION}:ec2:${REGION}:*:network-interface/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:fleet/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:volume/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:network-interface/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" ], "Action": "ec2:CreateTags", "Condition": { @@ -69,7 +69,7 @@ { "Sid": "AllowScopedResourceTagging", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", + "Resource": "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", "Action": "ec2:CreateTags", "Condition": { "StringEquals": { @@ -90,8 +90,8 @@ "Sid": "AllowScopedDeletion", "Effect": "Allow", "Resource": [ - "arn:${AWS_PARTITION}:ec2:${REGION}:*:instance/*", - "arn:${AWS_PARTITION}:ec2:${REGION}:*:launch-template/*" + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:instance/*", + "arn:${AWS_PARTITION}:ec2:${AWS_REGION}:*:launch-template/*" ], "Action": [ "ec2:TerminateInstances", @@ -123,14 +123,14 @@ ], "Condition": { "StringEquals": { - "aws:RequestedRegion": "${REGION}" + "aws:RequestedRegion": "${AWS_REGION}" } } }, { "Sid": "AllowSSMReadActions", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:ssm:${REGION}::parameter/aws/service/*", + "Resource": "arn:${AWS_PARTITION}:ssm:${AWS_REGION}::parameter/aws/service/*", "Action": "ssm:GetParameter" }, { @@ -142,7 +142,7 @@ { "Sid": "AllowInterruptionQueueActions", "Effect": "Allow", - "Resource": "arn:aws:sqs:${REGION}:${AWS_ACCOUNT_ID}:${CLUSTER_NAME}", + "Resource": "arn:aws:sqs:${AWS_REGION}:${AWS_ACCOUNT_ID}:${CLUSTER_NAME}", "Action": [ "sqs:DeleteMessage", "sqs:GetQueueAttributes", @@ -169,7 +169,7 @@ "Condition": { "StringEquals": { "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:RequestTag/karpenter.k8s.aws/ec2nodeclass": "*" @@ -184,9 +184,9 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}", + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}", "aws:RequestTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:RequestTag/topology.kubernetes.io/region": "${REGION}" + "aws:RequestTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*", @@ -206,7 +206,7 @@ "Condition": { "StringEquals": { "aws:ResourceTag/kubernetes.io/cluster/${CLUSTER_NAME}": "owned", - "aws:ResourceTag/topology.kubernetes.io/region": "${REGION}" + "aws:ResourceTag/topology.kubernetes.io/region": "${AWS_REGION}" }, "StringLike": { "aws:ResourceTag/karpenter.k8s.aws/ec2nodeclass": "*" @@ -222,8 +222,8 @@ { "Sid": "AllowAPIServerEndpointDiscovery", "Effect": "Allow", - "Resource": "arn:${AWS_PARTITION}:eks:${REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", + "Resource": "arn:${AWS_PARTITION}:eks:${AWS_REGION}:${AWS_ACCOUNT_ID}:cluster/${CLUSTER_NAME}", "Action": "eks:DescribeCluster" } ] -} \ No newline at end of file +} From 3be36eef51a056130d7c6ef03963c6e2934ee8c5 Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Tue, 24 Oct 2023 13:56:43 -0700 Subject: [PATCH 35/47] chore: Audited and documented permissions for gha (#4910) --- .github/actions/upload-artifact/action.yaml | 9 --------- 
.github/workflows/approval-comment.yaml | 6 ++++-- .github/workflows/codegen.yaml | 6 +++--- .github/workflows/codeql-analysis.yaml | 5 ++--- .github/workflows/docgen.yaml | 2 +- .github/workflows/e2e-upgrade.yaml | 5 ++--- .github/workflows/e2e.yaml | 5 ++--- .github/workflows/pr-snapshot.yaml | 2 -- .github/workflows/publish-test-tools.yaml | 2 +- .github/workflows/release.yaml | 6 +++--- .github/workflows/snapshot.yaml | 2 +- .github/workflows/stale.yaml | 5 ++--- .github/workflows/sweeper.yaml | 3 +-- 13 files changed, 22 insertions(+), 36 deletions(-) delete mode 100644 .github/actions/upload-artifact/action.yaml diff --git a/.github/actions/upload-artifact/action.yaml b/.github/actions/upload-artifact/action.yaml deleted file mode 100644 index b9d384456b7d..000000000000 --- a/.github/actions/upload-artifact/action.yaml +++ /dev/null @@ -1,9 +0,0 @@ -name: UploadArtifacts -description: 'Uploads artifacts of a workflow as an archive of a directory so that another workflow that runs on workflow_run can download and use it' -runs: - using: "composite" - steps: - - uses: actions/upload-artifact@v3 - with: - name: artifacts - path: /tmp/artifacts diff --git a/.github/workflows/approval-comment.yaml b/.github/workflows/approval-comment.yaml index b02597e22a37..d53c1bd6cb77 100644 --- a/.github/workflows/approval-comment.yaml +++ b/.github/workflows/approval-comment.yaml @@ -6,7 +6,6 @@ on: jobs: approval-comment: if: startsWith(github.event.review.body, '/karpenter snapshot') || startsWith(github.event.review.body, '/karpenter scale') || startsWith(github.event.review.body, '/karpenter conformance') - permissions: write-all runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 @@ -21,4 +20,7 @@ jobs: echo ${{ github.event.pull_request.number }} >> /tmp/artifacts/metadata.txt echo ${{ github.event.review.commit_id }} >> /tmp/artifacts/metadata.txt cat /tmp/artifacts/metadata.txt - - uses: ./.github/actions/upload-artifact + - uses: actions/upload-artifact@v3 + with: + name: artifacts + path: /tmp/artifacts diff --git a/.github/workflows/codegen.yaml b/.github/workflows/codegen.yaml index a45f7eb2bf61..d92447dbd44e 100644 --- a/.github/workflows/codegen.yaml +++ b/.github/workflows/codegen.yaml @@ -5,9 +5,9 @@ on: - cron: '0 13 * * MON' permissions: - id-token: write - pull-requests: write - contents: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 + pull-requests: write # name: Create Pull Request + contents: write # name: Create Pull Request jobs: codegen: diff --git a/.github/workflows/codeql-analysis.yaml b/.github/workflows/codeql-analysis.yaml index bc61db907386..60f764dbe505 100644 --- a/.github/workflows/codeql-analysis.yaml +++ b/.github/workflows/codeql-analysis.yaml @@ -12,9 +12,8 @@ jobs: name: Analyze runs-on: ubuntu-latest permissions: - actions: read - contents: read - security-events: write + actions: read # github/codeql-action/init@v2 + security-events: write # github/codeql-action/init@v2 strategy: fail-fast: false diff --git a/.github/workflows/docgen.yaml b/.github/workflows/docgen.yaml index 97a116bf437b..c65be3d2bdd4 100644 --- a/.github/workflows/docgen.yaml +++ b/.github/workflows/docgen.yaml @@ -5,7 +5,7 @@ on: branches: [main] permissions: - id-token: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 jobs: docgen-ci: diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index 1e418d872335..8a71094b63e0 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ 
b/.github/workflows/e2e-upgrade.yaml @@ -46,9 +46,8 @@ on: SLACK_WEBHOOK_URL: required: true permissions: - id-token: write # This is required for requesting the JWT - contents: read # This is required for actions/checkout - statuses: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 + statuses: write # ./.github/actions/commit-status/start jobs: run-suite: name: suite-upgrade diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 47b373afbd0b..42abf5c92bfc 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -74,9 +74,8 @@ on: SLACK_WEBHOOK_URL: required: true permissions: - id-token: write # This is required for requesting the JWT - contents: read # This is required for actions/checkout - statuses: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 + statuses: write # ./.github/actions/commit-status/start jobs: run-suite: name: suite-${{ inputs.suite }} diff --git a/.github/workflows/pr-snapshot.yaml b/.github/workflows/pr-snapshot.yaml index 8bd9e961c7a7..f720df036147 100644 --- a/.github/workflows/pr-snapshot.yaml +++ b/.github/workflows/pr-snapshot.yaml @@ -5,8 +5,6 @@ on: types: [completed] permissions: id-token: write - pull-requests: write - contents: write statuses: write jobs: release: diff --git a/.github/workflows/publish-test-tools.yaml b/.github/workflows/publish-test-tools.yaml index f9365b210dbe..09f0f50325d3 100644 --- a/.github/workflows/publish-test-tools.yaml +++ b/.github/workflows/publish-test-tools.yaml @@ -8,7 +8,7 @@ on: schedule: - cron: '0 13 * * MON' permissions: - id-token: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 jobs: publish-tools: if: github.repository == 'aws/karpenter' diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 798c905b1ff5..c8dd1d914610 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -3,9 +3,9 @@ on: push: tags: [ 'v*.*.*' ] permissions: - id-token: write - pull-requests: write - contents: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 + contents: write # marvinpinto/action-automatic-releases@latest + pull-requests: write # name: Create PR jobs: release: if: github.repository == 'aws/karpenter' diff --git a/.github/workflows/snapshot.yaml b/.github/workflows/snapshot.yaml index da8f58f41840..50add2d98f88 100644 --- a/.github/workflows/snapshot.yaml +++ b/.github/workflows/snapshot.yaml @@ -3,7 +3,7 @@ on: push: branches: [ main ] permissions: - id-token: write + id-token: write # aws-actions/configure-aws-credentials@v4.0.1 jobs: release: if: github.repository == 'aws/karpenter' diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml index 219fcef93cf2..2d27333c2a36 100644 --- a/.github/workflows/stale.yaml +++ b/.github/workflows/stale.yaml @@ -7,9 +7,8 @@ jobs: StaleBot: runs-on: ubuntu-latest permissions: - issues: write - discussions: write - pull-requests: write + issues: write # actions/stale@v8.0.0 + pull-requests: write # actions/stale@v8.0.0 if: github.repository == 'aws/karpenter' name: Stale issue bot steps: diff --git a/.github/workflows/sweeper.yaml b/.github/workflows/sweeper.yaml index 8d0138804f22..894f50400b0b 100644 --- a/.github/workflows/sweeper.yaml +++ b/.github/workflows/sweeper.yaml @@ -4,8 +4,7 @@ on: - cron: '0 */12 * * *' workflow_dispatch: permissions: - id-token: write # This is required for requesting the JWT - contents: read # This is required for actions/checkout + id-token: write # 
aws-actions/configure-aws-credentials@v4.0.1 jobs: sweeper: if: vars.ACCOUNT_ID != '' || github.event_name == 'workflow_dispatch' From 206c1307d89ef7db3e6abfed04ce1487812558ab Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Tue, 24 Oct 2023 14:39:40 -0700 Subject: [PATCH 36/47] feat: CEL Validation for `NodePool.Spec.Template` Labels (#4906) --- Makefile | 1 + go.mod | 2 +- go.sum | 4 +- hack/validation/labels.sh | 7 ++ pkg/apis/crds/karpenter.sh_nodepools.yaml | 16 +++++ .../v1beta1/nodepool_validation_cel_test.go | 68 +++++++++++++------ 6 files changed, 75 insertions(+), 23 deletions(-) create mode 100755 hack/validation/labels.sh diff --git a/Makefile b/Makefile index 57bf58f0be9d..40ceb42efbf2 100644 --- a/Makefile +++ b/Makefile @@ -125,6 +125,7 @@ verify: tidy download ## Verify code. Includes dependencies, linting, formatting hack/boilerplate.sh cp $(KARPENTER_CORE_DIR)/pkg/apis/crds/* pkg/apis/crds hack/validation/requirements.sh + hack/validation/labels.sh $(foreach dir,$(MOD_DIRS),cd $(dir) && golangci-lint run $(newline)) @git diff --quiet ||\ { echo "New file modification detected in the Git working tree. Please check in before commit."; git --no-pager diff --name-only | uniq | awk '{print " - " $$0}'; \ diff --git a/go.mod b/go.mod index f28c592a6f18..525b62559f0b 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.46.2 - github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 + github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555 github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index 9967d78da446..804350daabef 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.46.2 h1:XZbOmjtN1VCfEtQq7QNFsbxIqO+bB+bRhiOBjp6AzWc= github.com/aws/aws-sdk-go v1.46.2/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49 h1:xhotj4aEGeTH5MWM1vqdtxDQJlkoeAmYF3KeByrFef0= -github.com/aws/karpenter-core v0.31.1-0.20231024004605-223dcd0fda49/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= +github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555 h1:Rr+u23lxqMaubLd8JLks3L9uO8ACLSNbzID3VXIm/B4= +github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/hack/validation/labels.sh b/hack/validation/labels.sh new file mode 100755 index 000000000000..53f42a8c5a13 --- /dev/null +++ b/hack/validation/labels.sh @@ -0,0 +1,7 @@ +# Labels Validation + +# # Adding validation for nodepool + +# ## checking for restricted labels while filtering out well known labels +yq eval 
'.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations += [ + {"message": "label domain \"karpenter.k8s.aws\" is restricted", "rule": "self in [\"karpenter.k8s.aws/instance-encryption-in-transit-supported\", \"karpenter.k8s.aws/instance-category\", \"karpenter.k8s.aws/instance-hypervisor\", \"karpenter.k8s.aws/instance-family\", \"karpenter.k8s.aws/instance-generation\", \"karpenter.k8s.aws/instance-local-nvme\", \"karpenter.k8s.aws/instance-size\", \"karpenter.k8s.aws/instance-cpu\",\"karpenter.k8s.aws/instance-memory\", \"karpenter.k8s.aws/instance-network-bandwidth\", \"karpenter.k8s.aws/instance-gpu-name\", \"karpenter.k8s.aws/instance-gpu-manufacturer\", \"karpenter.k8s.aws/instance-gpu-count\", \"karpenter.k8s.aws/instance-gpu-memory\", \"karpenter.k8s.aws/instance-accelerator-name\", \"karpenter.k8s.aws/instance-accelerator-manufacturer\", \"karpenter.k8s.aws/instance-accelerator-count\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.k8s.aws\")"}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml \ No newline at end of file diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 4d93d4ed1bf2..196df4d85f4b 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -90,8 +90,24 @@ spec: labels: additionalProperties: type: string + maxLength: 63 + pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ description: 'Map of string keys and values that can be used to organize and categorize (scope and select) objects. May match selectors of replication controllers and services. More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels' type: object + maxProperties: 100 + x-kubernetes-validations: + - message: label domain "kubernetes.io" is restricted + rule: self.all(x, x in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || x.startsWith("node.kubernetes.io") || x.startsWith("node-restriction.kubernetes.io") || !x.find("^([^/]+)").endsWith("kubernetes.io")) + - message: label domain "k8s.io" is restricted + rule: self.all(x, x.startsWith("kops.k8s.io") || !x.find("^([^/]+)").endsWith("k8s.io")) + - message: label domain "karpenter.sh" is restricted + rule: self.all(x, x in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !x.find("^([^/]+)").endsWith("karpenter.sh")) + - message: label "karpenter.sh/nodepool" is restricted + rule: self.all(x, x != "karpenter.sh/nodepool") + - message: label "kubernetes.io/hostname" is restricted + rule: self.all(x, x != "kubernetes.io/hostname") + - message: label domain "karpenter.k8s.aws" is restricted + rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", 
"karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") type: object spec: description: NodeClaimSpec describes the desired state of the NodeClaim diff --git a/pkg/apis/v1beta1/nodepool_validation_cel_test.go b/pkg/apis/v1beta1/nodepool_validation_cel_test.go index 62603326c4a5..f7e03c3191c1 100644 --- a/pkg/apis/v1beta1/nodepool_validation_cel_test.go +++ b/pkg/apis/v1beta1/nodepool_validation_cel_test.go @@ -54,28 +54,56 @@ var _ = Describe("CEL/Validation", func() { }, } }) - It("should allow restricted domains exceptions", func() { - oldNodePool := nodePool.DeepCopy() - for label := range v1beta1.LabelDomainExceptions { - nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ - {Key: label + "/test", Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + Context("Requirements", func() { + It("should allow restricted domains exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.LabelDomainExceptions { + nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ + {Key: label + "/test", Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() } - Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) - Expect(nodePool.RuntimeValidate()).To(Succeed()) - Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) - nodePool = oldNodePool.DeepCopy() - } + }) + It("should allow well known label exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.WellKnownLabels.Difference(sets.New(v1beta1.NodePoolLabelKey)) { + nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ + {Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() + } + }) }) - It("should allow well known label exceptions", func() { - oldNodePool := nodePool.DeepCopy() - for label := range v1beta1.WellKnownLabels.Difference(sets.New(v1beta1.NodePoolLabelKey)) { - nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{ - {Key: label, Operator: v1.NodeSelectorOpIn, Values: []string{"test"}}, + Context("Labels", func() { + It("should allow restricted domains exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.LabelDomainExceptions { + nodePool.Spec.Template.Labels = map[string]string{ + label: "test", + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() } - Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) - Expect(nodePool.RuntimeValidate()).To(Succeed()) - Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) - nodePool = oldNodePool.DeepCopy() - } + }) + It("should allow well known label exceptions", func() { + oldNodePool := nodePool.DeepCopy() + for label := range v1beta1.WellKnownLabels.Difference(sets.New(v1beta1.NodePoolLabelKey)) { + nodePool.Spec.Template.Labels = map[string]string{ + label: 
"test", + } + Expect(env.Client.Create(ctx, nodePool)).To(Succeed()) + Expect(nodePool.RuntimeValidate()).To(Succeed()) + Expect(env.Client.Delete(ctx, nodePool)).To(Succeed()) + nodePool = oldNodePool.DeepCopy() + } + }) }) }) From 838c163dbd3933473285c85e6324010b530ce856 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Tue, 24 Oct 2023 15:20:54 -0700 Subject: [PATCH 37/47] chore: CEL NodeClaimsTemplate Labels Clean-up (#4916) --- hack/validation/labels.sh | 2 +- pkg/apis/crds/karpenter.sh_nodepools.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hack/validation/labels.sh b/hack/validation/labels.sh index 53f42a8c5a13..144b66c1e053 100755 --- a/hack/validation/labels.sh +++ b/hack/validation/labels.sh @@ -4,4 +4,4 @@ # ## checking for restricted labels while filtering out well known labels yq eval '.spec.versions[0].schema.openAPIV3Schema.properties.spec.properties.template.properties.metadata.properties.labels.x-kubernetes-validations += [ - {"message": "label domain \"karpenter.k8s.aws\" is restricted", "rule": "self in [\"karpenter.k8s.aws/instance-encryption-in-transit-supported\", \"karpenter.k8s.aws/instance-category\", \"karpenter.k8s.aws/instance-hypervisor\", \"karpenter.k8s.aws/instance-family\", \"karpenter.k8s.aws/instance-generation\", \"karpenter.k8s.aws/instance-local-nvme\", \"karpenter.k8s.aws/instance-size\", \"karpenter.k8s.aws/instance-cpu\",\"karpenter.k8s.aws/instance-memory\", \"karpenter.k8s.aws/instance-network-bandwidth\", \"karpenter.k8s.aws/instance-gpu-name\", \"karpenter.k8s.aws/instance-gpu-manufacturer\", \"karpenter.k8s.aws/instance-gpu-count\", \"karpenter.k8s.aws/instance-gpu-memory\", \"karpenter.k8s.aws/instance-accelerator-name\", \"karpenter.k8s.aws/instance-accelerator-manufacturer\", \"karpenter.k8s.aws/instance-accelerator-count\"] || !self.find(\"^([^/]+)\").endsWith(\"karpenter.k8s.aws\")"}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml \ No newline at end of file + {"message": "label domain \"karpenter.k8s.aws\" is restricted", "rule": "self.all(x, x in [\"karpenter.k8s.aws/instance-encryption-in-transit-supported\", \"karpenter.k8s.aws/instance-category\", \"karpenter.k8s.aws/instance-hypervisor\", \"karpenter.k8s.aws/instance-family\", \"karpenter.k8s.aws/instance-generation\", \"karpenter.k8s.aws/instance-local-nvme\", \"karpenter.k8s.aws/instance-size\", \"karpenter.k8s.aws/instance-cpu\",\"karpenter.k8s.aws/instance-memory\", \"karpenter.k8s.aws/instance-network-bandwidth\", \"karpenter.k8s.aws/instance-gpu-name\", \"karpenter.k8s.aws/instance-gpu-manufacturer\", \"karpenter.k8s.aws/instance-gpu-count\", \"karpenter.k8s.aws/instance-gpu-memory\", \"karpenter.k8s.aws/instance-accelerator-name\", \"karpenter.k8s.aws/instance-accelerator-manufacturer\", \"karpenter.k8s.aws/instance-accelerator-count\"] || !x.find(\"^([^/]+)\").endsWith(\"karpenter.k8s.aws\"))"}]' -i pkg/apis/crds/karpenter.sh_nodepools.yaml \ No newline at end of file diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index 196df4d85f4b..1f6ae1f03983 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -107,7 +107,7 @@ spec: - message: label "kubernetes.io/hostname" is restricted rule: self.all(x, x != "kubernetes.io/hostname") - message: label domain "karpenter.k8s.aws" is restricted - rule: self in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", 
"karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !self.find("^([^/]+)").endsWith("karpenter.k8s.aws") + rule: self.all(x, x in ["karpenter.k8s.aws/instance-encryption-in-transit-supported", "karpenter.k8s.aws/instance-category", "karpenter.k8s.aws/instance-hypervisor", "karpenter.k8s.aws/instance-family", "karpenter.k8s.aws/instance-generation", "karpenter.k8s.aws/instance-local-nvme", "karpenter.k8s.aws/instance-size", "karpenter.k8s.aws/instance-cpu","karpenter.k8s.aws/instance-memory", "karpenter.k8s.aws/instance-network-bandwidth", "karpenter.k8s.aws/instance-gpu-name", "karpenter.k8s.aws/instance-gpu-manufacturer", "karpenter.k8s.aws/instance-gpu-count", "karpenter.k8s.aws/instance-gpu-memory", "karpenter.k8s.aws/instance-accelerator-name", "karpenter.k8s.aws/instance-accelerator-manufacturer", "karpenter.k8s.aws/instance-accelerator-count"] || !x.find("^([^/]+)").endsWith("karpenter.k8s.aws")) type: object spec: description: NodeClaimSpec describes the desired state of the NodeClaim From 7e93cb3b0b1cf19fe783e1730159b92a6480abcc Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 24 Oct 2023 15:35:47 -0700 Subject: [PATCH 38/47] docs: Bump k8s version in docs to `1.28` (#4914) --- website/content/en/preview/faq.md | 6 +++--- .../scripts/step02-create-cluster.sh | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/website/content/en/preview/faq.md b/website/content/en/preview/faq.md index 7f91d19a341a..f951643f7027 100644 --- a/website/content/en/preview/faq.md +++ b/website/content/en/preview/faq.md @@ -176,10 +176,10 @@ Yes, see the [KubeletConfiguration Section in the NodePool docs]({{ Date: Tue, 24 Oct 2023 15:48:06 -0700 Subject: [PATCH 39/47] fix: Restore PR write permissions to PRSnapshot (#4917) --- .github/workflows/pr-snapshot.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/pr-snapshot.yaml b/.github/workflows/pr-snapshot.yaml index f720df036147..1aa259882ac8 100644 --- a/.github/workflows/pr-snapshot.yaml +++ b/.github/workflows/pr-snapshot.yaml @@ -5,6 +5,7 @@ on: types: [completed] permissions: id-token: write + pull-requests: write statuses: write jobs: release: From 7e73622784b1a9376fae242519d78cfbbcad9837 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 24 Oct 2023 16:14:04 -0700 Subject: [PATCH 40/47] docs: Update some docs around Helm Values changes for `featureGates` (#4912) --- charts/karpenter/README.md | 27 ++------ charts/karpenter/values.yaml | 48 -------------- .../en/preview/upgrading/upgrade-guide.md | 21 ++----- .../en/preview/upgrading/v1beta1-reference.md | 62 +++++++++++++------ 4 files changed, 54 insertions(+), 104 deletions(-) diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md index dd65466fd39e..e5c279c3ab5c 100644 --- a/charts/karpenter/README.md +++ b/charts/karpenter/README.md @@ -30,18 +30,15 @@ helm upgrade --install --namespace karpenter --create-namespace \ | additionalAnnotations | object | `{}` | 
Additional annotations to add into metadata. | | additionalClusterRoleRules | list | `[]` | Specifies additional rules for the core ClusterRole. | | additionalLabels | object | `{}` | Additional labels to add into metadata. | -| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. | +| affinity | object | `{"nodeAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":{"nodeSelectorTerms":[{"matchExpressions":[{"key":"karpenter.sh/provisioner-name","operator":"DoesNotExist"},{"key":"karpenter.sh/nodepool","operator":"DoesNotExist"}]}]}},"podAntiAffinity":{"requiredDuringSchedulingIgnoredDuringExecution":[{"topologyKey":"kubernetes.io/hostname"}]}}` | Affinity rules for scheduling the pod. If an explicit label selector is not provided for pod affinity or pod anti-affinity one will be created from the pod selector labels. | | controller.env | list | `[]` | Additional environment variables for the controller pod. | | controller.envFrom | list | `[]` | | -| controller.errorOutputPaths | list | `["stderr"]` | Controller errorOutputPaths - defaults to stderr only (Deprecated: Use logConfig.errorOutputPaths instead) | | controller.extraVolumeMounts | list | `[]` | Additional volumeMounts for the controller pod. | | controller.healthProbe.port | int | `8081` | The container port to use for http health probe. | | controller.image.digest | string | `"sha256:d29767fa9c5c0511a3812397c932f5735234f03a7a875575422b712d15e54a77"` | SHA256 digest of the controller image. | | controller.image.repository | string | `"public.ecr.aws/karpenter/controller"` | Repository path to the controller image. | | controller.image.tag | string | `"v0.31.0"` | Tag of the controller image. | -| controller.logLevel | string | `""` | Controller log level, defaults to the global log level (Deprecated: Use logConfig.logLevel.controller instead) | | controller.metrics.port | int | `8000` | The container port to use for metrics. | -| controller.outputPaths | list | `["stdout"]` | Controller outputPaths - defaults to stdout only (Deprecated: Use logConfig.outputPaths instead) | | controller.resources | object | `{}` | Resources for the controller pod. | | controller.sidecarContainer | list | `[]` | Additional sidecarContainer config | | controller.sidecarVolumeMounts | list | `[]` | Additional volumeMounts for the sidecar - this will be added to the volume mounts on top of extraVolumeMounts | @@ -61,7 +58,6 @@ helm upgrade --install --namespace karpenter --create-namespace \ | logConfig.logLevel.global | string | `"debug"` | Global log level, defaults to 'debug' | | logConfig.logLevel.webhook | string | `"error"` | Error log level, defaults to 'error' | | logConfig.outputPaths | list | `["stdout"]` | Log outputPaths - defaults to stdout only | -| logEncoding | string | `"console"` | Global log encoding (Deprecated: Use logConfig.logEncoding instead) | | logLevel | string | `"debug"` | Global log level | | nameOverride | string | `""` | Overrides the chart's name. | | nodeSelector | object | `{"kubernetes.io/os":"linux"}` | Node selectors to schedule the pod to nodes with labels. 
| @@ -78,30 +74,16 @@ helm upgrade --install --namespace karpenter --create-namespace \ | serviceMonitor.additionalLabels | object | `{}` | Additional labels for the ServiceMonitor. | | serviceMonitor.enabled | bool | `false` | Specifies whether a ServiceMonitor should be created. | | serviceMonitor.endpointConfig | object | `{}` | Endpoint configuration for the ServiceMonitor. | -| settings | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","aws":{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"reservedENIs":"0","tags":null,"vmMemoryOverheadPercent":0.075},"batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","featureGates":{"driftEnabled":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | +| settings | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","batchIdleDuration":"1s","batchMaxDuration":"10s","clusterCABundle":"","clusterEndpoint":"","clusterName":"","featureGates":{"drift":false},"interruptionQueue":"","isolatedVPC":false,"reservedENIs":"0","vmMemoryOverheadPercent":0.075}` | Global Settings to configure Karpenter | | settings.assumeRoleARN | string | `""` | Role to assume for calling AWS services. | | settings.assumeRoleDuration | string | `"15m"` | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. | -| settings.aws | object | `{"assumeRoleARN":"","assumeRoleDuration":"15m","clusterCABundle":"","clusterEndpoint":"","clusterName":"","defaultInstanceProfile":"","enableENILimitedPodDensity":true,"enablePodENI":false,"interruptionQueueName":"","isolatedVPC":false,"reservedENIs":"0","tags":null,"vmMemoryOverheadPercent":0.075}` | AWS-specific configuration values (Deprecated: Use values without the "aws" prefix instead) | -| settings.aws.assumeRoleARN | string | `""` | Role to assume for calling AWS services. | -| settings.aws.assumeRoleDuration | string | `"15m"` | Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. | -| settings.aws.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. | -| settings.aws.clusterEndpoint | string | `""` | Cluster endpoint. If not set, will be discovered during startup (EKS only) | -| settings.aws.clusterName | string | `""` | Cluster name. | -| settings.aws.defaultInstanceProfile | string | `""` | The default instance profile to use when launching nodes | -| settings.aws.enableENILimitedPodDensity | bool | `true` | Indicates whether new nodes should use ENI-based pod density DEPRECATED: Use `.spec.kubeletConfiguration.maxPods` to set pod density on a per-provisioner basis | -| settings.aws.enablePodENI | bool | `false` | If true then instances that support pod ENI will report a vpc.amazonaws.com/pod-eni resource | -| settings.aws.interruptionQueueName | string | `""` | interruptionQueueName is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. 
| -| settings.aws.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | -| settings.aws.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | -| settings.aws.tags | string | `nil` | The global tags to use on all AWS infrastructure resources (launch templates, instances, etc.) across node templates | -| settings.aws.vmMemoryOverheadPercent | float | `0.075` | The VM memory overhead as a percent that will be subtracted from the total memory for all instance types | | settings.batchIdleDuration | string | `"1s"` | The maximum amount of time with no new ending pods that if exceeded ends the current batching window. If pods arrive faster than this time, the batching window will be extended up to the maxDuration. If they arrive slower, the pods will be batched separately. | | settings.batchMaxDuration | string | `"10s"` | The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one time which usually results in fewer but larger nodes. | | settings.clusterCABundle | string | `""` | Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. | | settings.clusterEndpoint | string | `""` | Cluster endpoint. If not set, will be discovered during startup (EKS only) | | settings.clusterName | string | `""` | Cluster name. | -| settings.featureGates | object | `{"driftEnabled":false}` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | -| settings.featureGates.driftEnabled | bool | `false` | driftEnabled is in ALPHA and is disabled by default. Setting driftEnabled to true enables the drift deprovisioner to watch for drift between currently deployed nodes and the desired state of nodes set in provisioners and node templates | +| settings.featureGates | object | `{"drift":false}` | Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features | +| settings.featureGates.drift | bool | `false` | drift is in ALPHA and is disabled by default. Setting drift to true enables the drift disruption method to watch for drift between currently deployed nodes and the desired state of nodes set in provisioners and node templates | | settings.interruptionQueue | string | `""` | interruptionQueue is disabled if not specified. Enabling interruption handling may require additional permissions on the controller service account. Additional permissions are outlined in the docs. 
| | settings.isolatedVPC | bool | `false` | If true then assume we can't reach AWS services which don't have a VPC endpoint This also has the effect of disabling look-ups to the AWS pricing endpoint | | settings.reservedENIs | string | `"0"` | Reserved ENIs are not included in the calculations for max-pods or kube-reserved This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html | @@ -111,7 +93,6 @@ helm upgrade --install --namespace karpenter --create-namespace \ | tolerations | list | `[{"key":"CriticalAddonsOnly","operator":"Exists"}]` | Tolerations to allow the pod to be scheduled to nodes with taints. | | topologySpreadConstraints | list | `[{"maxSkew":1,"topologyKey":"topology.kubernetes.io/zone","whenUnsatisfiable":"ScheduleAnyway"}]` | Topology spread constraints to increase the controller resilience by distributing pods across the cluster zones. If an explicit label selector is not provided one will be created from the pod selector labels. | | webhook.enabled | bool | `true` | Whether to enable the webhooks and webhook permissions. | -| webhook.logLevel | string | `"error"` | Webhook log level (Deprecated: Use logConfig.logLevel.webhook instead) | | webhook.metrics.port | int | `8001` | The container port to use for webhook metrics. | | webhook.port | int | `8443` | The container port to use for the webhook. | diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index 59f56038abed..5190ca5042d1 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -119,14 +119,6 @@ controller: # cpu: 1 # memory: 1Gi - # -- Controller outputPaths - defaults to stdout only (Deprecated: Use logConfig.outputPaths instead) - outputPaths: - - stdout - # -- Controller errorOutputPaths - defaults to stderr only (Deprecated: Use logConfig.errorOutputPaths instead) - errorOutputPaths: - - stderr - # -- Controller log level, defaults to the global log level (Deprecated: Use logConfig.logLevel.controller instead) - logLevel: "" # -- Additional volumeMounts for the controller pod. extraVolumeMounts: [] # - name: aws-iam-token @@ -145,8 +137,6 @@ controller: webhook: # -- Whether to enable the webhooks and webhook permissions. enabled: true - # -- Webhook log level (Deprecated: Use logConfig.logLevel.webhook instead) - logLevel: error # -- The container port to use for the webhook. port: 8443 metrics: @@ -154,8 +144,6 @@ webhook: port: 8001 # -- Global log level logLevel: debug -# -- Global log encoding (Deprecated: Use logConfig.logEncoding instead) -logEncoding: console # -- Log configuration (Deprecated: Logging configuration will be dropped by v1, use logLevel instead) logConfig: # -- Whether to enable provisioning and mounting the log ConfigMap @@ -206,38 +194,6 @@ settings: # -- Reserved ENIs are not included in the calculations for max-pods or kube-reserved # This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html reservedENIs: "0" - # -- AWS-specific configuration values (Deprecated: Use values without the "aws" prefix instead) - aws: - # -- Role to assume for calling AWS services. - assumeRoleARN: "" - # -- Duration of assumed credentials in minutes. Default value is 15 minutes. Not used unless aws.assumeRoleARN set. - assumeRoleDuration: 15m - # -- Cluster CA bundle for TLS configuration of provisioned nodes. If not set, this is taken from the controller's TLS configuration for the API server. 
- clusterCABundle: "" - # -- Cluster name. - clusterName: "" - # -- Cluster endpoint. If not set, will be discovered during startup (EKS only) - clusterEndpoint: "" - # -- The default instance profile to use when launching nodes - defaultInstanceProfile: "" - # -- If true then instances that support pod ENI will report a vpc.amazonaws.com/pod-eni resource - enablePodENI: false - # -- Indicates whether new nodes should use ENI-based pod density - # DEPRECATED: Use `.spec.kubeletConfiguration.maxPods` to set pod density on a per-provisioner basis - enableENILimitedPodDensity: true - # -- If true then assume we can't reach AWS services which don't have a VPC endpoint - # This also has the effect of disabling look-ups to the AWS pricing endpoint - isolatedVPC: false - # -- The VM memory overhead as a percent that will be subtracted from the total memory for all instance types - vmMemoryOverheadPercent: 0.075 - # -- interruptionQueueName is disabled if not specified. Enabling interruption handling may - # require additional permissions on the controller service account. Additional permissions are outlined in the docs. - interruptionQueueName: "" - # -- The global tags to use on all AWS infrastructure resources (launch templates, instances, etc.) across node templates - tags: - # -- Reserved ENIs are not included in the calculations for max-pods or kube-reserved - # This is most often used in the VPC CNI custom networking setup https://docs.aws.amazon.com/eks/latest/userguide/cni-custom-network.html - reservedENIs: "0" # -- Feature Gate configuration values. Feature Gates will follow the same graduation process and requirements as feature gates # in Kubernetes. More information here https://kubernetes.io/docs/reference/command-line-tools-reference/feature-gates/#feature-gates-for-alpha-or-beta-features featureGates: @@ -245,7 +201,3 @@ settings: # Setting drift to true enables the drift disruption method to watch for drift between currently deployed nodes # and the desired state of nodes set in provisioners and node templates drift: false - # -- driftEnabled is in ALPHA and is disabled by default. (Deprecated: Use featureGates.drift instead) - # Setting driftEnabled to true enables the drift disruption method to watch for drift between currently deployed nodes - # and the desired state of nodes set in provisioners and node templates - driftEnabled: false diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 5d28a9eb9b96..63a0fcdc95cc 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -111,26 +111,14 @@ Add `~/go/bin` to your $PATH, if you have not already done so. aws iam attach-role-policy --role-name "${ROLE_NAME}" --policy-arn "${POLICY_ARN}" ``` -5. Apply the v0.32.0 Custom Resource Definitions (CRDs) in the crds directory of the Karpenter helm chart. Here are the ways you can do this: - - * As an independent helm chart [karpenter-crd](https://gallery.ecr.aws/karpenter/karpenter-crd) - [source](https://github.com/aws/karpenter/blob/main/charts/karpenter-crd) that can be used by Helm to manage the lifecycle of these CRDs. 
To upgrade or install `karpenter-crd` run: - ```bash - helm upgrade --install karpenter-crd oci://public.ecr.aws/karpenter/karpenter-crd --version vx.y.z --namespace karpenter --create-namespace - ``` - - {{% alert title="Note" color="warning" %}} - - If you get the error `invalid ownership metadata; label validation error:` while installing the `karpenter-crd` chart from an older version of Karpenter, follow the [Troubleshooting Guide]({{}}) for details on how to resolve these errors. - - {{% /alert %}} - - * As part of the helm chart [karpenter](https://gallery.ecr.aws/karpenter/karpenter) - [source](https://github.com/aws/karpenter/blob/main/charts/karpenter/crds). Helm [does not manage the lifecycle of CRDs using this method](https://helm.sh/docs/chart_best_practices/custom_resource_definitions/), the tool will only install the CRD during the first installation of the helm chart. Subsequent chart upgrades will not add or remove CRDs, even if the CRDs have changed. When CRDs are changed, we will make a note in the version's upgrade guide. In general, you can reapply the CRDs in the `crds` directory of the Karpenter helm chart: +5. Apply the v0.32.0 Custom Resource Definitions (CRDs): ```bash kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodepools.yaml kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.sh_nodeclaims.yaml kubectl apply -f https://raw.githubusercontent.com/aws/karpenter{{< githubRelRef >}}pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml ``` + 6. Upgrade Karpenter to the new version: ```bash @@ -148,6 +136,10 @@ Add `~/go/bin` to your $PATH, if you have not already done so. --wait ``` + {{% alert title="Note" color="warning" %}} + Karpenter has deprecated and moved a number of Helm values as part of the v1beta1 release. Ensure that you upgrade to the newer version of these Helm values during your migration to v1beta1. You can find details for all the settings that were moved in the [v1beta1 Upgrade Reference]({{}}). + {{% /alert %}} + 7. Convert each AWSNodeTemplate to an EC2NodeClass. To convert your v1alpha Karpenter manifests to v1beta1, you can either manually apply changes to API components or use the [Karpenter conversion tool](https://github.com/aws/karpenter/tree/main/tools/karpenter-convert). See the [AWSNodeTemplate to EC2NodeClass]({{< relref "v1beta1-reference#awsnodetemplate-to-ec2nodeclass" >}}) section of the Karpenter Upgrade Reference for details on how to update to Karpenter AWSNodeTemplate objects. Here is an example of how to use the `karpenter-convert` CLI to convert an AWSNodeTemplate file to an EC2NodeClass file: @@ -234,7 +226,6 @@ If you are using some IaC for managing your policy documents attached to the con * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block.
Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. -* Karpenter now moves all AWS controller-wide configuration settings from the `settings.aws` block into the top-level `settings` block. The previous `settings.aws` block is deprecated and will be dropped at v0.33.0. ### Upgrading to v0.31.0+ diff --git a/website/content/en/preview/upgrading/v1beta1-reference.md b/website/content/en/preview/upgrading/v1beta1-reference.md index 970e1187430f..42b9c7010760 100644 --- a/website/content/en/preview/upgrading/v1beta1-reference.md +++ b/website/content/en/preview/upgrading/v1beta1-reference.md @@ -588,7 +588,6 @@ The following table shows v1alpha5 metrics and the v1beta1 version of each metri | karpenter_machines_terminated | karpenter_nodeclaims_terminated | | karpenter_provisioners_limit | karpenter_nodepools_limit | | karpenter_provisioners_usage | karpenter_nodepools_usage | -| karpenter_provisioners_usage_pct | Dropped | | karpenter_deprovisioning_evaluation_duration_seconds | karpenter_disruption_evaluation_duration_seconds | | karpenter_deprovisioning_eligible_machines | karpenter_disruption_eligible_nodeclaims | | karpenter_deprovisioning_replacement_machine_initialized_seconds | karpenter_disruption_replacement_nodeclaims_initialized_seconds | @@ -596,6 +595,7 @@ The following table shows v1alpha5 metrics and the v1beta1 version of each metri | karpenter_deprovisioning_actions_performed | karpenter_disruption_actions_performed_total | | karpenter_deprovisioning_consolidation_timeouts | karpenter_disruption_consolidation_timeouts_total | | karpenter_nodes_leases_deleted | karpenter_leases_deleted | +| karpenter_provisioners_usage_pct | **Dropped** | In addition to these metrics, the MachineNotFound error returned by the `karpenter_cloudprovider_errors_total` values in the error label has been changed to `NodeClaimNotFound`. This is agnostic to the version of the API (Machine or NodeClaim) that actually owns the instance. @@ -604,23 +604,49 @@ In addition to these metrics, the MachineNotFound error returned by the `karpent The v1beta1 specification removes the `karpenter-global-settings` ConfigMap in favor of setting all Karpenter configuration using environment variables. Along, with this change, Karpenter has chosen to remove certain global variables that can be configured with more specificity in the EC2NodeClass . These values are marked as removed below. 
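[Editor's illustration — not part of this patch.] Under v1beta1 these settings are plain environment variables on the controller container, normally rendered by the Helm chart. A minimal sketch of the mechanism, using variable names from the mapping table that follows; the namespace, deployment name, and sample values are assumptions:

```bash
# Hedged sketch: demonstrates the env-var mechanism only. In practice the
# Helm chart renders these variables, so direct edits like this would be
# overwritten by the next `helm upgrade`.
kubectl -n karpenter set env deployment/karpenter \
  BATCH_MAX_DURATION=10s \
  BATCH_IDLE_DURATION=1s \
  FEATURE_GATE="Drift=true"

# Inspect what the controller container will see at startup.
kubectl -n karpenter describe deployment karpenter | grep -A 15 'Environment:'
```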
-| **`karpenter-global-settings` ConfigMap Key** | **Environment Variable** | **CLI Argument** -|---------------------------------------------------|---------------------------------|-------------------------------| -| batchMaxDuration | BATCH_MAX_DURATION | --batch-max-duration | -| batchIdleDuration | BATCH_IDLE_DURATION | --batch-idle-duration | -| assumeRoleARN | ASSUME_ROLE_ARN | --assume-role-arn | -| assumeRoleDuration | ASSUME_ROLE_DURATION | --assume-role-duration | -| clusterCABundle | CLUSTER_CA_BUNDLE | --cluster-ca-bundle | -| clusterName | CLUSTER_NAME | --cluster-name | -| clusterEndpoint | CLUSTER_ENDPOINT | --cluster-endpoint | -| defaultInstanceProfile | Dropped | Dropped | -| enablePodENI | Dropped | Dropped | -| enableENILimitedPodDensity | Dropped | Dropped | -| isolatedVPC | ISOLATED_VPC | --isolated-vpc | -| vmMemoryOverheadPercent | VM_MEMORY_OVERHEAD_PERCENT | --vm-memory-overhead-percent | -| interruptionQueueName | INTERRUPTION_QUEUE_NAME | --interruption-queue-name | -| reservedENIs | RESERVED_ENIS | --reserved-enis | -| featureGates.enableDrift | FEATURE_GATE="Drift=true" | --feature-gates Drift=true | +| **`karpenter-global-settings` ConfigMap Key** | **Environment Variable** | **CLI Argument** | +|-----------------------------------------------|----------------------------|------------------------------| +| batchMaxDuration | BATCH_MAX_DURATION | --batch-max-duration | +| batchIdleDuration | BATCH_IDLE_DURATION | --batch-idle-duration | +| assumeRoleARN | ASSUME_ROLE_ARN | --assume-role-arn | +| assumeRoleDuration | ASSUME_ROLE_DURATION | --assume-role-duration | +| clusterCABundle | CLUSTER_CA_BUNDLE | --cluster-ca-bundle | +| clusterName | CLUSTER_NAME | --cluster-name | +| clusterEndpoint | CLUSTER_ENDPOINT | --cluster-endpoint | +| isolatedVPC | ISOLATED_VPC | --isolated-vpc | +| vmMemoryOverheadPercent | VM_MEMORY_OVERHEAD_PERCENT | --vm-memory-overhead-percent | +| interruptionQueueName | INTERRUPTION_QUEUE | --interruption-queue | +| reservedENIs | RESERVED_ENIS | --reserved-enis | +| featureGates.driftEnabled | FEATURE_GATE="Drift=true" | --feature-gates Drift=true | +| defaultInstanceProfile | **Dropped** | **Dropped** | +| enablePodENI | **Dropped** | **Dropped** | +| enableENILimitedPodDensity | **Dropped** | **Dropped** | + +## Helm Values + +The v1beta1 helm chart comes with a number of changes to the values that were previously used in v0.31.x. Your older helm values will continue to work throughout v0.32.x but any values no longer specified in the chart will no longer be supported starting in v0.33.0. 
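[Editor's illustration — not part of this patch.] As a hedged example of the migration, an upgrade command expressed with the >= v0.32.x value locations might look like the sketch below; the cluster name, queue name, and chart version are assumptions, and the key mappings come from the table that follows:

```bash
# Sketch only: each --set uses a new-style key; the old keys they replace
# are listed after the command (see the mapping table for the full list).
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --version v0.32.0 \
  --namespace karpenter \
  --set settings.clusterName="${CLUSTER_NAME}" \
  --set settings.interruptionQueue="${CLUSTER_NAME}" \
  --set settings.featureGates.drift=true \
  --set logConfig.logLevel.controller=debug
# Formerly: settings.aws.clusterName, settings.aws.interruptionQueueName,
# settings.featureGates.driftEnabled, and controller.logLevel respectively.
```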
+ +| < v0.32.x Key | >= v0.32.x Key | +|-----------------------------------------|----------------------------------| +| controller.outputPaths | logConfig.outputPaths | +| controller.errorOutputPaths | logConfig.errorOutputPaths | +| controller.logLevel | logConfig.logLevel.controller | +| webhook.logLevel | logConfig.logLevel.webhook | +| logEncoding | logConfig.logEncoding | +| settings.aws.assumeRoleARN | settings.assumeRoleARN | +| settings.aws.assumeRoleDuration | settings.assumeRoleDuration | +| settings.aws.clusterCABundle | settings.clusterCABundle | +| settings.aws.clusterName | settings.clusterName | +| settings.aws.clusterEndpoint | settings.clusterEndpoint | +| settings.aws.isolatedVPC | settings.isolatedVPC | +| settings.aws.vmMemoryOverheadPercent | settings.vmMemoryOverheadPercent | +| settings.aws.interruptionQueueName | settings.interruptionQueue | +| settings.aws.reservedENIs | settings.reservedENIs | +| settings.featureGates.driftEnabled | settings.featureGates.drift | +| settings.aws.defaultInstanceProfile | **Dropped** | +| settings.aws.enablePodENI | **Dropped** | +| settings.aws.enableENILimitedPodDensity | **Dropped** | +| settings.aws.tags | **Dropped** | ## Drift Enabled by Default From 82ea2e4e3be2a5efa46a388022f555b51cdfc470 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 24 Oct 2023 16:24:59 -0700 Subject: [PATCH 41/47] docs: Make the latest K8s version a parameter (#4918) --- website/config.yaml | 1 + website/content/en/preview/faq.md | 6 +++--- .../getting-started-with-karpenter/_index.md | 5 +++-- .../scripts/step02-create-cluster.sh | 2 +- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/website/config.yaml b/website/config.yaml index 62b0da37d6e4..05fd37b3b084 100644 --- a/website/config.yaml +++ b/website/config.yaml @@ -67,6 +67,7 @@ params: icon: fab fa-slack desc: 'Chat with us on Slack in the #aws-provider channel' latest_release_version: v0.31.1 + latest_k8s_version: 1.28 versions: - v0.31 - v0.30 diff --git a/website/content/en/preview/faq.md b/website/content/en/preview/faq.md index f951643f7027..66ea2c7d50fd 100644 --- a/website/content/en/preview/faq.md +++ b/website/content/en/preview/faq.md @@ -176,10 +176,10 @@ Yes, see the [KubeletConfiguration Section in the NodePool docs]({{}} by configuring an `amiSelector` that references the AMI name. +```yaml amiSelectorTerms: - - name: Windows_Server-2022-English-Full-EKS_Optimized-1.28* + - name: Windows_Server-2022-English-Full-EKS_Optimized-{{< param "latest_k8s_version" >}}* ``` ## Deprovisioning diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md index 924e9a74577d..479f1156e8fe 100644 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/_index.md @@ -41,10 +41,11 @@ authenticate properly by running `aws sts get-caller-identity`. ### 2. 
Set environment variables -After setting up the tools, set the Karpenter version number: +After setting up the tools, set the Karpenter and Kubernetes version: ```bash export KARPENTER_VERSION={{< param "latest_release_version" >}} +export K8S_VERSION={{< param "latest_k8s_version" >}} ``` Then set the following environment variable: @@ -56,7 +57,7 @@ If you open a new shell to run steps in this procedure, you need to set some or To remind yourself of these values, type: ```bash -echo $KARPENTER_VERSION $CLUSTER_NAME $AWS_DEFAULT_REGION $AWS_ACCOUNT_ID $TEMPOUT +echo $KARPENTER_VERSION $K8S_VERSION $CLUSTER_NAME $AWS_DEFAULT_REGION $AWS_ACCOUNT_ID $TEMPOUT ``` {{% /alert %}} diff --git a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh index 675aca0bebf1..32619fa12871 100755 --- a/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh +++ b/website/content/en/preview/getting-started/getting-started-with-karpenter/scripts/step02-create-cluster.sh @@ -12,7 +12,7 @@ kind: ClusterConfig metadata: name: ${CLUSTER_NAME} region: ${AWS_DEFAULT_REGION} - version: "1.28" + version: "${K8S_VERSION}" tags: karpenter.sh/discovery: ${CLUSTER_NAME} From e3923c432f33729a70cc044b51ce8876f3bdd17d Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Tue, 24 Oct 2023 16:44:13 -0700 Subject: [PATCH 42/47] ci: Fix helm linting on configmap (#4919) --- charts/karpenter/values.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index 5190ca5042d1..b85fe2d33615 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -166,6 +166,8 @@ logConfig: webhook: error # -- Global Settings to configure Karpenter settings: + # -- AWS-specific settings (Deprecated: The AWS block inside of settings was flattened into settings) + aws: {} # -- The maximum length of a batch window. The longer this is, the more pods we can consider for provisioning at one # time which usually results in fewer but larger nodes. batchMaxDuration: 10s From f6d1a28f6c40e026342fe9ee3659fc4da381a1e8 Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 25 Oct 2023 09:26:16 -0700 Subject: [PATCH 43/47] BREAKING: Fix helm ordering of values and set default log encoding to `json` (#4921) --- .../templates/configmap-logging.yaml | 8 +++---- charts/karpenter/templates/configmap.yaml | 2 +- charts/karpenter/templates/deployment.yaml | 24 ++++++++++--------- charts/karpenter/values.yaml | 4 ++-- .../en/preview/upgrading/upgrade-guide.md | 1 + 5 files changed, 21 insertions(+), 18 deletions(-) diff --git a/charts/karpenter/templates/configmap-logging.yaml b/charts/karpenter/templates/configmap-logging.yaml index 8570ed8ec2e3..ce0e6b8ae49e 100644 --- a/charts/karpenter/templates/configmap-logging.yaml +++ b/charts/karpenter/templates/configmap-logging.yaml @@ -14,7 +14,7 @@ data: # https://github.com/uber-go/zap/blob/aa3e73ec0896f8b066ddf668597a02f89628ee50/config.go zap-logger-config: | { - "level": "{{ or .Values.logConfig.logLevel.global .Values.logLevel }}", + "level": "{{ .Values.logConfig.logLevel.global }}", "development": false, "disableStacktrace": true, "disableCaller": true, @@ -24,7 +24,7 @@ data: }, "outputPaths": [{{ include "karpenter.outputPathsList" . }}], "errorOutputPaths": [{{ include "karpenter.errorOutputPathsList" . 
}}], - "encoding": "{{ or .Values.logConfig.logEncoding .Values.logEncoding }}", + "encoding": "{{ or .Values.logEncoding .Values.logConfig.logEncoding }}", "encoderConfig": { "timeKey": "time", "levelKey": "level", @@ -36,6 +36,6 @@ data: "timeEncoder": "iso8601" } } - loglevel.controller: {{ or .Values.logConfig.logLevel.controller .Values.controller.logLevel }} - loglevel.webhook: {{ or .Values.logConfig.logLevel.webhook .Values.webhook.logLevel }} + loglevel.controller: {{ or .Values.controller.logLevel .Values.logConfig.logLevel.controller }} + loglevel.webhook: {{ or .Values.webhook.logLevel .Values.logConfig.logLevel.webhook }} {{- end }} \ No newline at end of file diff --git a/charts/karpenter/templates/configmap.yaml b/charts/karpenter/templates/configmap.yaml index 8311a36192bf..efbc59d6626f 100644 --- a/charts/karpenter/templates/configmap.yaml +++ b/charts/karpenter/templates/configmap.yaml @@ -56,6 +56,6 @@ data: aws.reservedENIs: "{{ . }}" {{- end }} {{- with .Values.settings.featureGates.driftEnabled }} - featureGates.driftEnabled: "${{ . }}" + featureGates.driftEnabled: "{{ . }}" {{- end }} diff --git a/charts/karpenter/templates/deployment.yaml b/charts/karpenter/templates/deployment.yaml index 66a7a7812cab..ecb5d72731ff 100644 --- a/charts/karpenter/templates/deployment.yaml +++ b/charts/karpenter/templates/deployment.yaml @@ -82,8 +82,10 @@ spec: - name: DISABLE_WEBHOOK value: "true" {{- end }} + {{- with .Values.logLevel }} - name: LOG_LEVEL - value: "{{ .Values.logLevel }}" + value: "{{ . }}" + {{- end }} - name: METRICS_PORT value: "{{ .Values.controller.metrics.port }}" - name: HEALTH_PROBE_PORT @@ -99,7 +101,7 @@ spec: divisor: "0" resource: limits.memory - name: FEATURE_GATES - value: "Drift={{ or .Values.settings.featureGates.drift .Values.settings.featureGates.driftEnabled }}" + value: "Drift={{ or .Values.settings.featureGates.driftEnabled .Values.settings.featureGates.drift }}" {{- with .Values.settings.batchMaxDuration }} - name: BATCH_MAX_DURATION value: "{{ . }}" @@ -108,39 +110,39 @@ spec: - name: BATCH_IDLE_DURATION value: "{{ . }}" {{- end }} - {{- with or .Values.settings.assumeRoleARN .Values.settings.aws.assumeRoleARN }} + {{- with or .Values.settings.aws.assumeRoleARN .Values.settings.assumeRoleARN }} - name: ASSUME_ROLE_ARN value: "{{ . }}" {{- end }} - {{- with or .Values.settings.assumeRoleDuration .Values.settings.aws.assumeRoleDuration }} + {{- with or .Values.settings.aws.assumeRoleDuration .Values.settings.assumeRoleDuration }} - name: ASSUME_ROLE_DURATION value: "{{ . }}" {{- end }} - {{- with or .Values.settings.clusterCABundle .Values.settings.aws.clusterCABundle }} + {{- with or .Values.settings.aws.clusterCABundle .Values.settings.clusterCABundle }} - name: CLUSTER_CA_BUNDLE value: "{{ . }}" {{- end }} - {{- with or .Values.settings.clusterName .Values.settings.aws.clusterName }} + {{- with or .Values.settings.aws.clusterName .Values.settings.clusterName }} - name: CLUSTER_NAME value: "{{ . }}" {{- end }} - {{- with or .Values.settings.clusterEndpoint .Values.settings.aws.clusterEndpoint }} + {{- with or .Values.settings.aws.clusterEndpoint .Values.settings.clusterEndpoint }} - name: CLUSTER_ENDPOINT value: "{{ . }}" {{- end }} - {{- with or .Values.settings.isolatedVPC .Values.settings.aws.isolatedVPC }} + {{- with or .Values.settings.aws.isolatedVPC .Values.settings.isolatedVPC }} - name: ISOLATED_VPC value: "{{ . 
}}" {{- end }} - {{- with or .Values.settings.vmMemoryOverheadPercent .Values.settings.aws.vmMemoryOverheadPercent }} + {{- with or .Values.settings.aws.vmMemoryOverheadPercent .Values.settings.vmMemoryOverheadPercent }} - name: VM_MEMORY_OVERHEAD_PERCENT value: "{{ . }}" {{- end }} - {{- with or .Values.settings.interruptionQueue .Values.settings.aws.interruptionQueueName }} + {{- with or .Values.settings.aws.interruptionQueueName .Values.settings.interruptionQueue }} - name: INTERRUPTION_QUEUE value: "{{ . }}" {{- end }} - {{- with or .Values.settings.reservedENIs .Values.settings.aws.reservedENIs }} + {{- with or .Values.settings.aws.reservedENIs .Values.settings.reservedENIs }} - name: RESERVED_ENIS value: "{{ . }}" {{- end }} diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml index b85fe2d33615..33700a002f96 100644 --- a/charts/karpenter/values.yaml +++ b/charts/karpenter/values.yaml @@ -154,8 +154,8 @@ logConfig: # -- Log errorOutputPaths - defaults to stderr only errorOutputPaths: - stderr - # -- Log encoding - defaults to console - must be one of 'json', 'console' - logEncoding: console + # -- Log encoding - defaults to json - must be one of 'json', 'console' + logEncoding: json # -- Component-based log configuration logLevel: # -- Global log level, defaults to 'debug' diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index 63a0fcdc95cc..0a783e5e1091 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -226,6 +226,7 @@ If you are using some IaC for managing your policy documents attached to the con * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. +* The default log encoding changed from `console` to `json`. If you were previously not setting the type of log encoding, this default will change with the helm chart. 
If you were setting the value through `logEncoding`, this value will continue to work until v0.33.x but it is deprecated in favor of `logConfig.logEncoding` ### Upgrading to v0.31.0+ From 93484b9c4a5ef2e5b16630e479df0fd7df51cf97 Mon Sep 17 00:00:00 2001 From: Ellis Tarn Date: Wed, 25 Oct 2023 12:21:35 -0700 Subject: [PATCH 44/47] chore: Remove GITHUB_ENV in favor of GITHUB_OUTPUT (#4926) --- .github/actions/e2e/slack/notify/action.yaml | 9 +++--- .github/workflows/codegen.yaml | 5 +-- .github/workflows/e2e-upgrade.yaml | 32 ++++++++++---------- .github/workflows/e2e.yaml | 24 +++++++-------- .github/workflows/pr-snapshot.yaml | 18 +++++------ .github/workflows/release.yaml | 12 ++++---- 6 files changed, 51 insertions(+), 49 deletions(-) diff --git a/.github/actions/e2e/slack/notify/action.yaml b/.github/actions/e2e/slack/notify/action.yaml index af4aeb8ba841..143d63aa0e9e 100644 --- a/.github/actions/e2e/slack/notify/action.yaml +++ b/.github/actions/e2e/slack/notify/action.yaml @@ -17,7 +17,8 @@ runs: - uses: actions/checkout@v4 with: ref: ${{ inputs.git_ref }} - - shell: bash + - id: get-run-name + shell: bash run: | if [[ ${{ github.event_name }} == "schedule" ]]; then RUN_NAME="${{ inputs.suite }}-periodic" @@ -26,14 +27,14 @@ runs: fi # Convert the RUN_NAME to all lowercase - echo RUN_NAME=${RUN_NAME,,} >> $GITHUB_ENV + echo RUN_NAME=${RUN_NAME,,} >> $GITHUB_OUTPUT - uses: ./.github/actions/e2e/slack/send-message if: ${{ job.status == 'success' }} with: url: ${{ inputs.url }} - message: ":white_check_mark: ${{ env.RUN_NAME }} (https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})" + message: ":white_check_mark: ${{ steps.get-run-name.outputs.RUN_NAME }} (https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})" - uses: ./.github/actions/e2e/slack/send-message if: ${{ job.status == 'failure' }} with: url: ${{ inputs.url }} - message: ":x: ${{ env.RUN_NAME }} (https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})" \ No newline at end of file + message: ":x: ${{ steps.get-run-name.outputs.RUN_NAME }} (https://github.com/${{github.repository}}/actions/runs/${{github.run_id}})" \ No newline at end of file diff --git a/.github/workflows/codegen.yaml b/.github/workflows/codegen.yaml index d92447dbd44e..02e9f821ae26 100644 --- a/.github/workflows/codegen.yaml +++ b/.github/workflows/codegen.yaml @@ -29,9 +29,10 @@ jobs: env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} ENABLE_GIT_PUSH: true - - run: export APICodeGenUpdate=$((cat /tmp/codegen-updates && echo "APICodeGenUpdate=true" >> $GITHUB_ENV) || echo "false") + - id: detect-changes + run: cat /tmp/codegen-updates && echo APICodeGenUpdate=true >> $GITHUB_OUTPUT - name: Create Pull Request - if: env.APICodeGenUpdate == 'true' + if: steps.detect-changes.outputs.APICodeGenUpdate == 'true' uses: actions/github-script@v6 with: script: | diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index 8a71094b63e0..f023e9799a7b 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ b/.github/workflows/e2e-upgrade.yaml @@ -72,18 +72,18 @@ jobs: role-to-assume: arn:aws:iam::${{ vars.ACCOUNT_ID }}:role/${{ vars.ROLE_NAME }} aws-region: ${{ inputs.region }} role-duration-seconds: 21600 - - name: generate cluster name + - id: generate-cluster-name run: | CLUSTER_NAME="upgrade-$RANDOM$RANDOM" echo "Using cluster name \"$CLUSTER_NAME\"" - echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_ENV - - name: create eks cluster '${{ env.CLUSTER_NAME }}' + echo CLUSTER_NAME=$CLUSTER_NAME 
>> $GITHUB_OUTPUT + - name: create eks cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}' uses: ./.github/actions/e2e/create-cluster with: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} eksctl_version: ${{ inputs.eksctl_version }} ip_family: IPv4 # Set the value to IPv6 if IPv6 suite, else IPv4 @@ -94,7 +94,7 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ vars.PROMETHEUS_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} workspace_id: ${{ vars.WORKSPACE_ID }} git_ref: ${{ inputs.from_git_ref }} - name: install karpenter @@ -105,18 +105,18 @@ jobs: region: ${{ inputs.region }} ecr_account_id: ${{ vars.ECR_ACCOUNT_ID }} ecr_region: ${{ vars.ECR_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.from_git_ref }} - uses: actions/checkout@v4 with: ref: ${{ inputs.to_git_ref }} - - name: upgrade eks cluster '${{ env.CLUSTER_NAME }}' + - name: upgrade eks cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}' uses: ./.github/actions/e2e/create-cluster with: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} eksctl_version: ${{ inputs.eksctl_version }} ip_family: IPv4 # Set the value to IPv6 if IPv6 suite, else IPv4 @@ -127,7 +127,7 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ vars.PROMETHEUS_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} workspace_id: ${{ vars.WORKSPACE_ID }} git_ref: ${{ inputs.to_git_ref }} - name: upgrade crds @@ -136,7 +136,7 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.to_git_ref }} - name: upgrade karpenter uses: ./.github/actions/e2e/install-karpenter @@ -146,12 +146,12 @@ jobs: region: ${{ inputs.region }} ecr_account_id: ${{ vars.ECR_ACCOUNT_ID }} ecr_region: ${{ vars.ECR_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.to_git_ref }} - name: run the Upgrade test suite run: | - aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} - CLUSTER_NAME=${{ env.CLUSTER_NAME }} INTERRUPTION_QUEUE=${{ env.CLUSTER_NAME }} CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ env.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" TEST_SUITE="Beta/Integration" make e2etests + aws eks update-kubeconfig --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + CLUSTER_NAME=${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} INTERRUPTION_QUEUE=${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" TEST_SUITE="Beta/Integration" make e2etests - name: notify slack of success or failure uses: ./.github/actions/e2e/slack/notify if: 
(success() || failure()) && github.event_name != 'workflow_run' && github.event_name != 'conformance' @@ -167,15 +167,15 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} - - name: cleanup karpenter and cluster '${{ env.CLUSTER_NAME }}' resources + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + - name: cleanup karpenter and cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}' resources uses: ./.github/actions/e2e/cleanup if: always() with: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.to_git_ref }} eksctl_version: ${{ inputs.eksctl_version }} - if: always() && github.event_name == 'workflow_run' diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index 42abf5c92bfc..574866b308fa 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -100,18 +100,18 @@ jobs: run: | # Creating jitter so that we can stagger cluster creation to avoid throttling sleep $(( $RANDOM % 300 + 1 )) - - name: generate cluster name + - id: generate-cluster-name run: | CLUSTER_NAME=$(echo ${{ inputs.suite }}-$RANDOM$RANDOM | awk '{print tolower($0)}' | tr / -) echo "Using cluster name \"$CLUSTER_NAME\"" - echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_ENV - - name: create eks cluster '${{ env.CLUSTER_NAME }}' + echo CLUSTER_NAME=$CLUSTER_NAME >> $GITHUB_OUTPUT + - name: create eks cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}' uses: ./.github/actions/e2e/create-cluster with: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} eksctl_version: ${{ inputs.eksctl_version }} ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 @@ -122,7 +122,7 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ vars.PROMETHEUS_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} workspace_id: ${{ vars.WORKSPACE_ID }} git_ref: ${{ inputs.git_ref }} - name: install karpenter @@ -133,14 +133,14 @@ jobs: region: ${{ inputs.region }} ecr_account_id: ${{ vars.ECR_ACCOUNT_ID }} ecr_region: ${{ vars.ECR_REGION }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.git_ref }} - name: run the ${{ inputs.suite }} test suite run: | - aws eks update-kubeconfig --name ${{ env.CLUSTER_NAME }} + aws eks update-kubeconfig --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} TEST_SUITE="${{ inputs.suite }}" ENABLE_METRICS=${{ inputs.enable_metrics }} METRICS_REGION=${{ vars.TIMESTREAM_REGION }} GIT_REF="$(git rev-parse HEAD)" \ - CLUSTER_NAME="${{ env.CLUSTER_NAME }}" CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ env.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" \ - INTERRUPTION_QUEUE="${{ env.CLUSTER_NAME }}" make e2etests + CLUSTER_NAME="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}" CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" \ 
+ INTERRUPTION_QUEUE="${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}" make e2etests - name: notify slack of success or failure uses: ./.github/actions/e2e/slack/notify if: (success() || failure()) && github.event_name != 'workflow_run' && inputs.workflow_trigger != 'conformance' @@ -156,15 +156,15 @@ jobs: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} - - name: cleanup karpenter and cluster '${{ env.CLUSTER_NAME }}' resources + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + - name: cleanup karpenter and cluster '${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }}' resources uses: ./.github/actions/e2e/cleanup if: always() with: account_id: ${{ vars.ACCOUNT_ID }} role: ${{ vars.ROLE_NAME }} region: ${{ inputs.region }} - cluster_name: ${{ env.CLUSTER_NAME }} + cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} git_ref: ${{ inputs.git_ref }} eksctl_version: ${{ inputs.eksctl_version }} - if: always() && github.event_name == 'workflow_run' diff --git a/.github/workflows/pr-snapshot.yaml b/.github/workflows/pr-snapshot.yaml index 1aa259882ac8..461b6748e636 100644 --- a/.github/workflows/pr-snapshot.yaml +++ b/.github/workflows/pr-snapshot.yaml @@ -14,19 +14,19 @@ jobs: steps: - uses: actions/checkout@v4 - uses: ./.github/actions/download-artifact - - name: Parse artifacts and assign GA environment variables + - id: metadata run: | pr_number=$(head -n 2 /tmp/artifacts/metadata.txt | tail -n 1) pr_commit=$(tail -n 1 /tmp/artifacts/metadata.txt) - echo "PR_COMMIT=$pr_commit" >> $GITHUB_ENV - echo "PR_NUMBER=$pr_number" >> $GITHUB_ENV + echo "PR_COMMIT=$pr_commit" >> $GITHUB_OUTPUT + echo "PR_NUMBER=$pr_number" >> $GITHUB_OUTPUT - uses: actions/checkout@v4 with: - ref: ${{ env.PR_COMMIT }} + ref: ${{ steps.metadata.outputs.PR_COMMIT }} - uses: ./.github/actions/commit-status/start with: name: "${{ github.workflow }} / ${{ github.job }} (pull_request_review)" - git_ref: ${{ env.PR_COMMIT }} + git_ref: ${{ steps.metadata.outputs.PR_COMMIT }} - uses: ./.github/actions/install-deps - uses: aws-actions/configure-aws-credentials@v4.0.1 with: @@ -34,19 +34,19 @@ jobs: aws-region: ${{ vars.ECR_REGION }} - run: make snapshot env: - GH_PR_NUMBER: ${{env.PR_NUMBER}} + GH_PR_NUMBER: ${{steps.metadata.outputs.PR_NUMBER}} - uses: actions/github-script@v6 with: github-token: ${{secrets.GITHUB_TOKEN}} script: | github.rest.issues.createComment({ - issue_number: `${{env.PR_NUMBER}}`, + issue_number: `${{steps.metadata.outputs.PR_NUMBER}}`, owner: context.repo.owner, repo: context.repo.repo, - body: 'Snapshot successfully published to `oci://${{ vars.ECR_ACCOUNT_ID }}.dkr.ecr.${{ vars.ECR_REGION }}.amazonaws.com/karpenter/snapshot/karpenter:v0-${{env.PR_COMMIT}}`.' + body: 'Snapshot successfully published to `oci://${{ vars.ECR_ACCOUNT_ID }}.dkr.ecr.${{ vars.ECR_REGION }}.amazonaws.com/karpenter/snapshot/karpenter:v0-${{steps.metadata.outputs.PR_COMMIT}}`.' 
}) - if: always() uses: ./.github/actions/commit-status/end with: name: "${{ github.workflow }} / ${{ github.job }} (pull_request_review)" - git_ref: ${{ env.PR_COMMIT }} + git_ref: ${{ steps.metadata.outputs.PR_COMMIT }} diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index c8dd1d914610..360ac1325d96 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -19,10 +19,10 @@ jobs: with: repo_token: '${{ secrets.GITHUB_TOKEN }}' prerelease: false - - name: Save commit tag to environment + - id: tag run: | TAG=$(git describe --tags --exact-match) - echo "TAG=${TAG}" >> $GITHUB_ENV + echo "TAG=${TAG}" >> $GITHUB_OUTPUT - uses: ./.github/actions/install-deps - uses: aws-actions/configure-aws-credentials@v4.0.1 with: @@ -36,19 +36,19 @@ jobs: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} GITHUB_REPO: ${{ github.repository }} - name: Create PR - if: env.TAG != 'no tag' + if: steps.tag.outputs.TAG != 'no tag' uses: actions/github-script@v6 with: script: | const { repo, owner } = context.repo; const result = await github.rest.pulls.create({ - title: 'chore: Release ${{ env.TAG }}', + title: 'chore: Release ${{ steps.tag.outputs.TAG }}', owner, repo, - head: 'release-${{ env.TAG }}', + head: 'release-${{ steps.tag.outputs.TAG }}', base: 'main', body: [ - 'Stable Release Changes for ${{ env.TAG }}.', + 'Stable Release Changes for ${{ steps.tag.outputs.TAG }}.', 'Please disregard this PR if it is for a patch release.', 'Please remove the branch after merging.', 'This PR is generated by [StableRelease](https://github.com/aws/karpenter/actions/workflows/stable-release.yml).' From 6d4cd286cc82552115ab3c71c5ffb7835b098d73 Mon Sep 17 00:00:00 2001 From: Amanuel Engeda <74629455+engedaam@users.noreply.github.com> Date: Wed, 25 Oct 2023 13:59:49 -0700 Subject: [PATCH 45/47] fix: Stop firing conformance testing Notifications (#4927) --- .github/workflows/e2e-matrix.yaml | 1 + .github/workflows/e2e-upgrade.yaml | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-matrix.yaml b/.github/workflows/e2e-matrix.yaml index c9e2948f1d5a..398ec6791032 100644 --- a/.github/workflows/e2e-matrix.yaml +++ b/.github/workflows/e2e-matrix.yaml @@ -87,5 +87,6 @@ jobs: region: ${{ inputs.region }} k8s_version: ${{ inputs.k8s_version }} eksctl_version: ${{ inputs.eksctl_version }} + workflow_trigger: ${{ inputs.workflow_trigger }} secrets: SLACK_WEBHOOK_URL: ${{ secrets.SLACK_WEBHOOK_URL }} diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index f023e9799a7b..fe74965f6590 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ b/.github/workflows/e2e-upgrade.yaml @@ -42,6 +42,8 @@ on: eksctl_version: type: string default: v0.160.0-rc.0 + workflow_trigger: + type: string secrets: SLACK_WEBHOOK_URL: required: true @@ -154,7 +156,7 @@ jobs: CLUSTER_NAME=${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} INTERRUPTION_QUEUE=${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} CLUSTER_ENDPOINT="$(aws eks describe-cluster --name ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} --query "cluster.endpoint" --output text)" TEST_SUITE="Beta/Integration" make e2etests - name: notify slack of success or failure uses: ./.github/actions/e2e/slack/notify - if: (success() || failure()) && github.event_name != 'workflow_run' && github.event_name != 'conformance' + if: (success() || failure()) && github.event_name != 'workflow_run' && inputs.workflow_trigger != 'conformance' with: url: ${{ 
secrets.SLACK_WEBHOOK_URL }} suite: Upgrade From 86f098a37c5f4e37ba7c64eec1fbb370435aed4b Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 25 Oct 2023 14:42:41 -0700 Subject: [PATCH 46/47] fix: Fix bug in drift ami resolution with incompatible requirements (#4925) Co-authored-by: Amanuel Engeda --- pkg/cloudprovider/drift.go | 3 --- pkg/cloudprovider/machine_test.go | 25 +++++++++++++----- pkg/cloudprovider/nodeclaim_test.go | 23 +++++++++++++---- pkg/fake/utils.go | 2 +- test/suites/alpha/drift/suite_test.go | 37 ++++++++++++++++++++++++--- test/suites/beta/drift/suite_test.go | 35 ++++++++++++++++++++++--- 6 files changed, 102 insertions(+), 23 deletions(-) diff --git a/pkg/cloudprovider/drift.go b/pkg/cloudprovider/drift.go index 216785f7d82d..f407ff94c150 100644 --- a/pkg/cloudprovider/drift.go +++ b/pkg/cloudprovider/drift.go @@ -89,9 +89,6 @@ func (c *CloudProvider) isAMIDrifted(ctx context.Context, nodeClaim *corev1beta1 return "", fmt.Errorf("no amis exist given constraints") } mappedAMIs := amis.MapToInstanceTypes([]*cloudprovider.InstanceType{nodeInstanceType}, nodeClaim.IsMachine) - if len(mappedAMIs) == 0 { - return "", fmt.Errorf("no instance types satisfy requirements of amis %v", amis) - } if !lo.Contains(lo.Keys(mappedAMIs), instance.ImageID) { return AMIDrift, nil } diff --git a/pkg/cloudprovider/machine_test.go b/pkg/cloudprovider/machine_test.go index f177eebf32cc..dc289e817f66 100644 --- a/pkg/cloudprovider/machine_test.go +++ b/pkg/cloudprovider/machine_test.go @@ -157,7 +157,7 @@ var _ = Describe("Machine/CloudProvider", func() { }) }) Context("Machine Drift", func() { - var validAMI string + var armAMIID, amdAMIID string var validSecurityGroup string var selectedInstanceType *corecloudproivder.InstanceType var instance *ec2.Instance @@ -165,21 +165,27 @@ var _ = Describe("Machine/CloudProvider", func() { var validSubnet1 string var validSubnet2 string BeforeEach(func() { - validAMI = fake.ImageID() + armAMIID, amdAMIID = fake.ImageID(), fake.ImageID() validSecurityGroup = fake.SecurityGroupID() validSubnet1 = fake.SubnetID() validSubnet2 = fake.SubnetID() awsEnv.SSMAPI.GetParameterOutput = &ssm.GetParameterOutput{ - Parameter: &ssm.Parameter{Value: aws.String(validAMI)}, + Parameter: &ssm.Parameter{Value: aws.String(armAMIID)}, } awsEnv.EC2API.DescribeImagesOutput.Set(&ec2.DescribeImagesOutput{ Images: []*ec2.Image{ { Name: aws.String(coretest.RandomName()), - ImageId: aws.String(validAMI), + ImageId: aws.String(armAMIID), Architecture: aws.String("arm64"), CreationDate: aws.String("2022-08-15T12:00:00Z"), }, + { + Name: aws.String(coretest.RandomName()), + ImageId: aws.String(amdAMIID), + Architecture: aws.String("x86_64"), + CreationDate: aws.String("2022-08-15T12:00:00Z"), + }, }, }) nodeTemplate.Status.SecurityGroups = []v1alpha1.SecurityGroup{ @@ -205,7 +211,7 @@ var _ = Describe("Machine/CloudProvider", func() { // Create the instance we want returned from the EC2 API instance = &ec2.Instance{ - ImageId: aws.String(validAMI), + ImageId: aws.String(armAMIID), InstanceType: aws.String(selectedInstanceType.Name), SubnetId: aws.String(validSubnet1), SpotInstanceRequestId: aws.String(coretest.RandomName()), @@ -368,6 +374,13 @@ var _ = Describe("Machine/CloudProvider", func() { _, err := cloudProvider.IsDrifted(ctx, nodeclaimutil.New(machine)) Expect(err).To(HaveOccurred()) }) + It("should return drifted if the AMI no longer matches the existing machine instance type", func() { + nodeTemplate.Spec.AMISelector = map[string]string{"aws::ids": amdAMIID} + 
ExpectApplied(ctx, env.Client, nodeTemplate) + isDrifted, err := cloudProvider.IsDrifted(ctx, nodeclaimutil.New(machine)) + Expect(err).ToNot(HaveOccurred()) + Expect(isDrifted).To(Equal(cloudprovider.AMIDrift)) + }) Context("Static Drift Detection", func() { BeforeEach(func() { provisioner = test.Provisioner(coretest.ProvisionerOptions{ @@ -420,7 +433,7 @@ var _ = Describe("Machine/CloudProvider", func() { Expect(err).NotTo(HaveOccurred()) Expect(isDrifted).To(BeEmpty()) }, - Entry("AMISelector Drift", v1alpha1.AWSNodeTemplateSpec{AMISelector: map[string]string{"aws::ids": validAMI}}), + Entry("AMISelector Drift", v1alpha1.AWSNodeTemplateSpec{AMISelector: map[string]string{"aws::ids": armAMIID}}), Entry("SubnetSelector Drift", v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{SubnetSelector: map[string]string{"aws-ids": "subnet-test1"}}}), Entry("SecurityGroupSelector Drift", v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{SecurityGroupSelector: map[string]string{"sg-key": "sg-value"}}}), ) diff --git a/pkg/cloudprovider/nodeclaim_test.go b/pkg/cloudprovider/nodeclaim_test.go index 218d358d1e70..a22146fe2801 100644 --- a/pkg/cloudprovider/nodeclaim_test.go +++ b/pkg/cloudprovider/nodeclaim_test.go @@ -124,28 +124,34 @@ var _ = Describe("NodeClaim/CloudProvider", func() { }) }) Context("NodeClaim Drift", func() { - var validAMI string + var armAMIID, amdAMIID string var validSecurityGroup string var selectedInstanceType *corecloudproivder.InstanceType var instance *ec2.Instance var validSubnet1 string var validSubnet2 string BeforeEach(func() { - validAMI = fake.ImageID() + armAMIID, amdAMIID = fake.ImageID(), fake.ImageID() validSecurityGroup = fake.SecurityGroupID() validSubnet1 = fake.SubnetID() validSubnet2 = fake.SubnetID() awsEnv.SSMAPI.GetParameterOutput = &ssm.GetParameterOutput{ - Parameter: &ssm.Parameter{Value: aws.String(validAMI)}, + Parameter: &ssm.Parameter{Value: aws.String(armAMIID)}, } awsEnv.EC2API.DescribeImagesOutput.Set(&ec2.DescribeImagesOutput{ Images: []*ec2.Image{ { Name: aws.String(coretest.RandomName()), - ImageId: aws.String(validAMI), + ImageId: aws.String(armAMIID), Architecture: aws.String("arm64"), CreationDate: aws.String("2022-08-15T12:00:00Z"), }, + { + Name: aws.String(coretest.RandomName()), + ImageId: aws.String(amdAMIID), + Architecture: aws.String("x86_64"), + CreationDate: aws.String("2022-08-15T12:00:00Z"), + }, }, }) nodeClass.Status.SecurityGroups = []v1beta1.SecurityGroup{ @@ -171,7 +177,7 @@ var _ = Describe("NodeClaim/CloudProvider", func() { // Create the instance we want returned from the EC2 API instance = &ec2.Instance{ - ImageId: aws.String(validAMI), + ImageId: aws.String(armAMIID), InstanceType: aws.String(selectedInstanceType.Name), SubnetId: aws.String(validSubnet1), SpotInstanceRequestId: aws.String(coretest.RandomName()), @@ -305,6 +311,13 @@ var _ = Describe("NodeClaim/CloudProvider", func() { _, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).To(HaveOccurred()) }) + It("should return drifted if the AMI no longer matches the existing NodeClaims instance type", func() { + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: amdAMIID}} + ExpectApplied(ctx, env.Client, nodeClass) + isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) + Expect(err).ToNot(HaveOccurred()) + Expect(isDrifted).To(Equal(cloudprovider.AMIDrift)) + }) Context("Static Drift Detection", func() { DescribeTable("should return drifted if the spec is updated", func(changes v1beta1.EC2NodeClass) { diff --git a/pkg/fake/utils.go 
b/pkg/fake/utils.go index 1075c7375c80..427e38184876 100644 --- a/pkg/fake/utils.go +++ b/pkg/fake/utils.go @@ -37,7 +37,7 @@ func ProviderID(id string) string { } func ImageID() string { - return fmt.Sprintf("ami-%s", randomdata.Alphanumeric(17)) + return fmt.Sprintf("ami-%s", strings.ToLower(randomdata.Alphanumeric(17))) } func SecurityGroupID() string { return fmt.Sprintf("sg-%s", randomdata.Alphanumeric(17)) diff --git a/test/suites/alpha/drift/suite_test.go b/test/suites/alpha/drift/suite_test.go index 3fce8ca0e58a..c716100f64c1 100644 --- a/test/suites/alpha/drift/suite_test.go +++ b/test/suites/alpha/drift/suite_test.go @@ -40,12 +40,13 @@ import ( "github.com/aws/karpenter-core/pkg/apis/v1alpha5" "github.com/aws/karpenter-core/pkg/test" "github.com/aws/karpenter/pkg/apis/v1alpha1" + awstest "github.com/aws/karpenter/pkg/test" "github.com/aws/karpenter/test/pkg/environment/aws" ) var env *aws.Environment -var customAMI string +var amdAMI string func TestDrift(t *testing.T) { RegisterFailHandler(Fail) @@ -70,7 +71,7 @@ var _ = Describe("Drift", Label("AWS"), func() { var nodeTemplate *v1alpha1.AWSNodeTemplate var provisioner *v1alpha5.Provisioner BeforeEach(func() { - customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + amdAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) nodeTemplate = awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{ SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, SubnetSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName}, @@ -106,7 +107,7 @@ var _ = Describe("Drift", Label("AWS"), func() { machine := env.EventuallyExpectCreatedMachineCount("==", 1)[0] node := env.EventuallyExpectNodeCount("==", 1)[0] - nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": customAMI} + nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": amdAMI} env.ExpectCreatedOrUpdated(nodeTemplate) Eventually(func(g Gomega) { @@ -119,6 +120,34 @@ var _ = Describe("Drift", Label("AWS"), func() { env.ExpectUpdated(pod) env.EventuallyExpectNotFound(pod, machine, node) }) + It("should return drifted if the AMI no longer matches the existing machine instance type", func() { + armParameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: awssdk.String("/aws/service/eks/optimized-ami/1.28/amazon-linux-2-arm64/recommended/image_id"), + }) + Expect(err).To(BeNil()) + armAMI := *armParameter.Parameter.Value + nodeTemplate.Spec.AMIFamily = &v1alpha1.AMIFamilyAL2 + nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": armAMI} + provisioner.Spec.Requirements = append(provisioner.Spec.Requirements, v1.NodeSelectorRequirement{Key: v1.LabelArchStable, Operator: v1.NodeSelectorOpExists}) + + env.ExpectCreated(pod, nodeTemplate, provisioner) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + machine := env.EventuallyExpectCreatedMachineCount("==", 1)[0] + node := env.EventuallyExpectNodeCount("==", 1)[0] + nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": amdAMI} + env.ExpectCreatedOrUpdated(nodeTemplate) + + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(machine), machine)).To(Succeed()) + g.Expect(machine.StatusConditions().GetCondition(v1alpha5.MachineDrifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, v1alpha5.DoNotEvictPodAnnotationKey) + env.ExpectUpdated(pod) + 
env.EventuallyExpectNotFound(pod, machine, node) + }) It("should not deprovision nodes that have drifted without the featureGate enabled", func() { env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"}) // choose an old static image @@ -136,7 +165,7 @@ var _ = Describe("Drift", Label("AWS"), func() { env.ExpectCreatedNodeCount("==", 1) node := env.Monitor.CreatedNodes()[0] - nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": customAMI} + nodeTemplate.Spec.AMISelector = map[string]string{"aws-ids": amdAMI} env.ExpectUpdated(nodeTemplate) // We should consistently get the same node existing for a minute diff --git a/test/suites/beta/drift/suite_test.go b/test/suites/beta/drift/suite_test.go index b1872f270dec..f8c65a1c6c28 100644 --- a/test/suites/beta/drift/suite_test.go +++ b/test/suites/beta/drift/suite_test.go @@ -45,7 +45,7 @@ import ( ) var env *aws.Environment -var customAMI string +var amdAMI string func TestDrift(t *testing.T) { RegisterFailHandler(Fail) @@ -70,7 +70,7 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { var nodeClass *v1beta1.EC2NodeClass var nodePool *corev1beta1.NodePool BeforeEach(func() { - customAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) + amdAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1) nodeClass = awstest.EC2NodeClass(v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{ AMIFamily: &v1beta1.AMIFamilyAL2, SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{ @@ -122,7 +122,34 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] node := env.EventuallyExpectNodeCount("==", 1)[0] - nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: amdAMI}} + env.ExpectCreatedOrUpdated(nodeClass) + + Eventually(func(g Gomega) { + g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(nodeClaim), nodeClaim)).To(Succeed()) + g.Expect(nodeClaim.StatusConditions().GetCondition(corev1beta1.Drifted).IsTrue()).To(BeTrue()) + }).Should(Succeed()) + + delete(pod.Annotations, corev1beta1.DoNotDisruptAnnotationKey) + env.ExpectUpdated(pod) + env.EventuallyExpectNotFound(pod, nodeClaim, node) + }) + It("should return drifted if the AMI no longer matches the existing NodeClaims instance type", func() { + armParameter, err := env.SSMAPI.GetParameter(&ssm.GetParameterInput{ + Name: awssdk.String("/aws/service/eks/optimized-ami/1.28/amazon-linux-2-arm64/recommended/image_id"), + }) + Expect(err).To(BeNil()) + armAMI := *armParameter.Parameter.Value + nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyAL2 + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: armAMI}} + + env.ExpectCreated(pod, nodeClass, nodePool) + env.EventuallyExpectHealthy(pod) + env.ExpectCreatedNodeCount("==", 1) + + nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0] + node := env.EventuallyExpectNodeCount("==", 1)[0] + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: amdAMI}} env.ExpectCreatedOrUpdated(nodeClass) Eventually(func(g Gomega) { @@ -152,7 +179,7 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() { env.ExpectCreatedNodeCount("==", 1) node := env.Monitor.CreatedNodes()[0] - nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: customAMI}} + nodeClass.Spec.AMISelectorTerms = []v1beta1.AMISelectorTerm{{ID: amdAMI}} 
env.ExpectUpdated(nodeClass) // We should consistently get the same node existing for a minute From 652b394dc051524ea39a59e4b96e46d90204d9ca Mon Sep 17 00:00:00 2001 From: Jonathan Innis Date: Wed, 25 Oct 2023 17:03:23 -0700 Subject: [PATCH 47/47] chore: E2E Test Fixes for Beta (#4902) --- go.mod | 2 +- go.sum | 4 +- pkg/providers/instancetype/types.go | 26 ++++--- test/pkg/environment/aws/environment.go | 20 +++++ test/pkg/environment/common/environment.go | 43 ++++++++++- test/pkg/environment/common/expectations.go | 37 ++++++++-- test/suites/alpha/drift/suite_test.go | 1 + .../{expiration_test.go => suite_test.go} | 1 + test/suites/alpha/integration/cni_test.go | 7 +- .../alpha/integration/kubelet_config_test.go | 6 +- .../alpha/integration/scheduling_test.go | 26 +++++-- .../alpha/integration/utilization_test.go | 4 +- .../alpha/machine/garbage_collection_test.go | 2 + .../suites/alpha/scale/deprovisioning_test.go | 1 + test/suites/beta/chaos/suite_test.go | 32 +------- test/suites/beta/consolidation/suite_test.go | 21 ++---- test/suites/beta/drift/suite_test.go | 73 +------------------ test/suites/beta/expiration/suite_test.go | 35 +-------- test/suites/beta/integration/ami_test.go | 28 +++---- .../suites/beta/integration/daemonset_test.go | 1 + .../integration/extended_resources_test.go | 47 ++++++------ .../beta/integration/kubelet_config_test.go | 72 ++++-------------- .../beta/integration/scheduling_test.go | 49 +++++++++---- test/suites/beta/integration/subnet_test.go | 12 ++- test/suites/beta/integration/suite_test.go | 32 +------- .../beta/integration/utilization_test.go | 22 ++++-- .../beta/integration/validation_test.go | 61 +++++++--------- test/suites/beta/interruption/suite_test.go | 51 ++----------- test/suites/beta/ipv6/suite_test.go | 50 +++---------- test/suites/beta/nodeclaim/suite_test.go | 32 +------- 30 files changed, 308 insertions(+), 490 deletions(-) rename test/suites/alpha/expiration/{expiration_test.go => suite_test.go} (99%) diff --git a/go.mod b/go.mod index 525b62559f0b..9979eb5919c7 100644 --- a/go.mod +++ b/go.mod @@ -7,7 +7,7 @@ require ( github.com/PuerkitoBio/goquery v1.8.1 github.com/avast/retry-go v3.0.0+incompatible github.com/aws/aws-sdk-go v1.46.2 - github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555 + github.com/aws/karpenter-core v0.31.1-0.20231025165859-8c11172ed8cf github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c github.com/go-logr/zapr v1.2.4 github.com/imdario/mergo v0.3.16 diff --git a/go.sum b/go.sum index 804350daabef..4e93b2e64b83 100644 --- a/go.sum +++ b/go.sum @@ -57,8 +57,8 @@ github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHS github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= github.com/aws/aws-sdk-go v1.46.2 h1:XZbOmjtN1VCfEtQq7QNFsbxIqO+bB+bRhiOBjp6AzWc= github.com/aws/aws-sdk-go v1.46.2/go.mod h1:aVsgQcEevwlmQ7qHE9I3h+dtQgpqhFB+i8Phjh7fkwI= -github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555 h1:Rr+u23lxqMaubLd8JLks3L9uO8ACLSNbzID3VXIm/B4= -github.com/aws/karpenter-core v0.31.1-0.20231024212423-074467327555/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= +github.com/aws/karpenter-core v0.31.1-0.20231025165859-8c11172ed8cf h1:yCeG+MLOEsXyx43prs4jPqn31ZygfVmNiv7LaPjN/lY= +github.com/aws/karpenter-core v0.31.1-0.20231025165859-8c11172ed8cf/go.mod h1:liN81BwfVdlE5VHhgUnNZQdE+TEfO5cOYZXyR034T58= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c 
h1:oXWwIttmjYLbBKhLazG21aQvpJ3NOOr8IXhCJ/p6e/M= github.com/aws/karpenter/tools/kompat v0.0.0-20231010173459-62c25a3ea85c/go.mod h1:l/TIBsaCx/IrOr0Xvlj/cHLOf05QzuQKEZ1hx2XWmfU= github.com/benbjohnson/clock v1.1.0/go.mod h1:J11/hYXuz8f4ySSvYwY0FKfm+ezbsZBKZxNJlLklBHA= diff --git a/pkg/providers/instancetype/types.go b/pkg/providers/instancetype/types.go index 750543c3e251..bde3bcb1d184 100644 --- a/pkg/providers/instancetype/types.go +++ b/pkg/providers/instancetype/types.go @@ -54,7 +54,7 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, kc *corev1 region string, nodeClass *v1beta1.EC2NodeClass, offerings cloudprovider.Offerings) *cloudprovider.InstanceType { amiFamily := amifamily.GetAMIFamily(nodeClass.Spec.AMIFamily, &amifamily.Options{}) - return &cloudprovider.InstanceType{ + it := &cloudprovider.InstanceType{ Name: aws.StringValue(info.InstanceType), Requirements: computeRequirements(ctx, info, offerings, region, amiFamily, kc, nodeClass), Offerings: offerings, @@ -65,6 +65,10 @@ func NewInstanceType(ctx context.Context, info *ec2.InstanceTypeInfo, kc *corev1 EvictionThreshold: evictionThreshold(memory(ctx, info), ephemeralStorage(amiFamily, nodeClass.Spec.BlockDeviceMappings), amiFamily, kc), }, } + if it.Requirements.Compatible(scheduling.NewRequirements(scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)))) == nil { + it.Capacity[v1beta1.ResourcePrivateIPv4Address] = *privateIPv4Address(info) + } + return it } //nolint:gocyclo @@ -173,16 +177,15 @@ func computeCapacity(ctx context.Context, info *ec2.InstanceTypeInfo, amiFamily blockDeviceMappings []*v1beta1.BlockDeviceMapping, kc *corev1beta1.KubeletConfiguration) v1.ResourceList { resourceList := v1.ResourceList{ - v1.ResourceCPU: *cpu(info), - v1.ResourceMemory: *memory(ctx, info), - v1.ResourceEphemeralStorage: *ephemeralStorage(amiFamily, blockDeviceMappings), - v1.ResourcePods: *pods(ctx, info, amiFamily, kc), - v1beta1.ResourceAWSPodENI: *awsPodENI(aws.StringValue(info.InstanceType)), - v1beta1.ResourceNVIDIAGPU: *nvidiaGPUs(info), - v1beta1.ResourceAMDGPU: *amdGPUs(info), - v1beta1.ResourceAWSNeuron: *awsNeurons(info), - v1beta1.ResourceHabanaGaudi: *habanaGaudis(info), - v1beta1.ResourcePrivateIPv4Address: *privateIPv4Address(info), + v1.ResourceCPU: *cpu(info), + v1.ResourceMemory: *memory(ctx, info), + v1.ResourceEphemeralStorage: *ephemeralStorage(amiFamily, blockDeviceMappings), + v1.ResourcePods: *pods(ctx, info, amiFamily, kc), + v1beta1.ResourceAWSPodENI: *awsPodENI(aws.StringValue(info.InstanceType)), + v1beta1.ResourceNVIDIAGPU: *nvidiaGPUs(info), + v1beta1.ResourceAMDGPU: *amdGPUs(info), + v1beta1.ResourceAWSNeuron: *awsNeurons(info), + v1beta1.ResourceHabanaGaudi: *habanaGaudis(info), } return resourceList } @@ -315,7 +318,6 @@ func ENILimitedPods(ctx context.Context, info *ec2.InstanceTypeInfo) *resource.Q } func privateIPv4Address(info *ec2.InstanceTypeInfo) *resource.Quantity { - //https://github.com/aws/amazon-vpc-resource-controller-k8s/blob/ecbd6965a0100d9a070110233762593b16023287/pkg/provider/ip/provider.go#L297 capacity := aws.Int64Value(info.NetworkInfo.Ipv4AddressesPerInterface) - 1 return resources.Quantity(fmt.Sprint(capacity)) diff --git a/test/pkg/environment/aws/environment.go b/test/pkg/environment/aws/environment.go index f55bcbdbcaca..90cc250c9580 100644 --- a/test/pkg/environment/aws/environment.go +++ b/test/pkg/environment/aws/environment.go @@ -15,6 +15,7 @@ limitations under the License. 
 package aws
 
 import (
+    "fmt"
     "os"
     "testing"
@@ -36,7 +37,9 @@ import (
     "github.com/samber/lo"
     "k8s.io/utils/env"
 
+    "github.com/aws/karpenter/pkg/apis/v1beta1"
     "github.com/aws/karpenter/pkg/controllers/interruption"
+    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/environment/common"
 )
 
@@ -103,3 +106,20 @@ func GetTimeStreamAPI(session *session.Session) timestreamwriteiface.TimestreamW
     }
     return &NoOpTimeStreamAPI{}
 }
+
+func (env *Environment) DefaultEC2NodeClass() *v1beta1.EC2NodeClass {
+    nodeClass := test.EC2NodeClass()
+    nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyAL2
+    nodeClass.Spec.SecurityGroupSelectorTerms = []v1beta1.SecurityGroupSelectorTerm{
+        {
+            Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+        },
+    }
+    nodeClass.Spec.SubnetSelectorTerms = []v1beta1.SubnetSelectorTerm{
+        {
+            Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
+        },
+    }
+    nodeClass.Spec.Role = fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName)
+    return nodeClass
+}
diff --git a/test/pkg/environment/common/environment.go b/test/pkg/environment/common/environment.go
index bab195b7c677..9471e6081cad 100644
--- a/test/pkg/environment/common/environment.go
+++ b/test/pkg/environment/common/environment.go
@@ -26,6 +26,7 @@ import (
     "github.com/onsi/gomega"
     "github.com/samber/lo"
     v1 "k8s.io/api/core/v1"
+    "k8s.io/apimachinery/pkg/api/resource"
     "k8s.io/apimachinery/pkg/runtime"
     "k8s.io/client-go/kubernetes"
     clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -37,10 +38,12 @@ import (
     "sigs.k8s.io/controller-runtime/pkg/client"
 
     coreapis "github.com/aws/karpenter-core/pkg/apis"
-    "github.com/aws/karpenter-core/pkg/apis/v1beta1"
+    corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
     "github.com/aws/karpenter-core/pkg/operator"
     "github.com/aws/karpenter-core/pkg/operator/injection"
+    coretest "github.com/aws/karpenter-core/pkg/test"
     "github.com/aws/karpenter/pkg/apis"
+    "github.com/aws/karpenter/pkg/apis/v1beta1"
 )
 
 type ContextKey string
@@ -120,7 +123,7 @@ func NewClient(ctx context.Context, config *rest.Config) client.Client {
     lo.Must0(cache.IndexField(ctx, &v1.Node{}, "spec.taints[*].karpenter.sh/disruption", func(o client.Object) []string {
         node := o.(*v1.Node)
         t, _ := lo.Find(node.Spec.Taints, func(t v1.Taint) bool {
-            return t.Key == v1beta1.DisruptionTaintKey
+            return t.Key == corev1beta1.DisruptionTaintKey
         })
         return []string{t.Value}
     }))
@@ -135,3 +138,39 @@
     }
     return c
 }
+
+func (env *Environment) DefaultNodePool(nodeClass *v1beta1.EC2NodeClass) *corev1beta1.NodePool {
+    nodePool := coretest.NodePool()
+    nodePool.Spec.Template.Spec.NodeClassRef = &corev1beta1.NodeClassReference{
+        Name: nodeClass.Name,
+    }
+    nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{
+        {
+            Key:      v1.LabelOSStable,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{string(v1.Linux)},
+        },
+        {
+            Key:      corev1beta1.CapacityTypeLabelKey,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{corev1beta1.CapacityTypeOnDemand},
+        },
+        {
+            Key:      v1beta1.LabelInstanceCategory,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{"c", "m", "r"},
+        },
+        {
+            Key:      v1beta1.LabelInstanceGeneration,
+            Operator: v1.NodeSelectorOpGt,
+            Values:   []string{"2"},
+        },
+    }
+    nodePool.Spec.Disruption.ConsolidateAfter = &corev1beta1.NillableDuration{}
+    nodePool.Spec.Disruption.ExpireAfter.Duration = nil
+    nodePool.Spec.Limits = corev1beta1.Limits(v1.ResourceList{
+        v1.ResourceCPU:    resource.MustParse("1000"),
+        v1.ResourceMemory: resource.MustParse("1000Gi"),
+    })
+    return nodePool
+}
diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go
index d0a9905cb610..66bb34f1a2b8 100644
--- a/test/pkg/environment/common/expectations.go
+++ b/test/pkg/environment/common/expectations.go
@@ -149,6 +149,26 @@ func (env *Environment) ExpectSettingsOverridden(vars ...v1.EnvVar) {
     }
 }
 
+func (env *Environment) ExpectSettingsRemoved(vars ...v1.EnvVar) {
+    GinkgoHelper()
+
+    varNames := sets.New[string](lo.Map(vars, func(v v1.EnvVar, _ int) string { return v.Name })...)
+
+    d := &appsv1.Deployment{}
+    Expect(env.Client.Get(env.Context, types.NamespacedName{Namespace: "karpenter", Name: "karpenter"}, d)).To(Succeed())
+    Expect(d.Spec.Template.Spec.Containers).To(HaveLen(1))
+
+    stored := d.DeepCopy()
+    d.Spec.Template.Spec.Containers[0].Env = lo.Reject(d.Spec.Template.Spec.Containers[0].Env, func(v v1.EnvVar, _ int) bool {
+        return varNames.Has(v.Name)
+    })
+    if !equality.Semantic.DeepEqual(d, stored) {
+        By("removing environment variables for karpenter deployment")
+        Expect(env.Client.Patch(env.Context, d, client.MergeFrom(stored))).To(Succeed())
+        env.EventuallyExpectKarpenterRestarted()
+    }
+}
+
 // ExpectSettingsLegacy gets the karpenter-global-settings ConfigMap
 func (env *Environment) ExpectSettingsLegacy() *v1.ConfigMap {
     GinkgoHelper()
@@ -384,14 +404,17 @@ func (env *Environment) ExpectPodsMatchingSelector(selector labels.Selector) []*
     return lo.ToSlicePtr(podList.Items)
 }
 
-func (env *Environment) ExpectUniqueNodeNames(selector labels.Selector, uniqueNames int) {
+func (env *Environment) EventuallyExpectUniqueNodeNames(selector labels.Selector, uniqueNames int) {
     GinkgoHelper()
-    pods := env.Monitor.RunningPods(selector)
-    nodeNames := sets.NewString()
-    for _, pod := range pods {
-        nodeNames.Insert(pod.Spec.NodeName)
-    }
-    Expect(len(nodeNames)).To(BeNumerically("==", uniqueNames))
+
+    Eventually(func(g Gomega) {
+        pods := env.Monitor.RunningPods(selector)
+        nodeNames := sets.NewString()
+        for _, pod := range pods {
+            nodeNames.Insert(pod.Spec.NodeName)
+        }
+        g.Expect(len(nodeNames)).To(BeNumerically("==", uniqueNames))
+    }).Should(Succeed())
 }
 
 func (env *Environment) eventuallyExpectScaleDown() {
diff --git a/test/suites/alpha/drift/suite_test.go b/test/suites/alpha/drift/suite_test.go
index c716100f64c1..98c73de39946 100644
--- a/test/suites/alpha/drift/suite_test.go
+++ b/test/suites/alpha/drift/suite_test.go
@@ -88,6 +88,7 @@ var _ = Describe("Drift", Label("AWS"), func() {
                 },
             },
         },
         })
+        env.ExpectSettingsRemoved(v1.EnvVar{Name: "FEATURE_GATES"})
         env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"})
     })
     It("should deprovision nodes that have drifted due to AMIs", func() {
diff --git a/test/suites/alpha/expiration/expiration_test.go b/test/suites/alpha/expiration/suite_test.go
similarity index 99%
rename from test/suites/alpha/expiration/expiration_test.go
rename to test/suites/alpha/expiration/suite_test.go
index 15621c59d700..e0caac5efd7f 100644
--- a/test/suites/alpha/expiration/expiration_test.go
+++ b/test/suites/alpha/expiration/suite_test.go
@@ -73,6 +73,7 @@ var _ = Describe("Expiration", func() {
             ProviderRef:            &v1alpha5.MachineTemplateRef{Name: nodeTemplate.Name},
             TTLSecondsUntilExpired: ptr.Int64(30),
         })
+        env.ExpectSettingsRemoved(v1.EnvVar{Name: "FEATURE_GATES"})
         env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "false"})
     })
     It("should expire the node after the TTLSecondsUntilExpired is reached", func() {
diff --git a/test/suites/alpha/integration/cni_test.go b/test/suites/alpha/integration/cni_test.go
index a14efea66cd5..b136ea11d52c 100644
--- a/test/suites/alpha/integration/cni_test.go
+++ b/test/suites/alpha/integration/cni_test.go
@@ -21,7 +21,6 @@ import (
     "github.com/aws/aws-sdk-go/service/ec2"
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
-    "github.com/samber/lo"
     corev1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/types"
 
@@ -72,8 +71,8 @@ var _ = Describe("CNITests", func() {
         Expect(allocatablePods).To(Equal(eniLimitedPodsFor(node.Labels["node.kubernetes.io/instance-type"])))
     })
     It("should set maxPods when reservedENIs is set", func() {
+        env.ExpectSettingsRemoved(corev1.EnvVar{Name: "RESERVED_ENIS"})
         env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.reservedENIs": "1"})
-        env.ExpectSettingsOverridden(corev1.EnvVar{Name: "RESERVED_ENIS", Value: "1"})
         provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{
             AWS: v1alpha1.AWS{
                 SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName},
@@ -109,9 +108,9 @@ func reservedENIsFor(instanceType string) int64 {
     Expect(err).ToNot(HaveOccurred())
     networkInfo := *instance.InstanceTypes[0].NetworkInfo
     reservedENIs := 0
-    reservedENIsVar, ok := lo.Find(env.ExpectSettings(), func(v corev1.EnvVar) bool { return v.Name == "RESERVED_ENIS" })
+    reservedENIsStr, ok := env.ExpectSettingsLegacy().Data["aws.reservedENIs"]
     if ok {
-        reservedENIs, err = strconv.Atoi(reservedENIsVar.Value)
+        reservedENIs, err = strconv.Atoi(reservedENIsStr)
         Expect(err).ToNot(HaveOccurred())
     }
     return (*networkInfo.MaximumNetworkInterfaces-int64(reservedENIs))*(*networkInfo.Ipv4AddressesPerInterface-1) + 2
diff --git a/test/suites/alpha/integration/kubelet_config_test.go b/test/suites/alpha/integration/kubelet_config_test.go
index 1e023001214b..774d4aa4a0a5 100644
--- a/test/suites/alpha/integration/kubelet_config_test.go
+++ b/test/suites/alpha/integration/kubelet_config_test.go
@@ -214,7 +214,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 3)
-        env.ExpectUniqueNodeNames(selector, 3)
+        env.EventuallyExpectUniqueNodeNames(selector, 3)
     })
     It("should schedule pods onto separate nodes when podsPerCore is set", func() {
         provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{
@@ -268,7 +268,7 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.ExpectCreated(provisioner, provider, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 2)
-        env.ExpectUniqueNodeNames(selector, 2)
+        env.EventuallyExpectUniqueNodeNames(selector, 2)
     })
     It("should ignore podsPerCore value when Bottlerocket is used", func() {
         provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{
@@ -308,6 +308,6 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.ExpectCreated(provisioner, provider, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 1)
-        env.ExpectUniqueNodeNames(selector, 1)
+        env.EventuallyExpectUniqueNodeNames(selector, 1)
     })
 })
diff --git a/test/suites/alpha/integration/scheduling_test.go b/test/suites/alpha/integration/scheduling_test.go
index 352b0a526f85..f07fbabb3cf4 100644
--- a/test/suites/alpha/integration/scheduling_test.go
+++ b/test/suites/alpha/integration/scheduling_test.go
@@ -251,16 +251,28 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
             Image: aws.WindowsDefaultImage,
         }})
         provider.Spec.AMIFamily = &v1alpha1.AMIFamilyWindows2022
-        provisioner.Spec.Requirements = append(provisioner.Spec.Requirements, v1.NodeSelectorRequirement{
-            Key:      v1.LabelOSStable,
-            Operator: v1.NodeSelectorOpExists,
-        },
+        provisioner.Spec.Requirements = append(provisioner.Spec.Requirements,
+            v1.NodeSelectorRequirement{
+                Key:      v1.LabelOSStable,
+                Operator: v1.NodeSelectorOpExists,
+            },
+            v1.NodeSelectorRequirement{
+                Key:      v1alpha1.LabelInstanceCategory,
+                Operator: v1.NodeSelectorOpIn,
+                Values:   []string{"c", "m", "r"},
+            },
+            // TODO: remove this requirement once VPC RC rolls out m7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
+            v1.NodeSelectorRequirement{
+                Key:      v1alpha1.LabelInstanceFamily,
+                Operator: v1.NodeSelectorOpNotIn,
+                Values:   aws.ExcludedInstanceFamilies,
+            },
             v1.NodeSelectorRequirement{
                 Key: v1alpha1.LabelInstanceGeneration,
-                Operator: v1.NodeSelectorOpLt,
-                Values: []string{"7"},
-            })
+                Operator: v1.NodeSelectorOpGt,
+                Values: []string{"2"},
+            },
+        )
         env.ExpectCreated(provisioner, provider, deployment)
         env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*15, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
         env.ExpectCreatedNodeCount("==", 1)
diff --git a/test/suites/alpha/integration/utilization_test.go b/test/suites/alpha/integration/utilization_test.go
index 135e66a40268..9e7efc594b4a 100644
--- a/test/suites/alpha/integration/utilization_test.go
+++ b/test/suites/alpha/integration/utilization_test.go
@@ -15,6 +15,8 @@ limitations under the License.
 package integration_test
 
 import (
+    "time"
+
     . "github.com/onsi/ginkgo/v2"
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     "k8s.io/apimachinery/pkg/labels"
 
@@ -45,7 +47,7 @@ var _ = Describe("Utilization", Label(debug.NoWatch), Label(debug.NoEvents), fun
             PodOptions: test.PodOptions{ResourceRequirements: v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}}}})
 
         env.ExpectCreated(provisioner, provider, deployment)
-        env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+        env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*10, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
         env.ExpectCreatedNodeCount("==", int(*deployment.Spec.Replicas)) // One pod per node enforced by instance size
     })
 })
diff --git a/test/suites/alpha/machine/garbage_collection_test.go b/test/suites/alpha/machine/garbage_collection_test.go
index 3d9b140d2c15..7193ffe2dc28 100644
--- a/test/suites/alpha/machine/garbage_collection_test.go
+++ b/test/suites/alpha/machine/garbage_collection_test.go
@@ -25,6 +25,7 @@ import (
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
     "github.com/samber/lo"
+    corev1 "k8s.io/api/core/v1"
 
     "github.com/aws/karpenter-core/pkg/apis/v1alpha5"
     "github.com/aws/karpenter-core/pkg/test"
@@ -134,6 +135,7 @@ var _ = Describe("NodeClaimGarbageCollection", func() {
     })
     It("should succeed to garbage collect a Machine that was deleted without the cluster's knowledge", func() {
         // Disable the interruption queue for the garbage collection test
+        env.ExpectSettingsRemoved(corev1.EnvVar{Name: "INTERRUPTION_QUEUE"})
         env.ExpectSettingsOverriddenLegacy(map[string]string{"aws.interruptionQueueName": ""})
 
         provider := awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{
diff --git a/test/suites/alpha/scale/deprovisioning_test.go b/test/suites/alpha/scale/deprovisioning_test.go
index 482176c098b3..15caa1cedd31 100644
--- a/test/suites/alpha/scale/deprovisioning_test.go
+++ b/test/suites/alpha/scale/deprovisioning_test.go
@@ -81,6 +81,7 @@ var _ = Describe("Deprovisioning", Label(debug.NoWatch), Label(debug.NoEvents),
     var dsCount int
 
     BeforeEach(func() {
+        env.ExpectSettingsRemoved(v1.EnvVar{Name: "FEATURE_GATES"})
         env.ExpectSettingsOverriddenLegacy(map[string]string{"featureGates.driftEnabled": "true"})
         nodeTemplate = awstest.AWSNodeTemplate(v1alpha1.AWSNodeTemplateSpec{AWS: v1alpha1.AWS{
             SecurityGroupSelector: map[string]string{"karpenter.sh/discovery": env.ClusterName},
diff --git a/test/suites/beta/chaos/suite_test.go b/test/suites/beta/chaos/suite_test.go
index 6674568a0ee5..a1789dd3d502 100644
--- a/test/suites/beta/chaos/suite_test.go
+++ b/test/suites/beta/chaos/suite_test.go
@@ -39,7 +39,6 @@ import (
     coretest "github.com/aws/karpenter-core/pkg/test"
     nodeutils "github.com/aws/karpenter-core/pkg/utils/node"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
-    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/debug"
     "github.com/aws/karpenter/test/pkg/environment/aws"
 )
@@ -61,33 +60,8 @@ func TestChaos(t *testing.T) {
 
 var _ = BeforeEach(func() {
     env.BeforeEach()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                },
-            },
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
 })
 var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
@@ -98,7 +72,7 @@ var _ = Describe("Chaos", func() {
             ctx, cancel := context.WithCancel(env.Context)
             defer cancel()
 
-            nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+            nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
                 Key:      corev1beta1.CapacityTypeLabelKey,
                 Operator: v1.NodeSelectorOpIn,
                 Values:   []string{corev1beta1.CapacityTypeSpot},
diff --git a/test/suites/beta/consolidation/suite_test.go b/test/suites/beta/consolidation/suite_test.go
index e1200ef04e7d..b758827be0ad 100644
--- a/test/suites/beta/consolidation/suite_test.go
+++ b/test/suites/beta/consolidation/suite_test.go
@@ -31,7 +31,6 @@ import (
     "github.com/aws/karpenter/pkg/apis/v1beta1"
     "github.com/aws/karpenter/test/pkg/debug"
 
-    awstest "github.com/aws/karpenter/pkg/test"
     environmentaws "github.com/aws/karpenter/test/pkg/environment/aws"
     "github.com/aws/karpenter/test/pkg/environment/common"
 
@@ -55,14 +54,7 @@ func TestConsolidation(t *testing.T) {
 var nodeClass *v1beta1.EC2NodeClass
 
 var _ = BeforeEach(func() {
-    nodeClass = awstest.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily:                  &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{{Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}}},
-            SubnetSelectorTerms:        []v1beta1.SubnetSelectorTerm{{Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName}}},
-            Role:                       fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
     env.BeforeEach()
 })
 var _ = AfterEach(func() { env.Cleanup() })
@@ -324,18 +316,17 @@ var _ = Describe("Beta/Consolidation", func() {
             // Expect the node to consolidate to a spot instance as it will be a cheaper
             // instance than on-demand
             nodePool.Spec.Disruption.ConsolidateAfter = nil
-            nodePool.Spec.Template.Spec.Requirements = []v1.NodeSelectorRequirement{
-                {
+            test.ReplaceRequirements(nodePool,
+                v1.NodeSelectorRequirement{
                     Key:      corev1beta1.CapacityTypeLabelKey,
-                    Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{corev1beta1.CapacityTypeOnDemand, corev1beta1.CapacityTypeSpot},
+                    Operator: v1.NodeSelectorOpExists,
                 },
-                {
+                v1.NodeSelectorRequirement{
                     Key:      v1beta1.LabelInstanceSize,
                     Operator: v1.NodeSelectorOpIn,
                     Values:   []string{"large"},
                 },
-            }
+            )
             env.ExpectUpdated(nodePool)
 
             // Eventually expect the on-demand nodes to be consolidated into
diff --git a/test/suites/beta/drift/suite_test.go b/test/suites/beta/drift/suite_test.go
index f8c65a1c6c28..213ae93b403e 100644
--- a/test/suites/beta/drift/suite_test.go
+++ b/test/suites/beta/drift/suite_test.go
@@ -17,7 +17,6 @@ package drift_test
 import (
     "fmt"
     "sort"
-    "strings"
     "testing"
     "time"
 
@@ -34,7 +33,6 @@ import (
     awssdk "github.com/aws/aws-sdk-go/aws"
     "github.com/aws/aws-sdk-go/service/ec2"
     "github.com/aws/aws-sdk-go/service/eks"
-    "github.com/aws/aws-sdk-go/service/iam"
     "github.com/aws/aws-sdk-go/service/ssm"
 
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
@@ -46,6 +44,8 @@ var env *aws.Environment
 var amdAMI string
+var nodeClass *v1beta1.EC2NodeClass
+var nodePool *corev1beta1.NodePool
 
 func TestDrift(t *testing.T) {
     RegisterFailHandler(Fail)
@@ -60,6 +60,8 @@ var _ = BeforeEach(func() {
     env.BeforeEach()
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
 })
 var _ = AfterEach(func() { env.Cleanup() })
@@ -67,34 +69,8 @@ var _ = AfterEach(func() { env.AfterEach() })
 
 var _ = Describe("Beta/Drift", Label("AWS"), func() {
     var pod *v1.Pod
-    var nodeClass *v1beta1.EC2NodeClass
-    var nodePool *corev1beta1.NodePool
     BeforeEach(func() {
         amdAMI = env.GetCustomAMI("/aws/service/eks/optimized-ami/%s/amazon-linux-2/recommended/image_id", 1)
-        nodeClass = awstest.EC2NodeClass(v1beta1.EC2NodeClass{Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        }})
-        nodePool = test.NodePool(corev1beta1.NodePool{
-            Spec: corev1beta1.NodePoolSpec{
-                Template: corev1beta1.NodeClaimTemplate{
-                    Spec: corev1beta1.NodeClaimSpec{
-                        Requirements: []v1.NodeSelectorRequirement{{Key: corev1beta1.CapacityTypeLabelKey, Operator: v1.NodeSelectorOpIn, Values: []string{corev1beta1.CapacityTypeOnDemand}}},
-                        NodeClassRef: &corev1beta1.NodeClassReference{Name: nodeClass.Name},
-                    },
-                },
-            },
-        })
         // Add a do-not-disrupt pod so that we can check node metadata before we disrupt
         pod = test.Pod(test.PodOptions{
             ObjectMeta: metav1.ObjectMeta{
@@ -537,44 +513,3 @@ var _ = Describe("Beta/Drift", Label("AWS"), func() {
         })
     })
 })
-
-func ExpectInstanceProfileCreated(instanceProfileName *string) {
-    By("creating an instance profile")
-    createInstanceProfile := &iam.CreateInstanceProfileInput{
-        InstanceProfileName: instanceProfileName,
-        Tags: []*iam.Tag{
-            {
-                Key:   awssdk.String(test.DiscoveryLabel),
-                Value: awssdk.String(env.ClusterName),
-            },
-        },
-    }
-    By("adding the karpenter role to new instance profile")
-    _, err := env.IAMAPI.CreateInstanceProfile(createInstanceProfile)
-    Expect(ignoreAlreadyExists(err)).ToNot(HaveOccurred())
-    addInstanceProfile := &iam.AddRoleToInstanceProfileInput{
-        InstanceProfileName: instanceProfileName,
-        RoleName:            awssdk.String(fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName)),
-    }
-    _, err = env.IAMAPI.AddRoleToInstanceProfile(addInstanceProfile)
-    Expect(ignoreAlreadyContainsRole(err)).ToNot(HaveOccurred())
-}
-
-func ignoreAlreadyExists(err error) error {
-    if err != nil {
-        if strings.Contains(err.Error(), "EntityAlreadyExists") {
-            return nil
-        }
-    }
-    return err
-}
-
-func ignoreAlreadyContainsRole(err error) error {
-    if err != nil {
-        if strings.Contains(err.Error(), "Cannot exceed quota for InstanceSessionsPerInstanceProfile") {
-            return nil
-        }
-    }
-
-    return err
-}
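The suites above consistently swap `append(nodePool.Spec.Template.Spec.Requirements, ...)` for `ReplaceRequirements`. The sketch below illustrates the intent, assuming (as the call sites suggest) that the helper replaces any existing requirement with a matching key instead of stacking a second, conflicting term on top of the defaults:

```go
// DefaultNodePool pins karpenter.sh/capacity-type to on-demand; replacing
// that term (rather than appending another one) is what lets a spec widen
// the pool to spot as well. Assumed helper semantics; see karpenter-core's
// pkg/test for the actual implementation.
nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
	Key:      corev1beta1.CapacityTypeLabelKey,
	Operator: v1.NodeSelectorOpExists,
})
```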
diff --git a/test/suites/beta/expiration/suite_test.go b/test/suites/beta/expiration/suite_test.go
index 623feec43ea3..f47fcbf57dbb 100644
--- a/test/suites/beta/expiration/suite_test.go
+++ b/test/suites/beta/expiration/suite_test.go
@@ -15,7 +15,6 @@ limitations under the License.
 package expiration_test
 
 import (
-    "fmt"
     "testing"
     "time"
 
@@ -34,7 +33,6 @@ import (
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
-    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/environment/aws"
 
     coretest "github.com/aws/karpenter-core/pkg/test"
@@ -57,36 +55,9 @@ func TestExpiration(t *testing.T) {
 
 var _ = BeforeEach(func() {
     env.BeforeEach()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                },
-            },
-            Disruption: corev1beta1.Disruption{
-                ExpireAfter: corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)},
-            },
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
+    nodePool.Spec.Disruption.ExpireAfter = corev1beta1.NillableDuration{Duration: lo.ToPtr(time.Second * 30)}
 })
 
 var _ = AfterEach(func() { env.Cleanup() })
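The `EventuallyExpect*` helpers introduced earlier in this patch all follow the same Gomega polling shape; a minimal standalone sketch of it (assumed context: a Ginkgo spec with `env`, `selector`, and `expected` in scope):

```go
// Poll instead of asserting once: pods may still be rescheduling when the
// spec reaches this point, so transient states no longer fail the test.
Eventually(func(g Gomega) {
	nodeNames := sets.NewString()
	for _, pod := range env.Monitor.RunningPods(selector) {
		nodeNames.Insert(pod.Spec.NodeName)
	}
	g.Expect(nodeNames.Len()).To(BeNumerically("==", expected))
}).Should(Succeed())
```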
diff --git a/test/suites/beta/integration/ami_test.go b/test/suites/beta/integration/ami_test.go
index 070be1f4ff3c..6149449f0b63 100644
--- a/test/suites/beta/integration/ami_test.go
+++ b/test/suites/beta/integration/ami_test.go
@@ -156,18 +156,13 @@ var _ = Describe("AMI", func() {
         nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyUbuntu
         // TODO: remove requirements after Ubuntu fixes bootstrap script issue w/
         // new instance types not included in the max-pods.txt file. (https://github.com/aws/karpenter/issues/4472)
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
+        nodePool = coretest.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
                 Key:      v1beta1.LabelInstanceFamily,
                 Operator: v1.NodeSelectorOpNotIn,
                 Values:   awsenv.ExcludedInstanceFamilies,
             },
-            {
-                Key:      v1beta1.LabelInstanceCategory,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{"c", "m", "r"},
-            },
-        }...)
+        )
         pod := coretest.Pod()
         env.ExpectCreated(nodeClass, nodePool, pod)
         env.EventuallyExpectHealthy(pod)
@@ -286,23 +281,18 @@ var _ = Describe("AMI", func() {
             nodePool.Spec.Template.Spec.StartupTaints = []v1.Taint{{Key: "example.com", Value: "value", Effect: "NoSchedule"}}
 
             // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
-            nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-                {
-                    Key:      v1.LabelOSStable,
-                    Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{string(v1.Windows)},
-                },
-                {
+            nodePool = coretest.ReplaceRequirements(nodePool,
+                v1.NodeSelectorRequirement{
                     Key:      v1beta1.LabelInstanceFamily,
                     Operator: v1.NodeSelectorOpNotIn,
                     Values:   awsenv.ExcludedInstanceFamilies,
                 },
-                {
-                    Key:      v1beta1.LabelInstanceCategory,
+                v1.NodeSelectorRequirement{
+                    Key:      v1.LabelOSStable,
                     Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{"c", "m", "r"},
+                    Values:   []string{string(v1.Windows)},
                 },
-            }...)
+            )
             pod := coretest.Pod(coretest.PodOptions{
                 Image: awsenv.WindowsDefaultImage,
                 NodeSelector: map[string]string{
diff --git a/test/suites/beta/integration/daemonset_test.go b/test/suites/beta/integration/daemonset_test.go
index 3e7206814de0..34f6cf278243 100644
--- a/test/suites/beta/integration/daemonset_test.go
+++ b/test/suites/beta/integration/daemonset_test.go
@@ -38,6 +38,7 @@ var _ = Describe("DaemonSet", func() {
 
     BeforeEach(func() {
         nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized
+        nodePool.Spec.Disruption.ConsolidateAfter = nil
         priorityclass = &schedulingv1.PriorityClass{
             ObjectMeta: metav1.ObjectMeta{
                 Name: "high-priority-daemonsets",
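The `ConsolidateAfter = nil` line above matters because `DefaultNodePool` now sets `ConsolidateAfter` explicitly, and (as assumed here, based on the v1beta1 API's validation) that field may only be combined with `ConsolidationPolicyWhenEmpty`:

```go
// WhenUnderutilized consolidation cannot carry a ConsolidateAfter value,
// so the suite clears the inherited default before applying the policy.
// (Assumption about the v1beta1 validation rules; adjust if the webhook
// allows both.)
nodePool.Spec.Disruption.ConsolidationPolicy = corev1beta1.ConsolidationPolicyWhenUnderutilized
nodePool.Spec.Disruption.ConsolidateAfter = nil
```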
diff --git a/test/suites/beta/integration/extended_resources_test.go b/test/suites/beta/integration/extended_resources_test.go
index f8dc328b6fe4..94423b6bd3e1 100644
--- a/test/suites/beta/integration/extended_resources_test.go
+++ b/test/suites/beta/integration/extended_resources_test.go
@@ -55,6 +55,10 @@ var _ = Describe("Extended Resources", func() {
             },
         })
         selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+        test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1beta1.LabelInstanceCategory,
+            Operator: v1.NodeSelectorOpExists,
+        })
         env.ExpectCreated(nodeClass, nodePool, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 1)
@@ -81,6 +85,10 @@ var _ = Describe("Extended Resources", func() {
             },
         })
         selector := labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
+        test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1beta1.LabelInstanceCategory,
+            Operator: v1.NodeSelectorOpExists,
+        })
         env.ExpectCreated(nodeClass, nodePool, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 1)
@@ -92,18 +100,13 @@ var _ = Describe("Extended Resources", func() {
             env.ExpectPodENIDisabled()
         })
         // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
                 Key:      v1beta1.LabelInstanceFamily,
                 Operator: v1.NodeSelectorOpNotIn,
                 Values:   awsenv.ExcludedInstanceFamilies,
             },
-            {
-                Key:      v1beta1.LabelInstanceCategory,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{"c", "m", "r"},
-            },
-        }...)
+        )
         numPods := 1
         dep := test.Deployment(test.DeploymentOptions{
             Replicas: int32(numPods),
@@ -210,10 +213,10 @@ var _ = Describe("Extended Resources", func() {
 func ExpectNvidiaDevicePluginCreated() {
     GinkgoHelper()
     env.ExpectCreated(&appsv1.DaemonSet{
-        ObjectMeta: metav1.ObjectMeta{
+        ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
             Name:      "nvidia-device-plugin-daemonset",
             Namespace: "kube-system",
-        },
+        }),
         Spec: appsv1.DaemonSetSpec{
             Selector: &metav1.LabelSelector{
                 MatchLabels: map[string]string{
@@ -224,11 +227,11 @@ func ExpectNvidiaDevicePluginCreated() {
                 Type: appsv1.RollingUpdateDaemonSetStrategyType,
             },
             Template: v1.PodTemplateSpec{
-                ObjectMeta: metav1.ObjectMeta{
+                ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
                     Labels: map[string]string{
                         "name": "nvidia-device-plugin-ds",
                     },
-                },
+                }),
                 Spec: v1.PodSpec{
                     Tolerations: []v1.Toleration{
                         {
@@ -281,10 +284,10 @@ func ExpectNvidiaDevicePluginCreated() {
 func ExpectAMDDevicePluginCreated() {
     GinkgoHelper()
     env.ExpectCreated(&appsv1.DaemonSet{
-        ObjectMeta: metav1.ObjectMeta{
+        ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
             Name:      "amdgpu-device-plugin-daemonset",
             Namespace: "kube-system",
-        },
+        }),
         Spec: appsv1.DaemonSetSpec{
             Selector: &metav1.LabelSelector{
                 MatchLabels: map[string]string{
@@ -292,11 +295,11 @@ func ExpectAMDDevicePluginCreated() {
                 },
             },
             Template: v1.PodTemplateSpec{
-                ObjectMeta: metav1.ObjectMeta{
+                ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
                     Labels: map[string]string{
                         "name": "amdgpu-dp-ds",
                     },
-                },
+                }),
                 Spec: v1.PodSpec{
                     PriorityClassName: "system-node-critical",
                     Tolerations: []v1.Toleration{
@@ -355,15 +358,15 @@ func ExpectAMDDevicePluginCreated() {
 func ExpectHabanaDevicePluginCreated() {
     GinkgoHelper()
     env.ExpectCreated(&v1.Namespace{
-        ObjectMeta: metav1.ObjectMeta{
+        ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
             Name: "habana-system",
-        },
+        }),
     })
     env.ExpectCreated(&appsv1.DaemonSet{
-        ObjectMeta: metav1.ObjectMeta{
+        ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
             Name:      "habanalabs-device-plugin-daemonset",
             Namespace: "habana-system",
-        },
+        }),
         Spec: appsv1.DaemonSetSpec{
             Selector: &metav1.LabelSelector{
                 MatchLabels: map[string]string{
@@ -374,14 +377,14 @@
                 Type: appsv1.RollingUpdateDaemonSetStrategyType,
             },
             Template: v1.PodTemplateSpec{
-                ObjectMeta: metav1.ObjectMeta{
+                ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
                     Annotations: map[string]string{
                         "scheduler.alpha.kubernetes.io/critical-pod": "",
                     },
                     Labels: map[string]string{
                         "name": "habanalabs-device-plugin-ds",
                     },
-                },
+                }),
                 Spec: v1.PodSpec{
                     Tolerations: []v1.Toleration{
                         {
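The device-plugin manifests above are now wrapped in `test.ObjectMeta`. The assumption behind this sketch is that the helper stamps the shared test metadata (such as the discovery label that `env.Cleanup` selects on) onto objects a suite creates, so they are garbage-collected between specs:

```go
// Hypothetical minimal use of the wrapper: same Name/Namespace as before,
// plus whatever defaults test.ObjectMeta layers on for cleanup.
ds := &appsv1.DaemonSet{
	ObjectMeta: test.ObjectMeta(metav1.ObjectMeta{
		Name:      "nvidia-device-plugin-daemonset",
		Namespace: "kube-system",
	}),
}
```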
diff --git a/test/suites/beta/integration/kubelet_config_test.go b/test/suites/beta/integration/kubelet_config_test.go
index 1a7ac9c23403..ff95d7bf58da 100644
--- a/test/suites/beta/integration/kubelet_config_test.go
+++ b/test/suites/beta/integration/kubelet_config_test.go
@@ -82,27 +82,6 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
     DescribeTable("Linux AMIFamilies",
         func(amiFamily *string) {
             nodeClass.Spec.AMIFamily = amiFamily
-            // Need to enable nodepool-level OS-scoping for now since DS evaluation is done off of the nodepool
-            // requirements, not off of the instance type options so scheduling can fail if nodepools aren't
-            // properly scoped
-            // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
-            nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-                {
-                    Key:      v1beta1.LabelInstanceFamily,
-                    Operator: v1.NodeSelectorOpNotIn,
-                    Values:   aws.ExcludedInstanceFamilies,
-                },
-                {
-                    Key:      v1beta1.LabelInstanceCategory,
-                    Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{"c", "m", "r"},
-                },
-                {
-                    Key:      v1.LabelOSStable,
-                    Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{string(v1.Linux)},
-                },
-            }...)
             pod := test.Pod(test.PodOptions{
                 NodeSelector: map[string]string{
                     v1.LabelOSStable: string(v1.Linux),
@@ -129,23 +108,18 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
             // requirements, not off of the instance type options so scheduling can fail if nodepool aren't
             // properly scoped
             // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.*, c7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
-            nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-                {
+            test.ReplaceRequirements(nodePool,
+                v1.NodeSelectorRequirement{
                     Key:      v1beta1.LabelInstanceFamily,
                     Operator: v1.NodeSelectorOpNotIn,
                     Values:   aws.ExcludedInstanceFamilies,
                 },
-                {
-                    Key:      v1beta1.LabelInstanceCategory,
-                    Operator: v1.NodeSelectorOpIn,
-                    Values:   []string{"c", "m", "r"},
-                },
-                {
+                v1.NodeSelectorRequirement{
                     Key:      v1.LabelOSStable,
                     Operator: v1.NodeSelectorOpIn,
                     Values:   []string{string(v1.Windows)},
                 },
-            }...)
+            )
             pod := test.Pod(test.PodOptions{
                 Image: aws.WindowsDefaultImage,
                 NodeSelector: map[string]string{
@@ -162,15 +136,6 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
     )
     })
     It("should schedule pods onto separate nodes when maxPods is set", func() {
-        // MaxPods needs to account for the daemonsets that will run on the nodes
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1.LabelOSStable,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{string(v1.Linux)},
-            },
-        }...)
-
         // Get the DS pod count and use it to calculate the DS pod overhead
         dsCount := env.GetDaemonSetCount(nodePool)
         nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{
@@ -194,23 +159,18 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 3)
-        env.ExpectUniqueNodeNames(selector, 3)
+        env.EventuallyExpectUniqueNodeNames(selector, 3)
     })
     It("should schedule pods onto separate nodes when podsPerCore is set", func() {
         // PodsPerCore needs to account for the daemonsets that will run on the nodes
         // This will have 4 pods available on each node (2 taken by daemonset pods)
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
                 Key:      v1beta1.LabelInstanceCPU,
                 Operator: v1.NodeSelectorOpIn,
                 Values:   []string{"2"},
             },
-            {
-                Key:      v1.LabelOSStable,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{string(v1.Linux)},
-            },
-        }...)
+        )
         numPods := 4
         dep := test.Deployment(test.DeploymentOptions{
             Replicas: int32(numPods),
@@ -241,24 +201,20 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.ExpectCreated(nodeClass, nodePool, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 2)
-        env.ExpectUniqueNodeNames(selector, 2)
+        env.EventuallyExpectUniqueNodeNames(selector, 2)
     })
     It("should ignore podsPerCore value when Bottlerocket is used", func() {
         nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyBottlerocket
         // All pods should schedule to a single node since we are ignoring podsPerCore value
         // This would normally schedule to 3 nodes if not using Bottlerocket
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
                 Key:      v1beta1.LabelInstanceCPU,
                 Operator: v1.NodeSelectorOpIn,
                 Values:   []string{"2"},
             },
-            {
-                Key:      v1.LabelOSStable,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{string(v1.Linux)},
-            },
-        }...)
+        )
+
         nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{PodsPerCore: ptr.Int32(1)}
         numPods := 6
         dep := test.Deployment(test.DeploymentOptions{
@@ -277,6 +233,6 @@ var _ = Describe("KubeletConfiguration Overrides", func() {
         env.ExpectCreated(nodeClass, nodePool, dep)
         env.EventuallyExpectHealthyPodCount(selector, numPods)
         env.ExpectCreatedNodeCount("==", 1)
-        env.ExpectUniqueNodeNames(selector, 1)
+        env.EventuallyExpectUniqueNodeNames(selector, 1)
     })
 })
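The maxPods spec above sizes the limit off the daemonset overhead; since the exact expression falls outside the hunk, the sketch below shows the assumed shape of that calculation:

```go
// One test pod per node, plus room for the daemonsets that land everywhere.
// (Illustrative assumption; the real MaxPods expression is elided from the
// hunk above.)
dsCount := env.GetDaemonSetCount(nodePool)
nodePool.Spec.Template.Spec.Kubelet = &corev1beta1.KubeletConfiguration{
	MaxPods: ptr.Int32(1 + int32(dsCount)),
}
```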
diff --git a/test/suites/beta/integration/scheduling_test.go b/test/suites/beta/integration/scheduling_test.go
index 39013fb6c27b..972e37aae3e3 100644
--- a/test/suites/beta/integration/scheduling_test.go
+++ b/test/suites/beta/integration/scheduling_test.go
@@ -37,6 +37,19 @@ import (
 
 var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
     var selectors sets.Set[string]
 
+    BeforeEach(func() {
+        // Make the NodePool requirements fully flexible, so we can match well-known label keys
+        nodePool = test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
+                Key:      v1beta1.LabelInstanceCategory,
+                Operator: v1.NodeSelectorOpExists,
+            },
+            v1.NodeSelectorRequirement{
+                Key:      v1beta1.LabelInstanceGeneration,
+                Operator: v1.NodeSelectorOpExists,
+            },
+        )
+    })
     BeforeAll(func() {
         selectors = sets.New[string]()
     })
@@ -234,34 +247,28 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
         }})
         nodeClass.Spec.AMIFamily = &v1beta1.AMIFamilyWindows2022
         // TODO: remove this requirement once VPC RC rolls out m7a.*, r7a.* ENI data (https://github.com/aws/karpenter/issues/4472)
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
                 Key:      v1beta1.LabelInstanceFamily,
                 Operator: v1.NodeSelectorOpNotIn,
                 Values:   aws.ExcludedInstanceFamilies,
             },
-            {
-                Key:      v1beta1.LabelInstanceCategory,
+            v1.NodeSelectorRequirement{
+                Key:      v1.LabelOSStable,
                 Operator: v1.NodeSelectorOpIn,
-                Values:   []string{"c", "m", "r"},
+                Values:   []string{string(v1.Windows)},
             },
-        }...)
+        )
         env.ExpectCreated(nodeClass, nodePool, deployment)
         env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*15, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
         env.ExpectCreatedNodeCount("==", 1)
     })
     It("should support the node-restriction.kubernetes.io label domain", func() {
         // Assign labels to the nodepool so that it has known values
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1.LabelNamespaceNodeRestriction + "/team",
-                Operator: v1.NodeSelectorOpExists,
-            },
-            {
-                Key:      v1.LabelNamespaceNodeRestriction + "/custom-label",
-                Operator: v1.NodeSelectorOpExists,
-            },
-        }...)
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{Key: v1.LabelNamespaceNodeRestriction + "/team", Operator: v1.NodeSelectorOpExists},
+            v1.NodeSelectorRequirement{Key: v1.LabelNamespaceNodeRestriction + "/custom-label", Operator: v1.NodeSelectorOpExists},
+        )
         nodeSelector := map[string]string{
             v1.LabelNamespaceNodeRestriction + "/team":         "team-1",
             v1.LabelNamespaceNodeRestriction + "/custom-label": "custom-value",
@@ -348,6 +355,11 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
                 Name: nodeClass.Name,
             },
             Requirements: []v1.NodeSelectorRequirement{
+                {
+                    Key:      v1.LabelOSStable,
+                    Operator: v1.NodeSelectorOpIn,
+                    Values:   []string{string(v1.Linux)},
+                },
                 {
                     Key:      v1.LabelInstanceTypeStable,
                     Operator: v1.NodeSelectorOpIn,
@@ -367,6 +379,11 @@ var _ = Describe("Scheduling", Ordered, ContinueOnFailure, func() {
                 Name: nodeClass.Name,
             },
             Requirements: []v1.NodeSelectorRequirement{
+                {
+                    Key:      v1.LabelOSStable,
+                    Operator: v1.NodeSelectorOpIn,
+                    Values:   []string{string(v1.Linux)},
+                },
                 {
                     Key:      v1.LabelInstanceTypeStable,
                     Operator: v1.NodeSelectorOpIn,
diff --git a/test/suites/beta/integration/subnet_test.go b/test/suites/beta/integration/subnet_test.go
index af5eb321bea2..b060ea044642 100644
--- a/test/suites/beta/integration/subnet_test.go
+++ b/test/suites/beta/integration/subnet_test.go
@@ -98,13 +98,11 @@ var _ = Describe("Subnets", func() {
         Expect(len(subnets)).ToNot(Equal(0))
         shuffledAZs := lo.Shuffle(lo.Keys(subnets))
 
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1.LabelZoneFailureDomainStable,
-                Operator: "In",
-                Values:   []string{shuffledAZs[0]},
-            },
-        }...)
+        test.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1.LabelZoneFailureDomainStable,
+            Operator: "In",
+            Values:   []string{shuffledAZs[0]},
+        })
 
         pod := test.Pod()
         env.ExpectCreated(pod, nodeClass, nodePool)
diff --git a/test/suites/beta/integration/suite_test.go b/test/suites/beta/integration/suite_test.go
index 26d7a8c62fc1..bedf27a93ae9 100644
--- a/test/suites/beta/integration/suite_test.go
+++ b/test/suites/beta/integration/suite_test.go
@@ -15,16 +15,13 @@ limitations under the License.
 package integration_test
 
 import (
-    "fmt"
     "testing"
 
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
 
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
-    coretest "github.com/aws/karpenter-core/pkg/test"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
-    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/environment/aws"
 )
 
@@ -45,33 +42,8 @@ func TestIntegration(t *testing.T) {
 
 var _ = BeforeEach(func() {
     env.BeforeEach()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                },
-            },
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
 })
 var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
diff --git a/test/suites/beta/integration/utilization_test.go b/test/suites/beta/integration/utilization_test.go
index a84084cdbe90..005f092035b2 100644
--- a/test/suites/beta/integration/utilization_test.go
+++ b/test/suites/beta/integration/utilization_test.go
@@ -15,29 +15,37 @@ limitations under the License.
 package integration_test
 
 import (
+    "time"
+
     . "github.com/onsi/ginkgo/v2"
     v1 "k8s.io/api/core/v1"
     "k8s.io/apimachinery/pkg/api/resource"
     "k8s.io/apimachinery/pkg/labels"
 
     "github.com/aws/karpenter-core/pkg/test"
+    "github.com/aws/karpenter/pkg/apis/v1beta1"
     "github.com/aws/karpenter/test/pkg/debug"
 )
 
 var _ = Describe("Utilization", Label(debug.NoWatch), Label(debug.NoEvents), func() {
     It("should provision one pod per node", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
-            Key:      v1.LabelInstanceTypeStable,
-            Operator: v1.NodeSelectorOpIn,
-            Values:   []string{"t3a.small"},
-        })
-
+        test.ReplaceRequirements(nodePool,
+            v1.NodeSelectorRequirement{
+                Key:      v1.LabelInstanceTypeStable,
+                Operator: v1.NodeSelectorOpIn,
+                Values:   []string{"t3a.small"},
+            },
+            v1.NodeSelectorRequirement{
+                Key:      v1beta1.LabelInstanceCategory,
+                Operator: v1.NodeSelectorOpExists,
+            },
+        )
         deployment := test.Deployment(test.DeploymentOptions{
             Replicas: 100,
             PodOptions: test.PodOptions{ResourceRequirements: v1.ResourceRequirements{Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("1.5")}}}})
 
         env.ExpectCreated(nodeClass, nodePool, deployment)
-        env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
+        env.EventuallyExpectHealthyPodCountWithTimeout(time.Minute*10, labels.SelectorFromSet(deployment.Spec.Selector.MatchLabels), int(*deployment.Spec.Replicas))
         env.ExpectCreatedNodeCount("==", int(*deployment.Spec.Replicas)) // One pod per node enforced by instance size
     })
 })
diff --git a/test/suites/beta/integration/validation_test.go b/test/suites/beta/integration/validation_test.go
index 593fc98f00f6..b6595b5c7fb4 100644
--- a/test/suites/beta/integration/validation_test.go
+++ b/test/suites/beta/integration/validation_test.go
@@ -26,6 +26,7 @@ import (
     . "github.com/onsi/gomega"
 
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
+    coretest "github.com/aws/karpenter-core/pkg/test"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
 )
 
@@ -50,53 +51,43 @@ var _ = Describe("Validation", func() {
         Expect(env.Client.Create(env.Context, nodePool)).To(Succeed())
     })
     It("should error when a requirement references a restricted label (karpenter.sh/nodepool)", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      corev1beta1.NodePoolLabelKey,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{"default"},
-            },
-        }...)
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      corev1beta1.NodePoolLabelKey,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{"default"},
+        })
         Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
     })
     It("should error when a requirement uses In but has no values", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1.LabelInstanceTypeStable,
-                Operator: v1.NodeSelectorOpIn,
-                Values:   []string{},
-            },
-        }...)
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1.LabelInstanceTypeStable,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{},
+        })
         Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
     })
     It("should error when a requirement uses an unknown operator", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      corev1beta1.CapacityTypeLabelKey,
-                Operator: "within",
-                Values:   []string{corev1beta1.CapacityTypeSpot},
-            },
-        }...)
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      corev1beta1.CapacityTypeLabelKey,
+            Operator: "within",
+            Values:   []string{corev1beta1.CapacityTypeSpot},
+        })
         Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
     })
     It("should error when Gt is used with multiple integer values", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1beta1.LabelInstanceMemory,
-                Operator: v1.NodeSelectorOpGt,
-                Values:   []string{"1000000", "2000000"},
-            },
-        }...)
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1beta1.LabelInstanceMemory,
+            Operator: v1.NodeSelectorOpGt,
+            Values:   []string{"1000000", "2000000"},
+        })
         Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
     })
     It("should error when Lt is used with multiple integer values", func() {
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, []v1.NodeSelectorRequirement{
-            {
-                Key:      v1beta1.LabelInstanceMemory,
-                Operator: v1.NodeSelectorOpLt,
-                Values:   []string{"1000000", "2000000"},
-            },
-        }...)
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
+            Key:      v1beta1.LabelInstanceMemory,
+            Operator: v1.NodeSelectorOpLt,
+            Values:   []string{"1000000", "2000000"},
+        })
         Expect(env.Client.Create(env.Context, nodePool)).ToNot(Succeed())
     })
     It("should error when ttlSecondAfterEmpty is negative", func() {
diff --git a/test/suites/beta/interruption/suite_test.go b/test/suites/beta/interruption/suite_test.go
index 06e4a308de77..0e3da9fe5830 100644
--- a/test/suites/beta/interruption/suite_test.go
+++ b/test/suites/beta/interruption/suite_test.go
@@ -21,6 +21,7 @@ import (
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
+    "github.com/samber/lo"
     v1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
     "k8s.io/apimachinery/pkg/labels"
 
@@ -32,6 +33,7 @@ import (
     "github.com/aws/karpenter/pkg/apis/v1beta1"
     "github.com/aws/karpenter/pkg/controllers/interruption/messages"
     "github.com/aws/karpenter/pkg/controllers/interruption/messages/scheduledchange"
+    "github.com/aws/karpenter/pkg/operator/options"
     "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/pkg/utils"
     "github.com/aws/karpenter/test/pkg/environment/aws"
@@ -53,35 +55,13 @@ func TestInterruption(t *testing.T) {
 }
 
 var _ = BeforeEach(func() {
+    env.Context = options.ToContext(env.Context, test.Options(test.OptionsFields{
+        InterruptionQueue: lo.ToPtr(env.InterruptionQueue),
+    }))
     env.BeforeEach()
     env.ExpectQueueExists()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                },
-            },
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
 })
 var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
 
@@ -89,7 +69,7 @@ var _ = Describe("Interruption", Label("AWS"), func() {
     It("should terminate the spot instance and spin-up a new node on spot interruption warning", func() {
         By("Creating a single healthy node with a healthy deployment")
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
+        nodePool = coretest.ReplaceRequirements(nodePool, v1.NodeSelectorRequirement{
             Key:      corev1beta1.CapacityTypeLabelKey,
             Operator: v1.NodeSelectorOpIn,
             Values:   []string{corev1beta1.CapacityTypeSpot},
@@ -128,11 +108,6 @@ var _ = Describe("Interruption", Label("AWS"), func() {
     })
     It("should terminate the node at the API server when the EC2 instance is stopped", func() {
         By("Creating a single healthy node with a healthy deployment")
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
-            Key:      corev1beta1.CapacityTypeLabelKey,
-            Operator: v1.NodeSelectorOpIn,
-            Values:   []string{corev1beta1.CapacityTypeOnDemand},
-        })
         numPods := 1
         dep := coretest.Deployment(coretest.DeploymentOptions{
             Replicas: int32(numPods),
@@ -159,11 +134,6 @@ var _ = Describe("Interruption", Label("AWS"), func() {
     })
     It("should terminate the node at the API server when the EC2 instance is terminated", func() {
         By("Creating a single healthy node with a healthy deployment")
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
-            Key:      corev1beta1.CapacityTypeLabelKey,
-            Operator: v1.NodeSelectorOpIn,
-            Values:   []string{corev1beta1.CapacityTypeOnDemand},
-        })
         numPods := 1
         dep := coretest.Deployment(coretest.DeploymentOptions{
             Replicas: int32(numPods),
@@ -190,11 +160,6 @@ var _ = Describe("Interruption", Label("AWS"), func() {
     })
     It("should terminate the node when receiving a scheduled change health event", func() {
         By("Creating a single healthy node with a healthy deployment")
-        nodePool.Spec.Template.Spec.Requirements = append(nodePool.Spec.Template.Spec.Requirements, v1.NodeSelectorRequirement{
-            Key:      corev1beta1.CapacityTypeLabelKey,
-            Operator: v1.NodeSelectorOpIn,
-            Values:   []string{corev1beta1.CapacityTypeOnDemand},
-        })
         numPods := 1
         dep := coretest.Deployment(coretest.DeploymentOptions{
             Replicas: int32(numPods),
diff --git a/test/suites/beta/ipv6/suite_test.go b/test/suites/beta/ipv6/suite_test.go
index 36507ba5f82c..afc5c960a67b 100644
--- a/test/suites/beta/ipv6/suite_test.go
+++ b/test/suites/beta/ipv6/suite_test.go
@@ -15,7 +15,6 @@ limitations under the License.
 package ipv6_test
 
 import (
-    "fmt"
     "net"
     "testing"
 
@@ -27,7 +26,6 @@ import (
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
     coretest "github.com/aws/karpenter-core/pkg/test"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
-    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/environment/aws"
 )
 
@@ -48,45 +46,19 @@ func TestIPv6(t *testing.T) {
 
 var _ = BeforeEach(func() {
     env.BeforeEach()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
+    nodePool = coretest.ReplaceRequirements(nodePool,
+        v1.NodeSelectorRequirement{
+            Key:      v1beta1.LabelInstanceCategory,
+            Operator: v1.NodeSelectorOpExists,
         },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                    Requirements: []v1.NodeSelectorRequirement{
-                        {
-                            Key:      v1.LabelInstanceTypeStable,
-                            Operator: v1.NodeSelectorOpIn,
-                            Values:   []string{"t3a.small"},
-                        },
-                        {
-                            Key:      corev1beta1.CapacityTypeLabelKey,
-                            Operator: v1.NodeSelectorOpIn,
-                            Values:   []string{"on-demand"},
-                        },
-                    },
-                },
-            },
+        v1.NodeSelectorRequirement{
+            Key:      v1.LabelInstanceTypeStable,
+            Operator: v1.NodeSelectorOpIn,
+            Values:   []string{"t3a.small"},
         },
-    })
+    )
 })
 var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
diff --git a/test/suites/beta/nodeclaim/suite_test.go b/test/suites/beta/nodeclaim/suite_test.go
index 2d8b16c45dc5..d7e02f3bb264 100644
--- a/test/suites/beta/nodeclaim/suite_test.go
+++ b/test/suites/beta/nodeclaim/suite_test.go
@@ -15,16 +15,13 @@ limitations under the License.
 package nodeclaim_test
 
 import (
-    "fmt"
     "testing"
 
     . "github.com/onsi/ginkgo/v2"
     . "github.com/onsi/gomega"
 
     corev1beta1 "github.com/aws/karpenter-core/pkg/apis/v1beta1"
-    coretest "github.com/aws/karpenter-core/pkg/test"
     "github.com/aws/karpenter/pkg/apis/v1beta1"
-    "github.com/aws/karpenter/pkg/test"
     "github.com/aws/karpenter/test/pkg/environment/aws"
 )
 
@@ -45,33 +42,8 @@ func TestNodeClaim(t *testing.T) {
 
 var _ = BeforeEach(func() {
     env.BeforeEach()
-    nodeClass = test.EC2NodeClass(v1beta1.EC2NodeClass{
-        Spec: v1beta1.EC2NodeClassSpec{
-            AMIFamily: &v1beta1.AMIFamilyAL2,
-            SecurityGroupSelectorTerms: []v1beta1.SecurityGroupSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            SubnetSelectorTerms: []v1beta1.SubnetSelectorTerm{
-                {
-                    Tags: map[string]string{"karpenter.sh/discovery": env.ClusterName},
-                },
-            },
-            Role: fmt.Sprintf("KarpenterNodeRole-%s", env.ClusterName),
-        },
-    })
-    nodePool = coretest.NodePool(corev1beta1.NodePool{
-        Spec: corev1beta1.NodePoolSpec{
-            Template: corev1beta1.NodeClaimTemplate{
-                Spec: corev1beta1.NodeClaimSpec{
-                    NodeClassRef: &corev1beta1.NodeClassReference{
-                        Name: nodeClass.Name,
-                    },
-                },
-            },
-        },
-    })
+    nodeClass = env.DefaultEC2NodeClass()
+    nodePool = env.DefaultNodePool(nodeClass)
 })
 var _ = AfterEach(func() { env.Cleanup() })
 var _ = AfterEach(func() { env.AfterEach() })
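For reference, the interruption suite above now injects the queue name through the beta options context rather than the legacy settings ConfigMap; the relevant wiring is:

```go
// Make the suite's controller-side lookups agree with the deployment: the
// queue name comes from the options context, not karpenter-global-settings.
env.Context = options.ToContext(env.Context, test.Options(test.OptionsFields{
	InterruptionQueue: lo.ToPtr(env.InterruptionQueue),
}))
```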