diff --git a/.github/actions/e2e/cleanup/action.yaml b/.github/actions/e2e/cleanup/action.yaml index 7237012066c6..3f053d8449a0 100644 --- a/.github/actions/e2e/cleanup/action.yaml +++ b/.github/actions/e2e/cleanup/action.yaml @@ -24,7 +24,7 @@ inputs: runs: using: "composite" steps: - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - uses: ./.github/actions/e2e/install-eksctl diff --git a/.github/actions/e2e/install-karpenter/action.yaml b/.github/actions/e2e/install-karpenter/action.yaml index 137f6237bb26..67392858d70f 100644 --- a/.github/actions/e2e/install-karpenter/action.yaml +++ b/.github/actions/e2e/install-karpenter/action.yaml @@ -30,7 +30,7 @@ inputs: runs: using: "composite" steps: - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - uses: ./.github/actions/e2e/install-helm diff --git a/.github/actions/e2e/install-prometheus/action.yaml b/.github/actions/e2e/install-prometheus/action.yaml index f80dd138c8a6..b52721b75ca4 100644 --- a/.github/actions/e2e/install-prometheus/action.yaml +++ b/.github/actions/e2e/install-prometheus/action.yaml @@ -27,7 +27,7 @@ inputs: runs: using: "composite" steps: - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - uses: ./.github/actions/e2e/install-helm diff --git a/.github/actions/e2e/run-tests-private-cluster/action.yaml b/.github/actions/e2e/run-tests-private-cluster/action.yaml index 64504b9b6d58..8203553ebea4 100644 --- a/.github/actions/e2e/run-tests-private-cluster/action.yaml +++ b/.github/actions/e2e/run-tests-private-cluster/action.yaml @@ -125,6 +125,10 @@ runs: - kubectl delete ec2nodeclass --all - kubectl delete deployment --all - PRIVATE_CLUSTER=$CLUSTER_NAME TEST_SUITE=$SUITE ENABLE_METRICS=$ENABLE_METRICS METRICS_REGION=$METRICS_REGION GIT_REF="$(git rev-parse HEAD)" CLUSTER_NAME=$CLUSTER_NAME CLUSTER_ENDPOINT="$(aws eks describe-cluster --name $CLUSTER_NAME --query "cluster.endpoint" --output text)" INTERRUPTION_QUEUE=$CLUSTER_NAME make e2etests + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/application --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/dataplane --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/host --retention-in-days 30 + - aws logs put-retention-policy --log-group-name /aws/containerinsights/$CLUSTER_NAME/performance --retention-in-days 30 post_build: commands: # Describe karpenter pods diff --git a/.github/actions/e2e/setup-cluster/action.yaml b/.github/actions/e2e/setup-cluster/action.yaml index c829662294f0..aec619949048 100644 --- a/.github/actions/e2e/setup-cluster/action.yaml +++ b/.github/actions/e2e/setup-cluster/action.yaml @@ -30,7 +30,7 @@ inputs: default: "1.29" eksctl_version: description: "Version of eksctl to install" - default: v0.169.0 + default: v0.175.0 ip_family: description: "IP Family of the cluster. 
Valid values are IPv4 or IPv6" default: "IPv4" @@ -50,7 +50,7 @@ inputs: runs: using: "composite" steps: - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - uses: ./.github/actions/e2e/install-eksctl @@ -152,11 +152,9 @@ runs: minSize: 2 maxSize: 2 iam: + withAddonPolicies: + cloudWatch: true instanceRolePermissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - taints: - - key: CriticalAddonsOnly - value: "true" - effect: NoSchedule cloudWatch: clusterLogging: enableTypes: ["*"] @@ -175,6 +173,8 @@ runs: $KARPENTER_IAM withOIDC: true addons: + - name: amazon-cloudwatch-observability + permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - name: vpc-cni permissionsBoundary: "arn:aws:iam::$ACCOUNT_ID:policy/GithubActionsPermissionsBoundary" - name: coredns @@ -211,6 +211,11 @@ runs: else eksctl ${cmd} cluster -f clusterconfig.yaml fi + + # Adding taints after all necessary pods have been scheduled to the managed node group nodes + # amazon-cloudwatch-observability pods do not tolerate CriticalAddonsOnly=true:NoSchedule and + # the amazon-cloudwatch-observability addon does not allow adding tolerations to the addon pods as part of the advanced configuration + kubectl taint nodes CriticalAddonsOnly=true:NoSchedule --all - name: tag oidc provider of the cluster if: always() shell: bash diff --git a/.github/actions/e2e/slack/notify/action.yaml b/.github/actions/e2e/slack/notify/action.yaml index 3fadcd90954f..91160181ef54 100644 --- a/.github/actions/e2e/slack/notify/action.yaml +++ b/.github/actions/e2e/slack/notify/action.yaml @@ -17,7 +17,7 @@ inputs: runs: using: "composite" steps: - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - id: get-run-name diff --git a/.github/actions/e2e/upgrade-crds/action.yaml b/.github/actions/e2e/upgrade-crds/action.yaml index 23c28d7fb1c5..8b9242097642 100644 --- a/.github/actions/e2e/upgrade-crds/action.yaml +++ b/.github/actions/e2e/upgrade-crds/action.yaml @@ -24,7 +24,7 @@ runs: role-to-assume: arn:aws:iam::${{ inputs.account_id }}:role/${{ inputs.role }} aws-region: ${{ inputs.region }} role-duration-seconds: 21600 - - uses: actions/checkout@9bb56186c3b09b4f86b1c65136769dd318469633 # v4.1.2 + - uses: actions/checkout@1d96c772d19495a3b5c517cd2bc0cb401ea0529f # v4.1.3 with: ref: ${{ inputs.git_ref }} - name: install-karpenter diff --git a/.github/workflows/approval-comment.yaml b/.github/workflows/approval-comment.yaml index 34dfcafe9c05..ddd05e1e2a6a 100644 --- a/.github/workflows/approval-comment.yaml +++ b/.github/workflows/approval-comment.yaml @@ -19,7 +19,7 @@ jobs: mkdir -p /tmp/artifacts { echo "$REVIEW_BODY"; echo "$PULL_REQUEST_NUMBER"; echo "$COMMIT_ID"; } >> /tmp/artifacts/metadata.txt cat /tmp/artifacts/metadata.txt - - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # v4.3.1 + - uses: actions/upload-artifact@1746f4ab65b179e0ea60a494b83293b640dd5bba # v4.3.2 with: name: artifacts path: /tmp/artifacts diff --git a/.github/workflows/e2e-upgrade.yaml b/.github/workflows/e2e-upgrade.yaml index 0ad5a6e3f3e0..2c6b8a8c0f28 100644 --- a/.github/workflows/e2e-upgrade.yaml +++ b/.github/workflows/e2e-upgrade.yaml @@ -90,7 +90,7 @@ jobs: region: ${{ inputs.region }} cluster_name: ${{
steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} - eksctl_version: v0.169.0 + eksctl_version: v0.175.0 ip_family: IPv4 # Set the value to IPv6 if IPv6 suite, else IPv4 git_ref: ${{ inputs.from_git_ref }} ecr_account_id: ${{ vars.SNAPSHOT_ACCOUNT_ID }} @@ -135,6 +135,15 @@ jobs: url: ${{ secrets.SLACK_WEBHOOK_URL }} suite: Upgrade git_ref: ${{ inputs.to_git_ref }} + - name: add log retention policy + if: ${{ inputs.workflow_trigger != 'private_cluster' }} + env: + CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + run: | + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30 - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs if: failure() || cancelled() diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml index dc120754be4a..636c19c77b9c 100644 --- a/.github/workflows/e2e.yaml +++ b/.github/workflows/e2e.yaml @@ -132,7 +132,7 @@ jobs: region: ${{ inputs.region }} cluster_name: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} k8s_version: ${{ inputs.k8s_version }} - eksctl_version: v0.169.0 + eksctl_version: v0.175.0 ip_family: ${{ contains(inputs.suite, 'IPv6') && 'IPv6' || 'IPv4' }} # Set the value to IPv6 if IPv6 suite, else IPv4 private_cluster: ${{ inputs.workflow_trigger == 'private_cluster' }} git_ref: ${{ inputs.git_ref }} @@ -187,6 +187,15 @@ jobs: suite: ${{ inputs.suite }} git_ref: ${{ inputs.git_ref }} workflow_trigger: ${{ inputs.workflow_trigger }} + - name: add log retention policy + if: ${{ inputs.workflow_trigger != 'private_cluster' }} + env: + CLUSTER_NAME: ${{ steps.generate-cluster-name.outputs.CLUSTER_NAME }} + run: | + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/application --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/dataplane --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/host --retention-in-days 30 + aws logs put-retention-policy --log-group-name /aws/containerinsights/"$CLUSTER_NAME"/performance --retention-in-days 30 - name: dump logs on failure uses: ./.github/actions/e2e/dump-logs if: (failure() || cancelled()) && inputs.workflow_trigger != 'private_cluster' diff --git a/charts/karpenter-crd/Chart.yaml b/charts/karpenter-crd/Chart.yaml index 70d122d7e5ec..ce3066a47213 100644 --- a/charts/karpenter-crd/Chart.yaml +++ b/charts/karpenter-crd/Chart.yaml @@ -1,6 +1,6 @@ apiVersion: v2 name: karpenter-crd -description: A Helm chart for Karpenter Custom Resource Definitions (CRDs) +description: A Helm chart for Karpenter Custom Resource Definitions (CRDs). 
type: application version: 0.36.0 appVersion: 0.36.0 diff --git a/charts/karpenter-crd/README.md b/charts/karpenter-crd/README.md new file mode 100644 index 000000000000..566d9a7efa12 --- /dev/null +++ b/charts/karpenter-crd/README.md @@ -0,0 +1,15 @@ +# karpenter-crd + +![Version: 0.36.0](https://img.shields.io/badge/Version-0.36.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 0.36.0](https://img.shields.io/badge/AppVersion-0.36.0-informational?style=flat-square) + +A Helm chart for Karpenter Custom Resource Definitions (CRDs). + +**Homepage:** + +## Source Code + +* + +---------------------------------------------- + +Autogenerated from chart metadata using [helm-docs](https://github.com/norwoodj/helm-docs/). diff --git a/charts/karpenter-crd/README.md.gotmpl b/charts/karpenter-crd/README.md.gotmpl new file mode 100644 index 000000000000..7991d3c23035 --- /dev/null +++ b/charts/karpenter-crd/README.md.gotmpl @@ -0,0 +1,20 @@ +{{ template "chart.header" . }} +{{ template "chart.deprecationWarning" . }} + +{{ template "chart.badgesSection" . }} + +{{ template "chart.description" . }} + +{{ template "chart.homepageLine" . }} + +{{ template "chart.maintainersSection" . }} + +{{ template "chart.sourcesSection" . }} + +{{ template "chart.requirementsSection" . }} + +{{ template "chart.valuesSection" . }} + +---------------------------------------------- + +Autogenerated from chart metadata using [helm-docs](https://github.com/norwoodj/helm-docs/). diff --git a/charts/karpenter-crd/artifacthub-repo.yaml b/charts/karpenter-crd/artifacthub-repo.yaml index 194c8d2496ea..814833c19915 100644 --- a/charts/karpenter-crd/artifacthub-repo.yaml +++ b/charts/karpenter-crd/artifacthub-repo.yaml @@ -1,4 +1,4 @@ -repositoryID: fda7ffc4-4672-4218-8264-321ec3b4e3cc +repositoryID: 2cfb6f76-afe1-447f-b036-cd2e230d07d7 owners: [] # - name: awsadmin1 # email: artifacthub1@aws.com diff --git a/charts/karpenter-crd/values.yaml b/charts/karpenter-crd/values.yaml new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/charts/karpenter/README.md b/charts/karpenter/README.md index 497818969d2e..3ba2824468c6 100644 --- a/charts/karpenter/README.md +++ b/charts/karpenter/README.md @@ -111,3 +111,6 @@ cosign verify public.ecr.aws/karpenter/karpenter:0.36.0 \ | webhook.metrics.port | int | `8001` | The container port to use for webhook metrics. | | webhook.port | int | `8443` | The container port to use for the webhook. | +---------------------------------------------- + +Autogenerated from chart metadata using [helm-docs](https://github.com/norwoodj/helm-docs/). diff --git a/charts/karpenter/README.md.gotmpl b/charts/karpenter/README.md.gotmpl index 1e641e1ee586..ee4c8d1e4c0c 100644 --- a/charts/karpenter/README.md.gotmpl +++ b/charts/karpenter/README.md.gotmpl @@ -39,4 +39,6 @@ cosign verify public.ecr.aws/karpenter/karpenter:{{ template "chart.version" . } {{ template "chart.valuesSection" . }} -{{ template "helm-docs.versionFooter" . }} +---------------------------------------------- + +Autogenerated from chart metadata using [helm-docs](https://github.com/norwoodj/helm-docs/). 
diff --git a/go.mod b/go.mod index a6b0f98b9f6c..c518c5aa466a 100644 --- a/go.mod +++ b/go.mod @@ -6,14 +6,14 @@ require ( github.com/Pallinder/go-randomdata v1.2.0 github.com/PuerkitoBio/goquery v1.9.1 github.com/avast/retry-go v3.0.0+incompatible - github.com/aws/aws-sdk-go v1.51.21 + github.com/aws/aws-sdk-go v1.51.25 github.com/aws/karpenter-provider-aws/tools/kompat v0.0.0-20240410220356-6b868db24881 github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20240229193347-cfab22a10647 github.com/go-logr/zapr v1.3.0 github.com/imdario/mergo v0.3.16 github.com/mitchellh/hashstructure/v2 v2.0.2 github.com/onsi/ginkgo/v2 v2.17.1 - github.com/onsi/gomega v1.32.0 + github.com/onsi/gomega v1.33.0 github.com/patrickmn/go-cache v2.1.0+incompatible github.com/pelletier/go-toml/v2 v2.2.1 github.com/prometheus/client_golang v1.19.0 diff --git a/go.sum b/go.sum index 42e7c2be239e..b9854fd02f42 100644 --- a/go.sum +++ b/go.sum @@ -54,8 +54,8 @@ github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6 github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= github.com/avast/retry-go v3.0.0+incompatible h1:4SOWQ7Qs+oroOTQOYnAHqelpCO0biHSxpiH9JdtuBj0= github.com/avast/retry-go v3.0.0+incompatible/go.mod h1:XtSnn+n/sHqQIpZ10K1qAevBhOOCWBLXXy3hyiqqBrY= -github.com/aws/aws-sdk-go v1.51.21 h1:UrT6JC9R9PkYYXDZBV0qDKTualMr+bfK2eboTknMgbs= -github.com/aws/aws-sdk-go v1.51.21/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk= +github.com/aws/aws-sdk-go v1.51.25 h1:DjTT8mtmsachhV6yrXR8+yhnG6120dazr720nopRsls= +github.com/aws/aws-sdk-go v1.51.25/go.mod h1:LF8svs817+Nz+DmiMQKTO3ubZ/6IaTpq3TjupRn3Eqk= github.com/aws/karpenter-provider-aws/tools/kompat v0.0.0-20240410220356-6b868db24881 h1:m9rhsGhdepdQV96tZgfy68oU75AWAjOH8u65OefTjwA= github.com/aws/karpenter-provider-aws/tools/kompat v0.0.0-20240410220356-6b868db24881/go.mod h1:+Mk5k0b6HpKobxNq+B56DOhZ+I/NiPhd5MIBhQMSTSs= github.com/awslabs/amazon-eks-ami/nodeadm v0.0.0-20240229193347-cfab22a10647 h1:8yRBVsjGmI7qQsPWtIrbWP+XfwHO9Wq7gdLVzjqiZFs= @@ -272,8 +272,8 @@ github.com/olekukonko/tablewriter v0.0.5 h1:P2Ga83D34wi1o9J6Wh1mRuqd4mF/x/lgBS7N github.com/olekukonko/tablewriter v0.0.5/go.mod h1:hPp6KlRPjbx+hW8ykQs1w3UBbZlj6HuIJcUGPhkA7kY= github.com/onsi/ginkgo/v2 v2.17.1 h1:V++EzdbhI4ZV4ev0UTIj0PzhzOcReJFyJaLjtSF55M8= github.com/onsi/ginkgo/v2 v2.17.1/go.mod h1:llBI3WDLL9Z6taip6f33H76YcWtJv+7R3HigUjbIBOs= -github.com/onsi/gomega v1.32.0 h1:JRYU78fJ1LPxlckP6Txi/EYqJvjtMrDC04/MM5XRHPk= -github.com/onsi/gomega v1.32.0/go.mod h1:a4x4gW6Pz2yK1MAmvluYme5lvYTn61afQ2ETw/8n4Lg= +github.com/onsi/gomega v1.33.0 h1:snPCflnZrpMsy94p4lXVEkHo12lmPnc3vY5XBbreexE= +github.com/onsi/gomega v1.33.0/go.mod h1:+925n5YtiFsLzzafLUHzVMBpvvRAzrydIBiSIxjX3wY= github.com/patrickmn/go-cache v2.1.0+incompatible h1:HRMgzkcYKYpi3C8ajMPV8OFXaaRUnok+kx1WdO15EQc= github.com/patrickmn/go-cache v2.1.0+incompatible/go.mod h1:3Qf8kWWT7OJRJbdiICTKqZju1ZixQ/KpMGzzAfe6+WQ= github.com/pelletier/go-toml/v2 v2.2.1 h1:9TA9+T8+8CUCO2+WYnDLCgrYi9+omqKXyjDtosvtEhg= diff --git a/hack/docs/instancetypes_gen_docs.go b/hack/docs/instancetypes_gen_docs.go index 68736fb5e823..074dd4c7222b 100644 --- a/hack/docs/instancetypes_gen_docs.go +++ b/hack/docs/instancetypes_gen_docs.go @@ -98,6 +98,12 @@ func main() { cp := awscloudprovider.New(op.InstanceTypesProvider, op.InstanceProvider, op.EventRecorder, op.GetClient(), op.AMIProvider, op.SecurityGroupProvider, op.SubnetProvider) + if err := op.InstanceTypesProvider.UpdateInstanceTypes(ctx); err != nil { + 
log.Fatalf("updating instance types, %s", err) + } + if err := op.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx); err != nil { + log.Fatalf("updating instance types offerings, %s", err) + } instanceTypes, err := cp.GetInstanceTypes(ctx, nil) if err != nil { log.Fatalf("listing instance types, %s", err) diff --git a/hack/release/common.sh b/hack/release/common.sh index 04a1c26286d4..00d5c96e0ea3 100644 --- a/hack/release/common.sh +++ b/hack/release/common.sh @@ -76,7 +76,7 @@ build() { } publishHelmChart() { - local oci_repo helm_chart version commit_sha build_date ah_config_file_name helm_chart_artifact helm_chart_digest + local oci_repo helm_chart version commit_sha build_date helm_chart_artifact helm_chart_digest oci_repo="${1}" helm_chart="${2}" @@ -84,21 +84,15 @@ publishHelmChart() { commit_sha="${4}" build_date="${5}" - ah_config_file_name="${helm_chart}/artifacthub-repo.yaml" helm_chart_artifact="${helm_chart}-${version}.tgz" + updateAhConfig "${oci_repo}" "${helm_chart}" + yq e -i ".appVersion = \"${version}\"" "charts/${helm_chart}/Chart.yaml" yq e -i ".version = \"${version}\"" "charts/${helm_chart}/Chart.yaml" cd charts - if [[ -s "${ah_config_file_name}" ]] && [[ "$oci_repo" == "${RELEASE_REPO_ECR}" ]]; then - # ECR requires us to create an empty config file for an alternative - # media type artifact push rather than /dev/null - # https://github.com/aws/containers-roadmap/issues/1074 - temp=$(mktemp) - echo {} > "${temp}" - oras push "${oci_repo}${helm_chart}:artifacthub.io" --config "${temp}:application/vnd.cncf.artifacthub.config.v1+yaml" "${ah_config_file_name}:application/vnd.cncf.artifacthub.repository-metadata.layer.v1.yaml" - fi + helm dependency update "${helm_chart}" helm lint "${helm_chart}" helm package "${helm_chart}" --version "${version}" @@ -110,6 +104,41 @@ publishHelmChart() { cosignOciArtifact "${version}" "${commit_sha}" "${build_date}" "${oci_repo}${helm_chart}:${version}@${helm_chart_digest}" } +updateAhConfig() { + local oci_repo helm_chart ah_config_path image_config_path image_config media_type oci_repository oci_image old_config_digest blob_digest + + oci_repo="${1}" + helm_chart="${2}" + + ah_config_path="./charts/${helm_chart}/artifacthub-repo.yaml" + + if [[ -f "${ah_config_path}" ]] && [[ "${oci_repo}" == "${RELEASE_REPO_ECR}" ]]; then + # ECR requires us to create an empty config file for an alternative + # media type artifact push rather than /dev/null + # https://github.com/aws/containers-roadmap/issues/1074 + image_config_path="$(mktemp)" + echo "{}" > "${image_config_path}" + + image_config="${image_config_path}:application/vnd.cncf.artifacthub.config.v1+yaml" + media_type="application/vnd.cncf.artifacthub.repository-metadata.layer.v1.yaml" + oci_repository="${oci_repo}${helm_chart}" + oci_image="${oci_repository}:artifacthub.io" + + old_config_digest="$(crane digest "${oci_image}" || true)" + + if [[ -n "${old_config_digest}" ]]; then + blob_digest="$(oras manifest fetch --output - "${oci_repository}@${old_config_digest}" | jq -r --arg mediaType "${media_type}" '.layers[] | select(.mediaType == $mediaType) | .digest')" + + if [[ "$(oras blob fetch --output - "${oci_repository}@${blob_digest}")" != "$(cat "${ah_config_path}")" ]]; then + oras push --config "${image_config}" "${oci_image}" "${ah_config_path}:${media_type}" + crane delete "${oci_repository}@${old_config_digest}" + fi + else + oras push --config "${image_config}" "${oci_image}" "${ah_config_path}:${media_type}" + fi + fi +} + cosignOciArtifact() { local version commit_sha 
build_date artifact diff --git a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml index fe04920f8dad..15cd0e2fc34e 100644 --- a/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +++ b/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml @@ -168,12 +168,6 @@ spec: format: int64 type: integer volumeSize: - allOf: - - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - - pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ - anyOf: - - type: integer - - type: string description: |- VolumeSize in `Gi`, `G`, `Ti`, or `T`. You must specify either a snapshot ID or a volume size. The following are the supported volumes sizes for each volume @@ -190,7 +184,8 @@ spec: * standard: 1-1,024 - x-kubernetes-int-or-string: true + pattern: ^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$ + type: string volumeType: description: |- VolumeType of the block device. diff --git a/pkg/apis/v1beta1/ec2nodeclass.go b/pkg/apis/v1beta1/ec2nodeclass.go index f9515b5e4d2f..66a2926b2ced 100644 --- a/pkg/apis/v1beta1/ec2nodeclass.go +++ b/pkg/apis/v1beta1/ec2nodeclass.go @@ -292,7 +292,8 @@ type BlockDevice struct { // + TODO: Add the CEL resources.quantity type after k8s 1.29 // + https://github.com/kubernetes/apiserver/commit/b137c256373aec1c5d5810afbabb8932a19ecd2a#diff-838176caa5882465c9d6061febd456397a3e2b40fb423ed36f0cabb1847ecb4dR190 // +kubebuilder:validation:Pattern:="^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$" - // +kubebuilder:validation:XIntOrString + // +kubebuilder:validation:Schemaless + // +kubebuilder:validation:Type:=string // +optional VolumeSize *resource.Quantity `json:"volumeSize,omitempty" hash:"string"` // VolumeType of the block device. 
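Editor's note: the CRD change above drops the int-or-string form of `volumeSize` in favor of a plain string gated by a single validation pattern, so values now need a unit suffix (`Gi`, `G`, `Ti`, or `T`). Below is a minimal Go sketch for sanity-checking candidate values against that pattern locally; the sample values are illustrative only, and it assumes Go's RE2 engine treats this pattern the same way the API server's schema validation does.

```go
package main

import (
	"fmt"
	"regexp"
)

// volumeSizePattern mirrors the kubebuilder validation pattern applied to
// spec.blockDeviceMappings[].ebs.volumeSize in the updated EC2NodeClass CRD.
var volumeSizePattern = regexp.MustCompile(`^((?:[1-9][0-9]{0,3}|[1-4][0-9]{4}|[5][0-8][0-9]{3}|59000)Gi|(?:[1-9][0-9]{0,3}|[1-5][0-9]{4}|[6][0-3][0-9]{3}|64000)G|([1-9]||[1-5][0-7]|58)Ti|([1-9]||[1-5][0-9]|6[0-3]|64)T)$`)

func main() {
	// Illustrative values: in-range, unit-suffixed quantities pass;
	// bare integers and out-of-range sizes do not.
	for _, v := range []string{"100Gi", "64000G", "58Ti", "100", "0Gi"} {
		fmt.Printf("%-8s valid=%t\n", v, volumeSizePattern.MatchString(v))
	}
}
```

Bare integers such as `100` fail the pattern, which is consistent with the kubebuilder marker switching from `XIntOrString` to `Type:=string` in `ec2nodeclass.go`.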
diff --git a/pkg/cloudprovider/drift.go b/pkg/cloudprovider/drift.go index 6f0b2cf22de0..f40455ad3837 100644 --- a/pkg/cloudprovider/drift.go +++ b/pkg/cloudprovider/drift.go @@ -25,9 +25,6 @@ import ( corev1beta1 "sigs.k8s.io/karpenter/pkg/apis/v1beta1" "sigs.k8s.io/karpenter/pkg/cloudprovider" - "github.com/aws/aws-sdk-go/aws" - "github.com/aws/aws-sdk-go/service/ec2" - "github.com/aws/karpenter-provider-aws/pkg/apis/v1beta1" "github.com/aws/karpenter-provider-aws/pkg/providers/amifamily" "github.com/aws/karpenter-provider-aws/pkg/providers/instance" @@ -54,7 +51,7 @@ func (c *CloudProvider) isNodeClassDrifted(ctx context.Context, nodeClaim *corev if err != nil { return "", fmt.Errorf("calculating ami drift, %w", err) } - securitygroupDrifted, err := c.areSecurityGroupsDrifted(ctx, instance, nodeClass) + securitygroupDrifted, err := c.areSecurityGroupsDrifted(instance, nodeClass) if err != nil { return "", fmt.Errorf("calculating securitygroup drift, %w", err) } @@ -114,14 +111,10 @@ func (c *CloudProvider) isSubnetDrifted(instance *instance.Instance, nodeClass * // Checks if the security groups are drifted, by comparing the security groups returned from the SecurityGroupProvider // to the ec2 instance security groups -func (c *CloudProvider) areSecurityGroupsDrifted(ctx context.Context, ec2Instance *instance.Instance, nodeClass *v1beta1.EC2NodeClass) (cloudprovider.DriftReason, error) { - securitygroup, err := c.securityGroupProvider.List(ctx, nodeClass) - if err != nil { - return "", err - } - securityGroupIds := sets.New(lo.Map(securitygroup, func(sg *ec2.SecurityGroup, _ int) string { return aws.StringValue(sg.GroupId) })...) +func (c *CloudProvider) areSecurityGroupsDrifted(ec2Instance *instance.Instance, nodeClass *v1beta1.EC2NodeClass) (cloudprovider.DriftReason, error) { + securityGroupIds := sets.New(lo.Map(nodeClass.Status.SecurityGroups, func(sg v1beta1.SecurityGroup, _ int) string { return sg.ID })...) 
if len(securityGroupIds) == 0 { - return "", fmt.Errorf("no security groups are discovered") + return "", fmt.Errorf("no security groups are present in the status") } if !securityGroupIds.Equal(sets.New(ec2Instance.SecurityGroupIDs...)) { diff --git a/pkg/cloudprovider/suite_test.go b/pkg/cloudprovider/suite_test.go index 8df5c065eed8..d226c2b19651 100644 --- a/pkg/cloudprovider/suite_test.go +++ b/pkg/cloudprovider/suite_test.go @@ -140,18 +140,35 @@ var _ = Describe("CloudProvider", func() { }, }, }) - nodeClass.Status.Subnets = []v1beta1.Subnet{ - { - ID: "subnet-test1", - Zone: "test-zone-1a", - }, - { - ID: "subnet-test2", - Zone: "test-zone-1b", + nodeClass.Status = v1beta1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + Name: "securityGroup-test1", + }, + { + ID: "sg-test2", + Name: "securityGroup-test2", + }, + { + ID: "sg-test3", + Name: "securityGroup-test3", + }, }, - { - ID: "subnet-test3", - Zone: "test-zone-1c", + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, }, } Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) @@ -612,6 +629,11 @@ var _ = Describe("CloudProvider", func() { Zone: "zone-2", }, } + nodeClass.Status.SecurityGroups = []v1beta1.SecurityGroup{ + { + ID: validSecurityGroup, + }, + } ExpectApplied(ctx, env.Client, nodePool, nodeClass) instanceTypes, err := cloudProvider.GetInstanceTypes(ctx, nodePool) Expect(err).ToNot(HaveOccurred()) @@ -698,7 +720,8 @@ var _ = Describe("CloudProvider", func() { Expect(isDrifted).To(BeEmpty()) }) It("should return an error if the security groups are empty", func() { - awsEnv.EC2API.DescribeSecurityGroupsOutput.Set(&ec2.DescribeSecurityGroupsOutput{SecurityGroups: []*ec2.SecurityGroup{}}) + nodeClass.Status.SecurityGroups = []v1beta1.SecurityGroup{} + ExpectApplied(ctx, env.Client, nodeClass) // Instance is a reference to what we return in the GetInstances call instance.SecurityGroups = []*ec2.GroupIdentifier{{GroupId: aws.String(fake.SecurityGroupID())}} _, err := cloudProvider.IsDrifted(ctx, nodeClaim) @@ -719,18 +742,17 @@ var _ = Describe("CloudProvider", func() { Expect(isDrifted).To(Equal(cloudprovider.SecurityGroupDrift)) }) It("should return drifted if more security groups are present than instance security groups then discovered from nodeclass", func() { - awsEnv.EC2API.DescribeSecurityGroupsOutput.Set(&ec2.DescribeSecurityGroupsOutput{ - SecurityGroups: []*ec2.SecurityGroup{ - { - GroupId: aws.String(validSecurityGroup), - GroupName: aws.String("test-securitygroup"), - }, - { - GroupId: aws.String(fake.SecurityGroupID()), - GroupName: aws.String("test-securitygroup"), - }, + nodeClass.Status.SecurityGroups = []v1beta1.SecurityGroup{ + { + ID: validSecurityGroup, + Name: "test-securitygroup", }, - }) + { + ID: fake.SecurityGroupID(), + Name: "test-securitygroup", + }, + } + ExpectApplied(ctx, env.Client, nodeClass) isDrifted, err := cloudProvider.IsDrifted(ctx, nodeClaim) Expect(err).ToNot(HaveOccurred()) Expect(isDrifted).To(Equal(cloudprovider.SecurityGroupDrift)) @@ -815,6 +837,11 @@ var _ = Describe("CloudProvider", func() { Zone: "zone-2", }, }, + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: validSecurityGroup, + }, + }, }, } nodeClass.Annotations = lo.Assign(nodeClass.Annotations, map[string]string{v1beta1.AnnotationEC2NodeClassHash: nodeClass.Hash()}) @@ -1058,6 
+1085,13 @@ var _ = Describe("CloudProvider", func() { }, }, }, + Status: v1beta1.EC2NodeClassStatus{ + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + }, + }, + }, }) nodePool2 := coretest.NodePool(corev1beta1.NodePool{ Spec: corev1beta1.NodePoolSpec{ diff --git a/pkg/controllers/providers/instancetype/suite_test.go b/pkg/controllers/providers/instancetype/suite_test.go index 9f3fb844b9b1..b2d7f3fca2ee 100644 --- a/pkg/controllers/providers/instancetype/suite_test.go +++ b/pkg/controllers/providers/instancetype/suite_test.go @@ -89,7 +89,24 @@ var _ = Describe("InstanceType", func() { }) ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) - instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.EC2NodeClass{}) + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.EC2NodeClass{ + Status: v1beta1.EC2NodeClassStatus{ + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + }, + }, + }) Expect(err).To(BeNil()) for i := range instanceTypes { Expect(instanceTypes[i].Name).To(Equal(lo.FromPtr(ec2InstanceTypes[i].InstanceType))) @@ -106,7 +123,24 @@ var _ = Describe("InstanceType", func() { }) ExpectReconcileSucceeded(ctx, controller, types.NamespacedName{}) - instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.EC2NodeClass{}) + instanceTypes, err := awsEnv.InstanceTypesProvider.List(ctx, &corev1beta1.KubeletConfiguration{}, &v1beta1.EC2NodeClass{ + Status: v1beta1.EC2NodeClassStatus{ + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + }, + }, + }) Expect(err).To(BeNil()) Expect(len(instanceTypes)).To(BeNumerically("==", len(ec2InstanceTypes))) diff --git a/pkg/providers/instance/suite_test.go b/pkg/providers/instance/suite_test.go index 1693e2eadf0d..07980af8e0c0 100644 --- a/pkg/providers/instance/suite_test.go +++ b/pkg/providers/instance/suite_test.go @@ -106,11 +106,41 @@ var _ = Describe("InstanceProvider", func() { }, }, }) + nodeClass.Status = v1beta1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + }, + { + ID: "sg-test2", + }, + { + ID: "sg-test3", + }, + }, + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + }, + } + Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed()) }) It("should return an ICE error when all attempted instance types return an ICE error", func() { ExpectApplied(ctx, env.Client, nodeClaim, nodePool, nodeClass) + nodeClass = ExpectExists(ctx, env.Client, nodeClass) awsEnv.EC2API.InsufficientCapacityPools.Set([]fake.CapacityPool{ {CapacityType: corev1beta1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1a"}, {CapacityType: corev1beta1.CapacityTypeOnDemand, InstanceType: "m5.xlarge", Zone: "test-zone-1b"}, diff --git a/pkg/providers/instancetype/instancetype.go b/pkg/providers/instancetype/instancetype.go index 1caaebccdbe1..0d86dd941e8a 100644 --- 
a/pkg/providers/instancetype/instancetype.go +++ b/pkg/providers/instancetype/instancetype.go @@ -101,27 +101,26 @@ func (p *DefaultProvider) List(ctx context.Context, kc *corev1beta1.KubeletConfi defer p.muInstanceTypeInfo.RUnlock() defer p.muInstanceTypeOfferings.RUnlock() + if kc == nil { + kc = &corev1beta1.KubeletConfiguration{} + } + if nodeClass == nil { + nodeClass = &v1beta1.EC2NodeClass{} + } + if len(p.instanceTypesInfo) == 0 { return nil, fmt.Errorf("no instance types found") } if len(p.instanceTypeOfferings) == 0 { return nil, fmt.Errorf("no instance types offerings found") } - - subnets, err := p.subnetProvider.List(ctx, nodeClass) - if err != nil { - return nil, err + if len(nodeClass.Status.Subnets) == 0 { + return nil, fmt.Errorf("no subnets found") } - subnetZones := sets.New[string](lo.Map(subnets, func(s *ec2.Subnet, _ int) string { - return aws.StringValue(s.AvailabilityZone) - })...) - if kc == nil { - kc = &corev1beta1.KubeletConfiguration{} - } - if nodeClass == nil { - nodeClass = &v1beta1.EC2NodeClass{} - } + subnetZones := sets.New(lo.Map(nodeClass.Status.Subnets, func(s v1beta1.Subnet, _ int) string { + return aws.StringValue(&s.Zone) + })...) // Compute fully initialized instance types hash key subnetZonesHash, _ := hashstructure.Hash(subnetZones, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true}) diff --git a/pkg/providers/instancetype/suite_test.go b/pkg/providers/instancetype/suite_test.go index 0e270e24a70b..681f46a62bd4 100644 --- a/pkg/providers/instancetype/suite_test.go +++ b/pkg/providers/instancetype/suite_test.go @@ -155,6 +155,82 @@ var _ = Describe("InstanceTypeProvider", func() { }, }, }) + + nodeClass.Status = v1beta1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + AMIs: []v1beta1.AMI{ + { + ID: "ami-test1", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpDoesNotExist), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpDoesNotExist), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test2", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpExists), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test3", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpExists), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test4", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureArm64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpDoesNotExist), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpDoesNotExist), + ).NodeSelectorRequirements(), + }, + }, + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + }, + { + ID: "sg-test2", + }, + { + ID: "sg-test3", + }, + }, + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + }, + } + windowsNodeClass.Status = 
v1beta1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + AMIs: []v1beta1.AMI{ + { + ID: "ami-window-test1", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1.LabelOSStable, v1.NodeSelectorOpIn, string(v1.Windows)), + scheduling.NewRequirement(v1.LabelWindowsBuild, v1.NodeSelectorOpIn, v1beta1.Windows2022Build), + ).NodeSelectorRequirements(), + }, + }, + SecurityGroups: nodeClass.Status.SecurityGroups, + Subnets: nodeClass.Status.Subnets, + } Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed()) }) @@ -851,6 +927,12 @@ var _ = Describe("InstanceTypeProvider", func() { }) }) It("should launch instances in local zones", func() { + nodeClass.Status.Subnets = []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a-local", + }, + } ExpectApplied(ctx, env.Client, nodePool, nodeClass) pod := coretest.UnschedulablePod(coretest.PodOptions{ NodeRequirements: []v1.NodeSelectorRequirement{{ @@ -861,7 +943,6 @@ var _ = Describe("InstanceTypeProvider", func() { }) ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) ExpectScheduled(ctx, env.Client, pod) - }) Context("Overhead", func() { diff --git a/pkg/providers/launchtemplate/launchtemplate.go b/pkg/providers/launchtemplate/launchtemplate.go index f7d692c6f0de..b90949530f6f 100644 --- a/pkg/providers/launchtemplate/launchtemplate.go +++ b/pkg/providers/launchtemplate/launchtemplate.go @@ -163,13 +163,15 @@ func (p *DefaultProvider) createAMIOptions(ctx context.Context, nodeClass *v1bet if err != nil { return nil, err } + // Relying on the status rather than an API call means that Karpenter is subject to a race + // condition where EC2NodeClass spec changes haven't propagated to the status once a node + // has launched. + // If a user changes their EC2NodeClass and shortly after Karpenter launches a node, + // in the worst case, the node could be drifted and re-created. 
+ // TODO @aengeda: add status generation fields to gate node creation until the status is updated from a spec change // Get constrained security groups - securityGroups, err := p.securityGroupProvider.List(ctx, nodeClass) - if err != nil { - return nil, err - } - if len(securityGroups) == 0 { - return nil, fmt.Errorf("no security groups exist given constraints") + if len(nodeClass.Status.SecurityGroups) == 0 { + return nil, fmt.Errorf("no security groups are present in the status") } options := &amifamily.Options{ ClusterName: options.FromContext(ctx).ClusterName, @@ -177,14 +179,12 @@ func (p *DefaultProvider) createAMIOptions(ctx context.Context, nodeClass *v1bet ClusterCIDR: p.ClusterCIDR.Load(), InstanceProfile: instanceProfile, InstanceStorePolicy: nodeClass.Spec.InstanceStorePolicy, - SecurityGroups: lo.Map(securityGroups, func(s *ec2.SecurityGroup, _ int) v1beta1.SecurityGroup { - return v1beta1.SecurityGroup{ID: aws.StringValue(s.GroupId), Name: aws.StringValue(s.GroupName)} - }), - Tags: tags, - Labels: labels, - CABundle: p.CABundle, - KubeDNSIP: p.KubeDNSIP, - NodeClassName: nodeClass.Name, + SecurityGroups: nodeClass.Status.SecurityGroups, + Tags: tags, + Labels: labels, + CABundle: p.CABundle, + KubeDNSIP: p.KubeDNSIP, + NodeClassName: nodeClass.Name, } if nodeClass.Spec.AssociatePublicIPAddress != nil { options.AssociatePublicIPAddress = nodeClass.Spec.AssociatePublicIPAddress diff --git a/pkg/providers/launchtemplate/suite_test.go b/pkg/providers/launchtemplate/suite_test.go index 6799b9ed3ab8..0aa434cfbf23 100644 --- a/pkg/providers/launchtemplate/suite_test.go +++ b/pkg/providers/launchtemplate/suite_test.go @@ -51,6 +51,7 @@ import ( "sigs.k8s.io/karpenter/pkg/events" coreoptions "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/operator/scheme" + "sigs.k8s.io/karpenter/pkg/scheduling" coretest "sigs.k8s.io/karpenter/pkg/test" . 
"sigs.k8s.io/karpenter/pkg/test/expectations" @@ -146,6 +147,66 @@ var _ = Describe("LaunchTemplate Provider", func() { }, }, }) + nodeClass.Status = v1beta1.EC2NodeClassStatus{ + InstanceProfile: "test-profile", + AMIs: []v1beta1.AMI{ + { + ID: "ami-test1", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpDoesNotExist), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpDoesNotExist), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test2", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpExists), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test3", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureAmd64), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpExists), + ).NodeSelectorRequirements(), + }, + { + ID: "ami-test4", + Requirements: scheduling.NewRequirements( + scheduling.NewRequirement(v1.LabelArchStable, v1.NodeSelectorOpIn, corev1beta1.ArchitectureArm64), + scheduling.NewRequirement(v1beta1.LabelInstanceGPUCount, v1.NodeSelectorOpDoesNotExist), + scheduling.NewRequirement(v1beta1.LabelInstanceAcceleratorCount, v1.NodeSelectorOpDoesNotExist), + ).NodeSelectorRequirements(), + }, + }, + SecurityGroups: []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + }, + { + ID: "sg-test2", + }, + { + ID: "sg-test3", + }, + }, + Subnets: []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + }, + } Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypes(ctx)).To(Succeed()) Expect(awsEnv.InstanceTypesProvider.UpdateInstanceTypeOfferings(ctx)).To(Succeed()) }) @@ -171,6 +232,32 @@ var _ = Describe("LaunchTemplate Provider", func() { }, }, }) + nodeClass2.Status.SecurityGroups = []v1beta1.SecurityGroup{ + { + ID: "sg-test1", + }, + { + ID: "sg-test2", + }, + { + ID: "sg-test3", + }, + } + nodeClass2.Status.Subnets = []v1beta1.Subnet{ + { + ID: "subnet-test1", + Zone: "test-zone-1a", + }, + { + ID: "subnet-test2", + Zone: "test-zone-1b", + }, + { + ID: "subnet-test3", + Zone: "test-zone-1c", + }, + } + pods := []*v1.Pod{ coretest.UnschedulablePod(coretest.PodOptions{NodeRequirements: []v1.NodeSelectorRequirement{ { @@ -1962,6 +2049,8 @@ var _ = Describe("LaunchTemplate Provider", func() { {Tags: map[string]string{"Name": "test-subnet-2"}}, } ExpectApplied(ctx, env.Client, nodePool, nodeClass) + _, err := awsEnv.SubnetProvider.List(ctx, nodeClass) + Expect(err).To(BeNil()) pod := coretest.UnschedulablePod() ExpectProvisioned(ctx, env.Client, cluster, cloudProvider, prov, pod) ExpectScheduled(ctx, env.Client, pod) diff --git a/pkg/providers/subnet/subnet.go b/pkg/providers/subnet/subnet.go index 1a0f391689e6..af4fbbbeea29 100644 --- a/pkg/providers/subnet/subnet.go +++ b/pkg/providers/subnet/subnet.go @@ -103,11 +103,16 @@ func (p *DefaultProvider) List(ctx context.Context, nodeClass *v1beta1.EC2NodeCl // CheckAnyPublicIPAssociations returns a bool indicating whether all referenced subnets assign public IPv4 addresses to EC2 instances created therein func (p *DefaultProvider) 
CheckAnyPublicIPAssociations(ctx context.Context, nodeClass *v1beta1.EC2NodeClass) (bool, error) { - subnets, err := p.List(ctx, nodeClass) - if err != nil { - return false, err + filterSets := getFilterSets(nodeClass.Spec.SubnetSelectorTerms) + if len(filterSets) == 0 { + return false, nil } - _, ok := lo.Find(subnets, func(s *ec2.Subnet) bool { + hash := lo.Must1(hashstructure.Hash(filterSets, hashstructure.FormatV2, &hashstructure.HashOptions{SlicesAsSets: true})) + subnets, ok := p.cache.Get(fmt.Sprint(hash)) + if !ok { + return false, nil + } + _, ok = lo.Find(subnets.([]*ec2.Subnet), func(s *ec2.Subnet) bool { return aws.BoolValue(s.MapPublicIpOnLaunch) }) return ok, nil @@ -115,17 +120,21 @@ func (p *DefaultProvider) CheckAnyPublicIPAssociations(ctx context.Context, node // ZonalSubnetsForLaunch returns a mapping of zone to the subnet with the most available IP addresses and deducts the passed ips from the available count func (p *DefaultProvider) ZonalSubnetsForLaunch(ctx context.Context, nodeClass *v1beta1.EC2NodeClass, instanceTypes []*cloudprovider.InstanceType, capacityType string) (map[string]v1beta1.Subnet, error) { - if len(nodeClass.Status.Subnets) == 0 { + subnets := nodeClass.Status.Subnets + if len(subnets) == 0 { return nil, fmt.Errorf("no subnets matched selector %v", nodeClass.Spec.SubnetSelectorTerms) } p.Lock() defer p.Unlock() // sort subnets in ascending order of available IP addresses and populate map with most available subnet per AZ zonalSubnets := map[string]v1beta1.Subnet{} - sort.Slice(nodeClass.Status.Subnets, func(i, j int) bool { - return p.inflightIPs[nodeClass.Status.Subnets[i].ID] < p.inflightIPs[nodeClass.Status.Subnets[j].ID] + sort.Slice(subnets, func(i, j int) bool { + if p.inflightIPs[nodeClass.Status.Subnets[i].ID] != p.inflightIPs[nodeClass.Status.Subnets[j].ID] { + return p.inflightIPs[nodeClass.Status.Subnets[i].ID] < p.inflightIPs[nodeClass.Status.Subnets[j].ID] + } + return nodeClass.Status.Subnets[i].ID < nodeClass.Status.Subnets[j].ID }) - for _, subnet := range nodeClass.Status.Subnets { + for _, subnet := range subnets { zonalSubnets[subnet.Zone] = subnet } for _, subnet := range zonalSubnets { diff --git a/pkg/providers/subnet/suite_test.go b/pkg/providers/subnet/suite_test.go index 96c42c841d01..bd07bbaa02bb 100644 --- a/pkg/providers/subnet/suite_test.go +++ b/pkg/providers/subnet/suite_test.go @@ -225,6 +225,8 @@ var _ = Describe("SubnetProvider", func() { Tags: map[string]string{"foo": "bar"}, }, } + _, err := awsEnv.SubnetProvider.List(ctx, nodeClass) + Expect(err).To(BeNil()) onlyPrivate, err := awsEnv.SubnetProvider.CheckAnyPublicIPAssociations(ctx, nodeClass) Expect(err).To(BeNil()) Expect(onlyPrivate).To(BeFalse()) @@ -235,6 +237,8 @@ var _ = Describe("SubnetProvider", func() { ID: "subnet-test2", }, } + _, err := awsEnv.SubnetProvider.List(ctx, nodeClass) + Expect(err).To(BeNil()) onlyPrivate, err := awsEnv.SubnetProvider.CheckAnyPublicIPAssociations(ctx, nodeClass) Expect(err).To(BeNil()) Expect(onlyPrivate).To(BeTrue()) diff --git a/website/content/en/docs/concepts/disruption.md b/website/content/en/docs/concepts/disruption.md index d92d1c68fc2c..aa10df2124f3 100644 --- a/website/content/en/docs/concepts/disruption.md +++ b/website/content/en/docs/concepts/disruption.md @@ -190,7 +190,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. 
Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/docs/concepts/nodepools.md b/website/content/en/docs/concepts/nodepools.md index d679bb5f4373..9a9b9a06d57a 100644 --- a/website/content/en/docs/concepts/nodepools.md +++ b/website/content/en/docs/concepts/nodepools.md @@ -157,7 +157,7 @@ spec: duration: 8h nodes: "0" - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. limits: cpu: "1000" diff --git a/website/content/en/docs/concepts/scheduling.md b/website/content/en/docs/concepts/scheduling.md index 76c09cbc5124..696b2bb4afc6 100755 --- a/website/content/en/docs/concepts/scheduling.md +++ b/website/content/en/docs/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. {{% /alert %}} -Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`. - Here is an example of a pod-eni resource defined in a deployment manifest: ``` spec: @@ -626,19 +624,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values The `Exists` operator can be used on a NodePool to provide workload segregation across nodes. ```yaml -... -requirements: -- key: company.com/team - operator: Exists +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +spec: + template: + spec: + requirements: + - key: company.com/team + operator: Exists ... ``` -With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset. +With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements. + +If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. 
This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset. + +For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes. + +#### Team A Deployment ```yaml -nodeSelector: - company.com/team: team-a +apiVersion: apps/v1 +kind: Deployment +metadata: + name: team-a-deployment +spec: + replicas: 5 + template: + spec: + nodeSelector: + company.com/team: team-a ``` + +#### Team A Node + +```yaml +apiVersion: v1 +kind: Node +metadata: + labels: + company.com/team: team-a +``` + +#### Team B Deployment + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: team-b-deployment +spec: + replicas: 5 + template: + spec: + nodeSelector: + company.com/team: team-b +``` + +#### Team B Node + +```yaml +apiVersion: v1 +kind: Node +metadata: + labels: + company.com/team: team-b +``` + {{% alert title="Note" color="primary" %}} If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node. {{% /alert %}} diff --git a/website/content/en/docs/faq.md b/website/content/en/docs/faq.md index c765ef174e2d..3c1b1df14dc0 100644 --- a/website/content/en/docs/faq.md +++ b/website/content/en/docs/faq.md @@ -14,10 +14,10 @@ See [Configuring NodePools]({{< ref "./concepts/#configuring-nodepools" >}}) for AWS is the first cloud provider supported by Karpenter, although it is designed to be used with other cloud providers as well. ### Can I write my own cloud provider for Karpenter? -Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.36.0/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too. +Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.36.1/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too. ### What operating system nodes does Karpenter deploy? -Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). +Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). ### Can I provide my own custom operating system images? Karpenter has multiple mechanisms for configuring the [operating system]({{< ref "./concepts/nodeclasses/#specamiselectorterms" >}}) for your nodes. @@ -26,7 +26,7 @@ Karpenter has multiple mechanisms for configuring the [operating system]({{< ref Karpenter is flexible to multi-architecture configurations using [well known labels]({{< ref "./concepts/scheduling/#supported-labels">}}). ### What RBAC access is required? -All the required RBAC rules can be found in the Helm chart template. See [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole-core.yaml), [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole.yaml), [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/rolebinding.yaml), and [role.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/role.yaml) files for details. +All the required RBAC rules can be found in the Helm chart template.
See [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole-core.yaml), [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole.yaml), [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/rolebinding.yaml), and [role.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/role.yaml) files for details. ### Can I run Karpenter outside of a Kubernetes cluster? Yes, as long as the controller has network and IAM/RBAC access to the Kubernetes API and your provider API. @@ -231,7 +231,7 @@ Karpenter's native interruption handling offers two main benefits over the stand 1. You don't have to manage and maintain a separate component to exclusively handle interruption events. 2. Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa. -### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`? +### Why am I receiving QueueNotFound errors when I set `--interruption-queue`? Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}). diff --git a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md index 89242a872341..9b5891b1658b 100644 --- a/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/docs/getting-started/getting-started-with-karpenter/_index.md @@ -45,7 +45,7 @@ After setting up the tools, set the Karpenter and Kubernetes version: ```bash export KARPENTER_NAMESPACE="kube-system" -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" export K8S_VERSION="1.29" ``` @@ -109,13 +109,13 @@ See [Enabling Windows support](https://docs.aws.amazon.com/eks/latest/userguide/ As the OCI Helm chart is signed by [Cosign](https://github.com/sigstore/cosign) as part of the release process you can verify the chart before installing it by running the following command. ```bash -cosign verify public.ecr.aws/karpenter/karpenter:0.36.0 \ +cosign verify public.ecr.aws/karpenter/karpenter:0.36.1 \ --certificate-oidc-issuer=https://token.actions.githubusercontent.com \ --certificate-identity-regexp='https://github\.com/aws/karpenter-provider-aws/\.github/workflows/release\.yaml@.+' \ --certificate-github-workflow-repository=aws/karpenter-provider-aws \ --certificate-github-workflow-name=Release \ - --certificate-github-workflow-ref=refs/tags/v0.36.0 \ - --annotations version=0.36.0 + --certificate-github-workflow-ref=refs/tags/v0.36.1 \ + --annotations version=0.36.1 ``` {{% alert title="DNS Policy Notice" color="warning" %}} diff --git a/website/content/en/docs/getting-started/migrating-from-cas/_index.md b/website/content/en/docs/getting-started/migrating-from-cas/_index.md index f71116389411..8a053ecb51aa 100644 --- a/website/content/en/docs/getting-started/migrating-from-cas/_index.md +++ b/website/content/en/docs/getting-started/migrating-from-cas/_index.md @@ -92,7 +92,7 @@ One for your Karpenter node role and one for your existing node group. 
First set the Karpenter release you want to deploy. ```bash -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" ``` We can now generate a full Karpenter deployment yaml from the Helm chart. @@ -133,7 +133,7 @@ Now that our deployment is ready we can create the karpenter namespace, create t ## Create default NodePool -We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v0.36.0/examples/v1beta1) for specific needs. +We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v0.36.1/examples/v1beta1) for specific needs. {{% script file="./content/en/{VERSION}/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh" language="bash" %}} diff --git a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh index 47df188dc87d..139bbbd1cd02 100644 --- a/website/content/en/docs/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh +++ b/website/content/en/docs/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh @@ -1,6 +1,6 @@ for NODEGROUP in $(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query 'nodegroups' --output text); do aws ec2 create-tags \ --tags "Key=karpenter.sh/discovery,Value=${CLUSTER_NAME}" \ - --resources "$(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ - --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text )" -done + --resources $(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ + --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text ) +done \ No newline at end of file diff --git a/website/content/en/docs/reference/cloudformation.md b/website/content/en/docs/reference/cloudformation.md index 1e3993b0b1bf..cdd34f44f47f 100644 --- a/website/content/en/docs/reference/cloudformation.md +++ b/website/content/en/docs/reference/cloudformation.md @@ -17,7 +17,7 @@ These descriptions should allow you to understand: To download a particular version of `cloudformation.yaml`, set the version and use `curl` to pull the file to your local system: ```bash -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" curl https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > cloudformation.yaml ``` diff --git a/website/content/en/docs/reference/threat-model.md b/website/content/en/docs/reference/threat-model.md index 71f8beaf3532..9f6cf6fe9c23 100644 --- a/website/content/en/docs/reference/threat-model.md +++ b/website/content/en/docs/reference/threat-model.md @@ -31,11 +31,11 @@ A Cluster Developer has the ability to create pods via `Deployments`, `ReplicaSe Karpenter has permissions to create and manage cloud instances. Karpenter has Kubernetes API permissions to create, update, and remove nodes, as well as evict pods. For a full list of the permissions, see the RBAC rules in the helm chart template. Karpenter also has AWS IAM permissions to create instances with IAM roles. 
-* [aggregate-clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/aggregate-clusterrole.yaml) -* [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole-core.yaml) -* [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole.yaml) -* [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/rolebinding.yaml) -* [role.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/role.yaml) +* [aggregate-clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/aggregate-clusterrole.yaml) +* [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole-core.yaml) +* [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole.yaml) +* [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/rolebinding.yaml) +* [role.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/role.yaml) ## Assumptions diff --git a/website/content/en/docs/troubleshooting.md b/website/content/en/docs/troubleshooting.md index 6dc784007b75..0e6c0d3114b9 100644 --- a/website/content/en/docs/troubleshooting.md +++ b/website/content/en/docs/troubleshooting.md @@ -663,7 +663,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. ### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. 
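For the NTH behavior referenced above, a minimal install sketch using the `aws-node-termination-handler` chart from `eks-charts` is shown below. The value names reflect the chart's IMDS-mode options and should be verified against the chart version you deploy.

```bash
# Hedged sketch: run NTH alongside Karpenter so nodes are drained on Spot
# rebalance recommendations, which Karpenter itself does not act on.
helm repo add eks https://aws.github.io/eks-charts
helm upgrade --install aws-node-termination-handler eks/aws-node-termination-handler \
  --namespace kube-system \
  --set enableSpotInterruptionDraining=true \
  --set enableRebalanceMonitoring=true \
  --set enableRebalanceDraining=true
```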
diff --git a/website/content/en/docs/upgrading/upgrade-guide.md b/website/content/en/docs/upgrading/upgrade-guide.md index c7e84b894521..2ffd380031c8 100644 --- a/website/content/en/docs/upgrading/upgrade-guide.md +++ b/website/content/en/docs/upgrading/upgrade-guide.md @@ -28,23 +28,23 @@ If you get the error `invalid ownership metadata; label validation error:` while In general, you can reapply the CRDs in the `crds` directory of the Karpenter Helm chart: ```shell -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.sh_nodepools.yaml -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.sh_nodeclaims.yaml -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.sh_nodepools.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml ``` -### Upgrading to `0.36.0`+ +### Upgrading to `0.36.1`+ {{% alert title="Warning" color="warning" %}} -`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.0`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. +`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.1`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. {{% /alert %}} {{% alert title="Warning" color="warning" %}} - v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.0, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. + v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.1, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. {{% /alert %}} * Karpenter changed the name of the `karpenter_cloudprovider_instance_type_price_estimate` metric to `karpenter_cloudprovider_instance_type_offering_price_estimate` to align with the new `karpenter_cloudprovider_instance_type_offering_available` metric. The `region` label was also dropped from the metric, since this can be inferred from the environment that Karpenter is running in. @@ -72,7 +72,7 @@ The Ubuntu EKS optimized AMI has moved from 20.04 to 22.04 for Kubernetes 1.29+. 
* `Empty Expiration / Empty Drift / Empty Consolidation`: infinite parallelism * `Non-Empty Expiration / Non-Empty Drift / Single-Node Consolidation`: one node at a time * `Multi-Node Consolidation`: max 100 nodes -* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. +* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. * Karpenter now adds a default `podSecurityContext` that configures the `fsgroup: 65536` of volumes in the pod. If you are using sidecar containers, you should review if this configuration is compatible for them. You can disable this default `podSecurityContext` through helm by performing `--set podSecurityContext=null` when installing/upgrading the chart. * The `dnsPolicy` for the Karpenter controller pod has been changed back to the Kubernetes cluster default of `ClusterFirst`. Setting our `dnsPolicy` to `Default` (confusingly, this is not the Kubernetes cluster default) caused more confusion for any users running IPv6 clusters with dual-stack nodes or anyone running Karpenter with dependencies on cluster services (like clusters running service meshes). This change may be breaking for any users on Fargate or MNG who were allowing Karpenter to manage their in-cluster DNS service (`core-dns` on most clusters). If you still want the old behavior here, you can change the `dnsPolicy` to point to use `Default` by setting the helm value on install/upgrade with `--set dnsPolicy=Default`. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). * Karpenter now disallows `nodepool.spec.template.spec.resources` to be set. The webhook validation never allowed `nodepool.spec.template.spec.resources`. We are now ensuring that CEL validation also disallows `nodepool.spec.template.spec.resources` to be set. If you were previously setting the resources field on your NodePool, ensure that you remove this field before upgrading to the newest version of Karpenter or else updates to the resource may fail on the new version. @@ -103,6 +103,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to `0.32.0`, note that `0.31.4` is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. 
* Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. diff --git a/website/content/en/preview/concepts/disruption.md b/website/content/en/preview/concepts/disruption.md index d92d1c68fc2c..aa10df2124f3 100644 --- a/website/content/en/preview/concepts/disruption.md +++ b/website/content/en/preview/concepts/disruption.md @@ -190,7 +190,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/preview/concepts/nodepools.md b/website/content/en/preview/concepts/nodepools.md index d679bb5f4373..9a9b9a06d57a 100644 --- a/website/content/en/preview/concepts/nodepools.md +++ b/website/content/en/preview/concepts/nodepools.md @@ -157,7 +157,7 @@ spec: duration: 8h nodes: "0" - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. limits: cpu: "1000" diff --git a/website/content/en/preview/concepts/scheduling.md b/website/content/en/preview/concepts/scheduling.md index 76c09cbc5124..696b2bb4afc6 100755 --- a/website/content/en/preview/concepts/scheduling.md +++ b/website/content/en/preview/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. 
{{% /alert %}}

-Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`.
-
Here is an example of a pod-eni resource defined in a deployment manifest:
```
spec:
@@ -626,19 +624,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values
The `Exists` operator can be used on a NodePool to provide workload segregation across nodes.

```yaml
-...
-requirements:
-- key: company.com/team
-  operator: Exists
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool
+spec:
+  template:
+    spec:
+      requirements:
+      - key: company.com/team
+        operator: Exists
...
```

-With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements.
+
+If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+
+For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes.
+
+#### Team A Deployment

```yaml
-nodeSelector:
-  company.com/team: team-a
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-a-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-a
```
+
+#### Team A Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-a
+```
+
+#### Team B Deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-b-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-b
+```
+
+#### Team B Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-b
+```
+
{{% alert title="Note" color="primary" %}}
If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node.
{{% /alert %}}
diff --git a/website/content/en/preview/faq.md b/website/content/en/preview/faq.md
index 4549a26bef94..d9b9118de4cb 100644
--- a/website/content/en/preview/faq.md
+++ b/website/content/en/preview/faq.md
@@ -17,7 +17,7 @@ AWS is the first cloud provider supported by Karpenter, although it is designed
Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree{{< githubRelRef >}}pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too.

### What operating system nodes does Karpenter deploy?
-Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). 
+Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). ### Can I provide my own custom operating system images? Karpenter has multiple mechanisms for configuring the [operating system]({{< ref "./concepts/nodeclasses/#specamiselectorterms" >}}) for your nodes. @@ -231,7 +231,7 @@ Karpenter's native interruption handling offers two main benefits over the stand 1. You don't have to manage and maintain a separate component to exclusively handle interruption events. 2. Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa. -### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`? +### Why am I receiving QueueNotFound errors when I set `--interruption-queue`? Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}). diff --git a/website/content/en/preview/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh b/website/content/en/preview/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh index 47df188dc87d..139bbbd1cd02 100644 --- a/website/content/en/preview/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh +++ b/website/content/en/preview/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh @@ -1,6 +1,6 @@ for NODEGROUP in $(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query 'nodegroups' --output text); do aws ec2 create-tags \ --tags "Key=karpenter.sh/discovery,Value=${CLUSTER_NAME}" \ - --resources "$(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ - --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text )" -done + --resources $(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ + --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text ) +done \ No newline at end of file diff --git a/website/content/en/preview/troubleshooting.md b/website/content/en/preview/troubleshooting.md index 6dc784007b75..0e6c0d3114b9 100644 --- a/website/content/en/preview/troubleshooting.md +++ b/website/content/en/preview/troubleshooting.md @@ -663,7 +663,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. 
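A minimal sketch of the VPC CNI change referenced in the troubleshooting entry above: enabling security groups for pods is done by setting `ENABLE_POD_ENI=true` on the `aws-node` DaemonSet, after which the `vpc.amazonaws.com/pod-eni` resource should appear in node capacity.

```bash
# Turn on security groups for pods in the VPC CNI so nodes advertise
# the vpc.amazonaws.com/pod-eni extended resource.
kubectl set env daemonset aws-node -n kube-system ENABLE_POD_ENI=true

# After the aws-node pods roll, confirm the resource is registered per node.
kubectl get nodes -o jsonpath='{range .items[*]}{.metadata.name}{"\t"}{.status.capacity.vpc\.amazonaws\.com/pod-eni}{"\n"}{end}'
```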
### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. diff --git a/website/content/en/preview/upgrading/upgrade-guide.md b/website/content/en/preview/upgrading/upgrade-guide.md index b991ad2dd856..36ee9af27627 100644 --- a/website/content/en/preview/upgrading/upgrade-guide.md +++ b/website/content/en/preview/upgrading/upgrade-guide.md @@ -41,14 +41,14 @@ WHEN CREATING A NEW SECTION OF THE UPGRADE GUIDANCE FOR NEWER VERSIONS, ENSURE T * Karpenter updated the NodeClass controller naming in the following way: `nodeclass` -> `nodeclass.status`, `nodeclass.hash`, `nodeclass.termination` -### Upgrading to `0.36.0`+ +### Upgrading to `0.36.1`+ {{% alert title="Warning" color="warning" %}} -`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.0`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. +`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.1`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. {{% /alert %}} {{% alert title="Warning" color="warning" %}} - v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.0, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. + v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.1, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. {{% /alert %}} * Karpenter changed the name of the `karpenter_cloudprovider_instance_type_price_estimate` metric to `karpenter_cloudprovider_instance_type_offering_price_estimate` to align with the new `karpenter_cloudprovider_instance_type_offering_available` metric. The `region` label was also dropped from the metric, since this can be inferred from the environment that Karpenter is running in. @@ -76,7 +76,7 @@ The Ubuntu EKS optimized AMI has moved from 20.04 to 22.04 for Kubernetes 1.29+. * `Empty Expiration / Empty Drift / Empty Consolidation`: infinite parallelism * `Non-Empty Expiration / Non-Empty Drift / Single-Node Consolidation`: one node at a time * `Multi-Node Consolidation`: max 100 nodes -* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. 
While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. +* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. * Karpenter now adds a default `podSecurityContext` that configures the `fsgroup: 65536` of volumes in the pod. If you are using sidecar containers, you should review if this configuration is compatible for them. You can disable this default `podSecurityContext` through helm by performing `--set podSecurityContext=null` when installing/upgrading the chart. * The `dnsPolicy` for the Karpenter controller pod has been changed back to the Kubernetes cluster default of `ClusterFirst`. Setting our `dnsPolicy` to `Default` (confusingly, this is not the Kubernetes cluster default) caused more confusion for any users running IPv6 clusters with dual-stack nodes or anyone running Karpenter with dependencies on cluster services (like clusters running service meshes). This change may be breaking for any users on Fargate or MNG who were allowing Karpenter to manage their in-cluster DNS service (`core-dns` on most clusters). If you still want the old behavior here, you can change the `dnsPolicy` to point to use `Default` by setting the helm value on install/upgrade with `--set dnsPolicy=Default`. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). * Karpenter now disallows `nodepool.spec.template.spec.resources` to be set. The webhook validation never allowed `nodepool.spec.template.spec.resources`. We are now ensuring that CEL validation also disallows `nodepool.spec.template.spec.resources` to be set. If you were previously setting the resources field on your NodePool, ensure that you remove this field before upgrading to the newest version of Karpenter or else updates to the resource may fail on the new version. @@ -107,6 +107,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to `0.32.0`, note that `0.31.4` is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. 
This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. diff --git a/website/content/en/v0.32/concepts/disruption.md b/website/content/en/v0.32/concepts/disruption.md index 408e591147a0..b7ebec2b83df 100644 --- a/website/content/en/v0.32/concepts/disruption.md +++ b/website/content/en/v0.32/concepts/disruption.md @@ -174,7 +174,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/v0.32/concepts/nodepools.md b/website/content/en/v0.32/concepts/nodepools.md index 607311c5ef6e..7a369b13f326 100644 --- a/website/content/en/v0.32/concepts/nodepools.md +++ b/website/content/en/v0.32/concepts/nodepools.md @@ -140,7 +140,7 @@ spec: # You can choose to disable expiration entirely by setting the string value 'Never' here expireAfter: 720h - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. limits: cpu: "1000" diff --git a/website/content/en/v0.32/concepts/scheduling.md b/website/content/en/v0.32/concepts/scheduling.md index 466b54910454..a52f1c0aa3e1 100755 --- a/website/content/en/v0.32/concepts/scheduling.md +++ b/website/content/en/v0.32/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. {{% /alert %}} -Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`. - Here is an example of a pod-eni resource defined in a deployment manifest: ``` spec: @@ -625,19 +623,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values The `Exists` operator can be used on a NodePool to provide workload segregation across nodes. ```yaml -... 
-requirements:
-- key: company.com/team
-  operator: Exists
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool
+spec:
+  template:
+    spec:
+      requirements:
+      - key: company.com/team
+        operator: Exists
...
```

-With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements.
+
+If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+
+For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes.
+
+#### Team A Deployment

```yaml
-nodeSelector:
-  company.com/team: team-a
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-a-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-a
```
+
+#### Team A Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-a
+```
+
+#### Team B Deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-b-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-b
+```
+
+#### Team B Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-b
+```
+
{{% alert title="Note" color="primary" %}}
If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node.
{{% /alert %}}
diff --git a/website/content/en/v0.32/faq.md b/website/content/en/v0.32/faq.md
index eb3a46e16351..cca4304b2d07 100644
--- a/website/content/en/v0.32/faq.md
+++ b/website/content/en/v0.32/faq.md
@@ -227,7 +227,7 @@ Karpenter's native interruption handling offers two main benefits over the stand
1. You don't have to manage and maintain a separate component to exclusively handle interruption events.
2. Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa.

-### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`?
+### Why am I receiving QueueNotFound errors when I set `--interruption-queue`?
Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}).
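Tying the FAQ above together, a hedged configuration sketch: the queue named here must already exist (it is normally created by the Getting Started CloudFormation stack and named after the cluster), and the Helm value is written below as `settings.interruptionQueue`, which the chart renders as the `--interruption-queue` argument; verify the value name against your chart version. The cluster name is a placeholder.

```bash
export CLUSTER_NAME="my-cluster"   # placeholder

# QueueNotFound means this lookup fails in the account/region Karpenter uses.
aws sqs get-queue-url --queue-name "${CLUSTER_NAME}"

# Point Karpenter at the queue; the chart renders this as --interruption-queue.
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --version "${KARPENTER_VERSION}" \
  --namespace "${KARPENTER_NAMESPACE}" \
  --set settings.interruptionQueue="${CLUSTER_NAME}" \
  --reuse-values
```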
diff --git a/website/content/en/v0.32/troubleshooting.md b/website/content/en/v0.32/troubleshooting.md index cb51859789d9..a5db4f33597e 100644 --- a/website/content/en/v0.32/troubleshooting.md +++ b/website/content/en/v0.32/troubleshooting.md @@ -651,7 +651,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. ### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. diff --git a/website/content/en/v0.32/upgrading/upgrade-guide.md b/website/content/en/v0.32/upgrading/upgrade-guide.md index 5deb0741f0b2..91bf44070f22 100644 --- a/website/content/en/v0.32/upgrading/upgrade-guide.md +++ b/website/content/en/v0.32/upgrading/upgrade-guide.md @@ -46,6 +46,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to v0.32.0, note that v0.31.4 is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. 
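The chart-level switches called out in the upgrade bullets above can all be set at upgrade time; a hedged sketch follows, using the value names cited in those notes.

```bash
# Sketch only: flip the values discussed above while upgrading in place.
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --version "${KARPENTER_VERSION}" \
  --namespace "${KARPENTER_NAMESPACE}" \
  --set logLevel=debug \
  --set webhook.enabled=false \
  --reuse-values

# If the webhook stays enabled but port 8001 is taken (e.g. hostNetworking),
# relocate its metrics listener instead:
#   --set webhook.metrics.port=8002
```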
diff --git a/website/content/en/v0.34/concepts/disruption.md b/website/content/en/v0.34/concepts/disruption.md index 48e0b479c5f3..aa5adede7dc6 100644 --- a/website/content/en/v0.34/concepts/disruption.md +++ b/website/content/en/v0.34/concepts/disruption.md @@ -186,7 +186,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/v0.34/concepts/nodepools.md b/website/content/en/v0.34/concepts/nodepools.md index cb85d5988475..c7f6a6072f86 100644 --- a/website/content/en/v0.34/concepts/nodepools.md +++ b/website/content/en/v0.34/concepts/nodepools.md @@ -150,7 +150,7 @@ spec: duration: 8h nodes: "0" - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. limits: cpu: "1000" diff --git a/website/content/en/v0.34/concepts/scheduling.md b/website/content/en/v0.34/concepts/scheduling.md index ef12a29cc347..a121b1e7c9c8 100755 --- a/website/content/en/v0.34/concepts/scheduling.md +++ b/website/content/en/v0.34/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. {{% /alert %}} -Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`. - Here is an example of a pod-eni resource defined in a deployment manifest: ``` spec: @@ -625,19 +623,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values The `Exists` operator can be used on a NodePool to provide workload segregation across nodes. ```yaml -... -requirements: -- key: company.com/team - operator: Exists +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +spec: + template: + spec: + requirements: + - key: company.com/team + operator: Exists ... ``` -With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset. 
+With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements.
+
+If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+
+For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes.
+
+#### Team A Deployment

```yaml
-nodeSelector:
-  company.com/team: team-a
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-a-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-a
```
+
+#### Team A Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-a
+```
+
+#### Team B Deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-b-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-b
+```
+
+#### Team B Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-b
+```
+
{{% alert title="Note" color="primary" %}}
If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node.
{{% /alert %}}
diff --git a/website/content/en/v0.34/faq.md b/website/content/en/v0.34/faq.md
index 5096c34f69db..b7e3c91adcaa 100644
--- a/website/content/en/v0.34/faq.md
+++ b/website/content/en/v0.34/faq.md
@@ -17,7 +17,7 @@ AWS is the first cloud provider supported by Karpenter, although it is designed
Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.34.5/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too.

### What operating system nodes does Karpenter deploy?
-Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). 
+Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}).

### Can I provide my own custom operating system images?
Karpenter has multiple mechanisms for configuring the [operating system]({{< ref "./concepts/nodeclasses/#specamiselectorterms" >}}) for your nodes.
@@ -231,7 +231,7 @@ Karpenter's native interruption handling offers two main benefits over the stand
1. You don't have to manage and maintain a separate component to exclusively handle interruption events.
2. Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa.

-### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`?
+### Why am I receiving QueueNotFound errors when I set `--interruption-queue`?
Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}).
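Looping back to the workload-segregation example above (the `company.com/team` `Exists` requirement): once the team deployments are running, a quick hedged check that Karpenter stamped the label onto the nodes it launched is shown below.

```bash
# List only Karpenter-launched nodes (those carrying the karpenter.sh/nodepool
# label) with the team label as an extra column; team-a and team-b pods should
# land on nodes with different values.
kubectl get nodes -l karpenter.sh/nodepool -L company.com/team
```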
diff --git a/website/content/en/v0.34/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh b/website/content/en/v0.34/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh index de972ea2bddd..139bbbd1cd02 100644 --- a/website/content/en/v0.34/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh +++ b/website/content/en/v0.34/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh @@ -1,6 +1,6 @@ -for NODEGROUP in $(aws eks list-nodegroups --cluster-name ${CLUSTER_NAME} \ - --query 'nodegroups' --output text); do aws ec2 create-tags \ +for NODEGROUP in $(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query 'nodegroups' --output text); do + aws ec2 create-tags \ --tags "Key=karpenter.sh/discovery,Value=${CLUSTER_NAME}" \ - --resources $(aws eks describe-nodegroup --cluster-name ${CLUSTER_NAME} \ - --nodegroup-name $NODEGROUP --query 'nodegroup.subnets' --output text ) -done + --resources $(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ + --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text ) +done \ No newline at end of file diff --git a/website/content/en/v0.34/troubleshooting.md b/website/content/en/v0.34/troubleshooting.md index ee912f7fe180..5cef2b9a0eff 100644 --- a/website/content/en/v0.34/troubleshooting.md +++ b/website/content/en/v0.34/troubleshooting.md @@ -663,7 +663,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. ### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. diff --git a/website/content/en/v0.34/upgrading/upgrade-guide.md b/website/content/en/v0.34/upgrading/upgrade-guide.md index fbe1e7ba16df..18073f128e94 100644 --- a/website/content/en/v0.34/upgrading/upgrade-guide.md +++ b/website/content/en/v0.34/upgrading/upgrade-guide.md @@ -50,7 +50,7 @@ The Ubuntu EKS optimized AMI has moved from 20.04 to 22.04 for Kubernetes 1.29+. * `Empty Expiration / Empty Drift / Empty Consolidation`: infinite parallelism * `Non-Empty Expiration / Non-Empty Drift / Single-Node Consolidation`: one node at a time * `Multi-Node Consolidation`: max 100 nodes -* To support Disruption Budgets, v0.34+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. 
This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter v0.34+, please note that you may need to increase the resources allocated to the Karpenter controller pods. +* To support Disruption Budgets, v0.34+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter v0.34+, please note that you may need to increase the resources allocated to the Karpenter controller pods. * Karpenter now adds a default `podSecurityContext` that configures the `fsgroup: 65536` of volumes in the pod. If you are using sidecar containers, you should review if this configuration is compatible for them. You can disable this default `podSecurityContext` through helm by performing `--set podSecurityContext=null` when installing/upgrading the chart. * The `dnsPolicy` for the Karpenter controller pod has been changed back to the Kubernetes cluster default of `ClusterFirst`. Setting our `dnsPolicy` to `Default` (confusingly, this is not the Kubernetes cluster default) caused more confusion for any users running IPv6 clusters with dual-stack nodes or anyone running Karpenter with dependencies on cluster services (like clusters running service meshes). This change may be breaking for any users on Fargate or MNG who were allowing Karpenter to manage their in-cluster DNS service (`core-dns` on most clusters). If you still want the old behavior here, you can change the `dnsPolicy` to point to use `Default` by setting the helm value on install/upgrade with `--set dnsPolicy=Default`. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). * Karpenter now disallows `nodepool.spec.template.spec.resources` to be set. The webhook validation never allowed `nodepool.spec.template.spec.resources`. We are now ensuring that CEL validation also disallows `nodepool.spec.template.spec.resources` to be set. If you were previously setting the resources field on your NodePool, ensure that you remove this field before upgrading to the newest version of Karpenter or else updates to the resource may fail on the new version. @@ -83,6 +83,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to v0.32.0, note that v0.31.4 is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. 
* Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. diff --git a/website/content/en/v0.35/concepts/disruption.md b/website/content/en/v0.35/concepts/disruption.md index 2799846b1e8f..44c746da121e 100644 --- a/website/content/en/v0.35/concepts/disruption.md +++ b/website/content/en/v0.35/concepts/disruption.md @@ -186,7 +186,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/v0.35/concepts/nodepools.md b/website/content/en/v0.35/concepts/nodepools.md index d679bb5f4373..9a9b9a06d57a 100644 --- a/website/content/en/v0.35/concepts/nodepools.md +++ b/website/content/en/v0.35/concepts/nodepools.md @@ -157,7 +157,7 @@ spec: duration: 8h nodes: "0" - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. limits: cpu: "1000" diff --git a/website/content/en/v0.35/concepts/scheduling.md b/website/content/en/v0.35/concepts/scheduling.md index ef12a29cc347..a121b1e7c9c8 100755 --- a/website/content/en/v0.35/concepts/scheduling.md +++ b/website/content/en/v0.35/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. {{% /alert %}} -Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`. - Here is an example of a pod-eni resource defined in a deployment manifest: ``` spec: @@ -625,19 +623,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values The `Exists` operator can be used on a NodePool to provide workload segregation across nodes. 
```yaml
-...
-requirements:
-- key: company.com/team
-  operator: Exists
+apiVersion: karpenter.sh/v1beta1
+kind: NodePool
+spec:
+  template:
+    spec:
+      requirements:
+      - key: company.com/team
+        operator: Exists
...
```

-With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements.
+
+If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset.
+
+For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes.
+
+#### Team A Deployment

```yaml
-nodeSelector:
-  company.com/team: team-a
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-a-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-a
```
+
+#### Team A Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-a
+```
+
+#### Team B Deployment
+
+```yaml
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: team-b-deployment
+spec:
+  replicas: 5
+  template:
+    spec:
+      nodeSelector:
+        company.com/team: team-b
+```
+
+#### Team B Node
+
+```yaml
+apiVersion: v1
+kind: Node
+metadata:
+  labels:
+    company.com/team: team-b
+```
+
{{% alert title="Note" color="primary" %}}
If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node.
{{% /alert %}}
diff --git a/website/content/en/v0.35/faq.md b/website/content/en/v0.35/faq.md
index a825e34056f6..e25969f94526 100644
--- a/website/content/en/v0.35/faq.md
+++ b/website/content/en/v0.35/faq.md
@@ -17,7 +17,7 @@ AWS is the first cloud provider supported by Karpenter, although it is designed
Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.35.4/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too.

### What operating system nodes does Karpenter deploy?
-Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). 
+Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}).

### Can I provide my own custom operating system images?
Karpenter has multiple mechanisms for configuring the [operating system]({{< ref "./concepts/nodeclasses/#specamiselectorterms" >}}) for your nodes.
@@ -231,7 +231,7 @@ Karpenter's native interruption handling offers two main benefits over the stand
1. You don't have to manage and maintain a separate component to exclusively handle interruption events.
2.
Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa. -### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`? +### Why am I receiving QueueNotFound errors when I set `--interruption-queue`? Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}). diff --git a/website/content/en/v0.35/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh b/website/content/en/v0.35/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh index 47df188dc87d..139bbbd1cd02 100644 --- a/website/content/en/v0.35/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh +++ b/website/content/en/v0.35/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh @@ -1,6 +1,6 @@ for NODEGROUP in $(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query 'nodegroups' --output text); do aws ec2 create-tags \ --tags "Key=karpenter.sh/discovery,Value=${CLUSTER_NAME}" \ - --resources "$(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ - --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text )" -done + --resources $(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ + --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text ) +done \ No newline at end of file diff --git a/website/content/en/v0.35/troubleshooting.md b/website/content/en/v0.35/troubleshooting.md index 6dc784007b75..0e6c0d3114b9 100644 --- a/website/content/en/v0.35/troubleshooting.md +++ b/website/content/en/v0.35/troubleshooting.md @@ -663,7 +663,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. ### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. 
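As a companion to the pod-eni troubleshooting entry above, here is a minimal sketch of the VPC CNI change it refers to. It assumes `kubectl` access to the cluster and a VPC CNI version that supports the `ENABLE_POD_ENI` flag from the linked security-groups-for-pods guide; treat it as illustrative rather than part of the documented upgrade steps.

```bash
# Sketch only: enable Pod ENI support in the AWS VPC CNI so that the
# vpc.amazonaws.com/pod-eni resource can register on nodes.
kubectl set env daemonset aws-node -n kube-system ENABLE_POD_ENI=true

# After trunk ENIs attach, the resource should show up in node allocatable.
kubectl get nodes -o yaml | grep -c "vpc.amazonaws.com/pod-eni"
```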
diff --git a/website/content/en/v0.35/upgrading/upgrade-guide.md b/website/content/en/v0.35/upgrading/upgrade-guide.md index 42acba982977..46739a103c1c 100644 --- a/website/content/en/v0.35/upgrading/upgrade-guide.md +++ b/website/content/en/v0.35/upgrading/upgrade-guide.md @@ -60,7 +60,7 @@ The Ubuntu EKS optimized AMI has moved from 20.04 to 22.04 for Kubernetes 1.29+. * `Empty Expiration / Empty Drift / Empty Consolidation`: infinite parallelism * `Non-Empty Expiration / Non-Empty Drift / Single-Node Consolidation`: one node at a time * `Multi-Node Consolidation`: max 100 nodes -* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. +* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. * Karpenter now adds a default `podSecurityContext` that configures the `fsgroup: 65536` of volumes in the pod. If you are using sidecar containers, you should review if this configuration is compatible for them. You can disable this default `podSecurityContext` through helm by performing `--set podSecurityContext=null` when installing/upgrading the chart. * The `dnsPolicy` for the Karpenter controller pod has been changed back to the Kubernetes cluster default of `ClusterFirst`. Setting our `dnsPolicy` to `Default` (confusingly, this is not the Kubernetes cluster default) caused more confusion for any users running IPv6 clusters with dual-stack nodes or anyone running Karpenter with dependencies on cluster services (like clusters running service meshes). This change may be breaking for any users on Fargate or MNG who were allowing Karpenter to manage their in-cluster DNS service (`core-dns` on most clusters). If you still want the old behavior here, you can change the `dnsPolicy` to point to use `Default` by setting the helm value on install/upgrade with `--set dnsPolicy=Default`. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). * Karpenter now disallows `nodepool.spec.template.spec.resources` to be set. The webhook validation never allowed `nodepool.spec.template.spec.resources`. We are now ensuring that CEL validation also disallows `nodepool.spec.template.spec.resources` to be set. If you were previously setting the resources field on your NodePool, ensure that you remove this field before upgrading to the newest version of Karpenter or else updates to the resource may fail on the new version. 
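For the `podSecurityContext` and `dnsPolicy` notes above, a hedged sketch of passing those overrides at install/upgrade time. The chart location and the `KARPENTER_NAMESPACE`/`KARPENTER_VERSION` variables follow the getting-started conventions used elsewhere in these docs; adjust them to your environment.

```bash
# Illustrative only: disable the default podSecurityContext and switch the
# controller dnsPolicy back to Default, as described in the upgrade notes above.
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --namespace "${KARPENTER_NAMESPACE}" --version "${KARPENTER_VERSION}" \
  --set podSecurityContext=null \
  --set dnsPolicy=Default \
  --wait
```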
@@ -91,6 +91,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to `0.32.0`, note that `0.31.4` is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. * Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. diff --git a/website/content/en/v0.36/concepts/disruption.md b/website/content/en/v0.36/concepts/disruption.md index d92d1c68fc2c..aa10df2124f3 100644 --- a/website/content/en/v0.36/concepts/disruption.md +++ b/website/content/en/v0.36/concepts/disruption.md @@ -190,7 +190,7 @@ If you require handling for Spot Rebalance Recommendations, you can use the [AWS Karpenter enables this feature by watching an SQS queue which receives critical events from AWS services which may affect your nodes. Karpenter requires that an SQS queue be provisioned and EventBridge rules and targets be added that forward interruption events from AWS services to the SQS queue. Karpenter provides details for provisioning this infrastructure in the [CloudFormation template in the Getting Started Guide](../../getting-started/getting-started-with-karpenter/#create-the-karpenter-infrastructure-and-iam-roles). -To enable interruption handling, configure the `--interruption-queue-name` CLI argument with the name of the interruption queue provisioned to handle interruption events. +To enable interruption handling, configure the `--interruption-queue` CLI argument with the name of the interruption queue provisioned to handle interruption events. ## Controls diff --git a/website/content/en/v0.36/concepts/nodepools.md b/website/content/en/v0.36/concepts/nodepools.md index d679bb5f4373..9a9b9a06d57a 100644 --- a/website/content/en/v0.36/concepts/nodepools.md +++ b/website/content/en/v0.36/concepts/nodepools.md @@ -157,7 +157,7 @@ spec: duration: 8h nodes: "0" - # Resource limits constrain the total size of the cluster. + # Resource limits constrain the total size of the pool. # Limits prevent Karpenter from creating new instances once the limit is exceeded. 
limits: cpu: "1000" diff --git a/website/content/en/v0.36/concepts/scheduling.md b/website/content/en/v0.36/concepts/scheduling.md index 76c09cbc5124..696b2bb4afc6 100755 --- a/website/content/en/v0.36/concepts/scheduling.md +++ b/website/content/en/v0.36/concepts/scheduling.md @@ -104,8 +104,6 @@ Refer to general [Kubernetes GPU](https://kubernetes.io/docs/tasks/manage-gpus/s You must enable Pod ENI support in the AWS VPC CNI Plugin before enabling Pod ENI support in Karpenter. Please refer to the [Security Groups for Pods documentation](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) for instructions. {{% /alert %}} -Now that Pod ENI support is enabled in the AWS VPC CNI Plugin, you can enable Pod ENI support in Karpenter by setting the `settings.aws.enablePodENI` Helm chart value to `true`. - Here is an example of a pod-eni resource defined in a deployment manifest: ``` spec: @@ -626,19 +624,73 @@ If using Gt/Lt operators, make sure to use values under the actual label values The `Exists` operator can be used on a NodePool to provide workload segregation across nodes. ```yaml -... -requirements: -- key: company.com/team - operator: Exists +apiVersion: karpenter.sh/v1beta1 +kind: NodePool +spec: + template: + spec: + requirements: + - key: company.com/team + operator: Exists ... ``` -With the requirement on the NodePool, workloads can optionally specify a custom value as a required node affinity or node selector. Karpenter will then label the nodes it launches for these pods which prevents `kube-scheduler` from scheduling conflicting pods to those nodes. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset. +With this requirement on the NodePool, workloads can specify the same key (e.g. `company.com/team`) with custom values (e.g. `team-a`, `team-b`, etc.) as a required `nodeAffinity` or `nodeSelector`. Karpenter will then apply the key/value pair to nodes it launches dynamically based on the pod's node requirements. + +If each set of pods that can schedule with this NodePool specifies this key in its `nodeAffinity` or `nodeSelector`, you can isolate pods onto different nodes based on their values. This provides a way to more dynamically isolate workloads without requiring a unique NodePool for each workload subset. + +For example, providing the following `nodeSelectors` would isolate the pods for each of these deployments on different nodes. + +#### Team A Deployment ```yaml -nodeSelector: - company.com/team: team-a +apiVersion: apps/v1 +kind: Deployment +metadata: + name: team-a-deployment +spec: + replicas: 5 + template: + spec: + nodeSelector: + company.com/team: team-a ``` + +#### Team A Node + +```yaml +apiVersion: v1 +kind: Node +metadata: + labels: + company.com/team: team-a +``` + +#### Team B Deployment + +```yaml +apiVersion: apps/v1 +kind: Deployment +metadata: + name: team-b-deployment +spec: + replicas: 5 + template: + spec: + nodeSelector: + company.com/team: team-b +``` + +#### Team B Node + +```yaml +apiVersion: v1 +kind: Node +metadata: + labels: + company.com/team: team-b +``` + {{% alert title="Note" color="primary" %}} If a workload matches the NodePool but doesn't specify a label, Karpenter will generate a random label for the node.
{{% /alert %}} diff --git a/website/content/en/v0.36/faq.md b/website/content/en/v0.36/faq.md index c765ef174e2d..3c1b1df14dc0 100644 --- a/website/content/en/v0.36/faq.md +++ b/website/content/en/v0.36/faq.md @@ -14,10 +14,10 @@ See [Configuring NodePools]({{< ref "./concepts/#configuring-nodepools" >}}) for AWS is the first cloud provider supported by Karpenter, although it is designed to be used with other cloud providers as well. ### Can I write my own cloud provider for Karpenter? -Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.36.0/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too. +Yes, but there is no documentation yet for it. Start with Karpenter's GitHub [cloudprovider](https://github.com/aws/karpenter-core/tree/v0.36.1/pkg/cloudprovider) documentation to see how the AWS provider is built, but there are other sections of the code that will require changes too. ### What operating system nodes does Karpenter deploy? -Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). +Karpenter uses the OS defined by the [AMI Family in your EC2NodeClass]({{< ref "./concepts/nodeclasses#specamifamily" >}}). ### Can I provide my own custom operating system images? Karpenter has multiple mechanisms for configuring the [operating system]({{< ref "./concepts/nodeclasses/#specamiselectorterms" >}}) for your nodes. @@ -26,7 +26,7 @@ Karpenter has multiple mechanisms for configuring the [operating system]({{< ref Karpenter is flexible to multi-architecture configurations using [well known labels]({{< ref "./concepts/scheduling/#supported-labels">}}). ### What RBAC access is required? -All the required RBAC rules can be found in the Helm chart template. See [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole-core.yaml), [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole.yaml), [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/rolebinding.yaml), and [role.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/role.yaml) files for details. +All the required RBAC rules can be found in the Helm chart template. See [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole-core.yaml), [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole.yaml), [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/rolebinding.yaml), and [role.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/role.yaml) files for details. ### Can I run Karpenter outside of a Kubernetes cluster? Yes, as long as the controller has network and IAM/RBAC access to the Kubernetes API and your provider API. @@ -231,7 +231,7 @@ Karpenter's native interruption handling offers two main benefits over the stand 1. You don't have to manage and maintain a separate component to exclusively handle interruption events. 2. Karpenter's native interruption handling coordinates with other deprovisioning so that consolidation, expiration, etc. can be aware of interruption events and vice-versa. -### Why am I receiving QueueNotFound errors when I set `--interruption-queue-name`? 
+### Why am I receiving QueueNotFound errors when I set `--interruption-queue`? Karpenter requires a queue to exist that receives event messages from EC2 and health services in order to handle interruption messages properly for nodes. Details on the types of events that Karpenter handles can be found in the [Interruption Handling Docs]({{< ref "./concepts/disruption/#interruption" >}}). diff --git a/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md b/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md index 89242a872341..9b5891b1658b 100644 --- a/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md +++ b/website/content/en/v0.36/getting-started/getting-started-with-karpenter/_index.md @@ -45,7 +45,7 @@ After setting up the tools, set the Karpenter and Kubernetes version: ```bash export KARPENTER_NAMESPACE="kube-system" -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" export K8S_VERSION="1.29" ``` @@ -109,13 +109,13 @@ See [Enabling Windows support](https://docs.aws.amazon.com/eks/latest/userguide/ As the OCI Helm chart is signed by [Cosign](https://github.com/sigstore/cosign) as part of the release process you can verify the chart before installing it by running the following command. ```bash -cosign verify public.ecr.aws/karpenter/karpenter:0.36.0 \ +cosign verify public.ecr.aws/karpenter/karpenter:0.36.1 \ --certificate-oidc-issuer=https://token.actions.githubusercontent.com \ --certificate-identity-regexp='https://github\.com/aws/karpenter-provider-aws/\.github/workflows/release\.yaml@.+' \ --certificate-github-workflow-repository=aws/karpenter-provider-aws \ --certificate-github-workflow-name=Release \ - --certificate-github-workflow-ref=refs/tags/v0.36.0 \ - --annotations version=0.36.0 + --certificate-github-workflow-ref=refs/tags/v0.36.1 \ + --annotations version=0.36.1 ``` {{% alert title="DNS Policy Notice" color="warning" %}} diff --git a/website/content/en/v0.36/getting-started/migrating-from-cas/_index.md b/website/content/en/v0.36/getting-started/migrating-from-cas/_index.md index f71116389411..8a053ecb51aa 100644 --- a/website/content/en/v0.36/getting-started/migrating-from-cas/_index.md +++ b/website/content/en/v0.36/getting-started/migrating-from-cas/_index.md @@ -92,7 +92,7 @@ One for your Karpenter node role and one for your existing node group. First set the Karpenter release you want to deploy. ```bash -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" ``` We can now generate a full Karpenter deployment yaml from the Helm chart. @@ -133,7 +133,7 @@ Now that our deployment is ready we can create the karpenter namespace, create t ## Create default NodePool -We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v0.36.0/examples/v1beta1) for specific needs. +We need to create a default NodePool so Karpenter knows what types of nodes we want for unscheduled workloads. You can refer to some of the [example NodePool](https://github.com/aws/karpenter/tree/v0.36.1/examples/v1beta1) for specific needs. 
{{% script file="./content/en/{VERSION}/getting-started/migrating-from-cas/scripts/step10-create-nodepool.sh" language="bash" %}} diff --git a/website/content/en/v0.36/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh b/website/content/en/v0.36/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh index 47df188dc87d..139bbbd1cd02 100644 --- a/website/content/en/v0.36/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh +++ b/website/content/en/v0.36/getting-started/migrating-from-cas/scripts/step05-tag-subnets.sh @@ -1,6 +1,6 @@ for NODEGROUP in $(aws eks list-nodegroups --cluster-name "${CLUSTER_NAME}" --query 'nodegroups' --output text); do aws ec2 create-tags \ --tags "Key=karpenter.sh/discovery,Value=${CLUSTER_NAME}" \ - --resources "$(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ - --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text )" -done + --resources $(aws eks describe-nodegroup --cluster-name "${CLUSTER_NAME}" \ + --nodegroup-name "${NODEGROUP}" --query 'nodegroup.subnets' --output text ) +done \ No newline at end of file diff --git a/website/content/en/v0.36/reference/cloudformation.md b/website/content/en/v0.36/reference/cloudformation.md index 1e3993b0b1bf..cdd34f44f47f 100644 --- a/website/content/en/v0.36/reference/cloudformation.md +++ b/website/content/en/v0.36/reference/cloudformation.md @@ -17,7 +17,7 @@ These descriptions should allow you to understand: To download a particular version of `cloudformation.yaml`, set the version and use `curl` to pull the file to your local system: ```bash -export KARPENTER_VERSION="0.36.0" +export KARPENTER_VERSION="0.36.1" curl https://raw.githubusercontent.com/aws/karpenter-provider-aws/v"${KARPENTER_VERSION}"/website/content/en/preview/getting-started/getting-started-with-karpenter/cloudformation.yaml > cloudformation.yaml ``` diff --git a/website/content/en/v0.36/reference/threat-model.md b/website/content/en/v0.36/reference/threat-model.md index 71f8beaf3532..9f6cf6fe9c23 100644 --- a/website/content/en/v0.36/reference/threat-model.md +++ b/website/content/en/v0.36/reference/threat-model.md @@ -31,11 +31,11 @@ A Cluster Developer has the ability to create pods via `Deployments`, `ReplicaSe Karpenter has permissions to create and manage cloud instances. Karpenter has Kubernetes API permissions to create, update, and remove nodes, as well as evict pods. For a full list of the permissions, see the RBAC rules in the helm chart template. Karpenter also has AWS IAM permissions to create instances with IAM roles. 
-* [aggregate-clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/aggregate-clusterrole.yaml) -* [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole-core.yaml) -* [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/clusterrole.yaml) -* [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/rolebinding.yaml) -* [role.yaml](https://github.com/aws/karpenter/blob/v0.36.0/charts/karpenter/templates/role.yaml) +* [aggregate-clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/aggregate-clusterrole.yaml) +* [clusterrole-core.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole-core.yaml) +* [clusterrole.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/clusterrole.yaml) +* [rolebinding.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/rolebinding.yaml) +* [role.yaml](https://github.com/aws/karpenter/blob/v0.36.1/charts/karpenter/templates/role.yaml) ## Assumptions diff --git a/website/content/en/v0.36/troubleshooting.md b/website/content/en/v0.36/troubleshooting.md index 6dc784007b75..0e6c0d3114b9 100644 --- a/website/content/en/v0.36/troubleshooting.md +++ b/website/content/en/v0.36/troubleshooting.md @@ -663,7 +663,7 @@ This typically occurs when the node has not been considered fully initialized fo ### Log message of `inflight check failed for node, Expected resource "vpc.amazonaws.com/pod-eni" didn't register on the node` is reported -This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. If you've enabled Pod ENI for Karpenter nodes via the `aws.enablePodENI` setting, you will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. +This error indicates that the `vpc.amazonaws.com/pod-eni` resource was never reported on the node. You will need to make the corresponding change to the VPC CNI to enable [security groups for pods](https://docs.aws.amazon.com/eks/latest/userguide/security-groups-for-pods.html) which will cause the resource to be registered. ### AWS Node Termination Handler (NTH) interactions Karpenter [doesn't currently support draining and terminating on spot rebalance recommendations]({{< ref "concepts/disruption#interruption" >}}). Users who want support for both drain and terminate on spot interruption as well as drain and termination on spot rebalance recommendations may install Node Termination Handler (NTH) on their clusters to support this behavior. 
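Related to the `QueueNotFound` FAQ entry and the `--interruption-queue` flag discussed above, the following sketch checks that the interruption queue exists and is receiving events. It assumes the queue shares the cluster's name, which is only a convention; substitute the queue name you actually pass to Karpenter.

```bash
# Sketch only: confirm the SQS queue passed via --interruption-queue exists.
QUEUE_URL="$(aws sqs get-queue-url --queue-name "${CLUSTER_NAME}" --query 'QueueUrl' --output text)"
echo "${QUEUE_URL}"

# A non-zero count suggests the EventBridge rules are forwarding events to it.
aws sqs get-queue-attributes --queue-url "${QUEUE_URL}" \
  --attribute-names ApproximateNumberOfMessages
```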
diff --git a/website/content/en/v0.36/upgrading/upgrade-guide.md b/website/content/en/v0.36/upgrading/upgrade-guide.md index c7e84b894521..2ffd380031c8 100644 --- a/website/content/en/v0.36/upgrading/upgrade-guide.md +++ b/website/content/en/v0.36/upgrading/upgrade-guide.md @@ -28,23 +28,23 @@ If you get the error `invalid ownership metadata; label validation error:` while In general, you can reapply the CRDs in the `crds` directory of the Karpenter Helm chart: ```shell -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.sh_nodepools.yaml -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.sh_nodeclaims.yaml -kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.0/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.sh_nodepools.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +kubectl apply -f https://raw.githubusercontent.com/aws/karpenter/v0.36.1/pkg/apis/crds/karpenter.k8s.aws_ec2nodeclasses.yaml ``` -### Upgrading to `0.36.0`+ +### Upgrading to `0.36.1`+ {{% alert title="Warning" color="warning" %}} -`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.0`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. +`0.33.0`+ _only_ supports Karpenter v1beta1 APIs and will not work with existing Provisioner, AWSNodeTemplate or Machine alpha APIs. Do not upgrade to `0.36.1`+ without first [upgrading to `0.32.x`]({{}}). This version supports both the alpha and beta APIs, allowing you to migrate all of your existing APIs to beta APIs without experiencing downtime. {{% /alert %}} {{% alert title="Warning" color="warning" %}} - v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.0, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. + v0.36.x introduces update to drift that restricts rollback. When rolling back from >=v0.36.1, note that v0.32.9+, v0.33.4+, v0.34.5+, v0.35.4+ are the patch versions that support rollback. If Karpenter is rolled back to an older patch version, Karpenter can potentially drift all the nodes in the cluster. {{% /alert %}} * Karpenter changed the name of the `karpenter_cloudprovider_instance_type_price_estimate` metric to `karpenter_cloudprovider_instance_type_offering_price_estimate` to align with the new `karpenter_cloudprovider_instance_type_offering_available` metric. The `region` label was also dropped from the metric, since this can be inferred from the environment that Karpenter is running in. @@ -72,7 +72,7 @@ The Ubuntu EKS optimized AMI has moved from 20.04 to 22.04 for Kubernetes 1.29+. 
* `Empty Expiration / Empty Drift / Empty Consolidation`: infinite parallelism * `Non-Empty Expiration / Non-Empty Drift / Single-Node Consolidation`: one node at a time * `Multi-Node Consolidation`: max 100 nodes -* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. +* To support Disruption Budgets, `0.34.0`+ includes critical changes to Karpenter's core controllers, which allows Karpenter to consider multiple batches of disrupting nodes simultaneously. This increases Karpenter's performance with the potential downside of higher CPU and memory utilization from the Karpenter pod. While the magnitude of this difference varies on a case-by-case basis, when upgrading to Karpenter `0.34.0`+, please note that you may need to increase the resources allocated to the Karpenter controller pods. * Karpenter now adds a default `podSecurityContext` that configures the `fsgroup: 65536` of volumes in the pod. If you are using sidecar containers, you should review if this configuration is compatible for them. You can disable this default `podSecurityContext` through helm by performing `--set podSecurityContext=null` when installing/upgrading the chart. * The `dnsPolicy` for the Karpenter controller pod has been changed back to the Kubernetes cluster default of `ClusterFirst`. Setting our `dnsPolicy` to `Default` (confusingly, this is not the Kubernetes cluster default) caused more confusion for any users running IPv6 clusters with dual-stack nodes or anyone running Karpenter with dependencies on cluster services (like clusters running service meshes). This change may be breaking for any users on Fargate or MNG who were allowing Karpenter to manage their in-cluster DNS service (`core-dns` on most clusters). If you still want the old behavior here, you can change the `dnsPolicy` to point to use `Default` by setting the helm value on install/upgrade with `--set dnsPolicy=Default`. More details on this issue can be found in the following Github issues: [#2186](https://github.com/aws/karpenter-provider-aws/issues/2186) and [#4947](https://github.com/aws/karpenter-provider-aws/issues/4947). * Karpenter now disallows `nodepool.spec.template.spec.resources` to be set. The webhook validation never allowed `nodepool.spec.template.spec.resources`. We are now ensuring that CEL validation also disallows `nodepool.spec.template.spec.resources` to be set. If you were previously setting the resources field on your NodePool, ensure that you remove this field before upgrading to the newest version of Karpenter or else updates to the resource may fail on the new version. @@ -103,6 +103,7 @@ This version includes **dual support** for both alpha and beta APIs to ensure th Note that if you are rolling back after upgrading to `0.32.0`, note that `0.31.4` is the only version that supports handling rollback after you have deployed the v1beta1 APIs to your cluster. {{% /alert %}} +* Karpenter now uses `settings.InterruptionQueue` instead of `settings.aws.InterruptionQueueName` in its helm chart. The CLI argument also changed to `--interruption-queue`. 
* Karpenter now serves the webhook prometheus metrics server on port `8001`. If this port is already in-use on the pod or you are running in `hostNetworking` mode, you may need to change this port value. You can configure this port value through the `WEBHOOK_METRICS_PORT` environment variable or the `webhook.metrics.port` value if installing via Helm. * Karpenter now exposes the ability to disable webhooks through the `webhook.enabled=false` value. This value will disable the webhook server and will prevent any permissions, mutating or validating webhook configurations from being deployed to the cluster. * Karpenter now moves all logging configuration for the Zap logger into the `logConfig` values block. Configuring Karpenter logging with this mechanism _is_ deprecated and will be dropped at v1. Karpenter now only surfaces logLevel through the `logLevel` helm value. If you need more advanced configuration due to log parsing constraints, we recommend configuring your log parser to handle Karpenter's Zap JSON logging. diff --git a/website/hugo.yaml b/website/hugo.yaml index a58124bf6053..93457276c925 100644 --- a/website/hugo.yaml +++ b/website/hugo.yaml @@ -76,7 +76,7 @@ params: url: "https://slack.k8s.io/" icon: fab fa-slack desc: "Chat with us on Slack in the #aws-provider channel" - latest_release_version: 0.36.0 + latest_release_version: 0.36.1 latest_k8s_version: 1.29 versions: - v0.36
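Finally, for the webhook and logging values called out in the upgrade notes above (`webhook.enabled`, `webhook.metrics.port`, `logLevel`), a hedged example of setting them through Helm; the values shown are purely illustrative.

```bash
# Illustrative only: disable the webhook server and raise the log level,
# per the upgrade notes above. Use webhook.metrics.port instead if you only
# need to move the webhook metrics port off its default.
helm upgrade --install karpenter oci://public.ecr.aws/karpenter/karpenter \
  --namespace "${KARPENTER_NAMESPACE}" --version "${KARPENTER_VERSION}" \
  --set webhook.enabled=false \
  --set logLevel=debug
```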