diff --git a/Makefile b/Makefile
index c12dc1b93e6a..c4400d87b8d3 100644
--- a/Makefile
+++ b/Makefile
@@ -49,6 +49,7 @@ run: ## Run Karpenter controller binary against your local cluster
 		CLUSTER_NAME=${CLUSTER_NAME} \
 		INTERRUPTION_QUEUE=${CLUSTER_NAME} \
 		FEATURE_GATES="SpotToSpotConsolidation=true" \
+		LOG_LEVEL="debug" \
 		go run ./cmd/controller/main.go
 
 test: ## Run tests
diff --git a/charts/karpenter/templates/service.yaml b/charts/karpenter/templates/service.yaml
index d05fb8eff851..694265c89ff6 100644
--- a/charts/karpenter/templates/service.yaml
+++ b/charts/karpenter/templates/service.yaml
@@ -5,10 +5,15 @@ metadata:
   namespace: {{ .Release.Namespace }}
   labels:
     {{- include "karpenter.labels" . | nindent 4 }}
-  {{- with .Values.additionalAnnotations }}
+  {{- if or .Values.additionalAnnotations .Values.service.annotations }}
   annotations:
+    {{- with .Values.additionalAnnotations }}
     {{- toYaml . | nindent 4 }}
+    {{- end }}
+    {{- with .Values.service.annotations }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
   {{- end }}
 spec:
   type: ClusterIP
   ports:
diff --git a/charts/karpenter/values.yaml b/charts/karpenter/values.yaml
index 29928fa72153..c865a2e9bf13 100644
--- a/charts/karpenter/values.yaml
+++ b/charts/karpenter/values.yaml
@@ -12,6 +12,9 @@ additionalAnnotations: {}
 imagePullPolicy: IfNotPresent
 # -- Image pull secrets for Docker images.
 imagePullSecrets: []
+service:
+  # -- Additional annotations for the Service.
+  annotations: {}
 serviceAccount:
   # -- Specifies if a ServiceAccount should be created.
   create: true
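With the chart change above, annotations can be set on the karpenter Service directly from Helm values. A minimal sketch of a values override (the annotation key and value below are purely illustrative, and the file would be passed to helm install/upgrade with -f):

service:
  annotations:
    # illustrative annotation; use whatever your tooling expects here
    example.com/owner: "platform-team"

Chart-wide additionalAnnotations still apply to the Service as before; the template now renders both sources under the same annotations block.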
diff --git a/examples/v1/max-node-lifetime.yaml b/examples/v1/max-node-lifetime.yaml
index f10518c42e38..1a4436db8271 100644
--- a/examples/v1/max-node-lifetime.yaml
+++ b/examples/v1/max-node-lifetime.yaml
@@ -32,7 +32,7 @@ spec:
         kind: EC2NodeClass
         name: default
       expireAfter: 168h # expire nodes after 7 days = 7 * 24h
-      terminationGracePeirod: 24h # grace period after 1 day = 7 * 24h, for a max node lifetime of 8 days
+      terminationGracePeriod: 24h # grace period of 1 day = 1 * 24h, for a max node lifetime of 8 days
   disruption:
     consolidationPolicy: WhenEmpty
     consolidateAfter: 60s # scale down nodes after 60 seconds without workloads (excluding daemons)
diff --git a/go.mod b/go.mod
index 12e4a7d937c6..da3a688bc351 100644
--- a/go.mod
+++ b/go.mod
@@ -8,9 +8,9 @@ require (
 	github.com/avast/retry-go v3.0.0+incompatible
 	github.com/aws/aws-sdk-go v1.55.5
 	github.com/aws/aws-sdk-go-v2 v1.32.2
-	github.com/aws/aws-sdk-go-v2/config v1.27.43
+	github.com/aws/aws-sdk-go-v2/config v1.28.0
 	github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.17
-	github.com/aws/aws-sdk-go-v2/service/ec2 v1.182.0
+	github.com/aws/aws-sdk-go-v2/service/ec2 v1.183.0
 	github.com/aws/aws-sdk-go-v2/service/iam v1.37.2
 	github.com/aws/aws-sdk-go-v2/service/sts v1.32.2
 	github.com/aws/karpenter-provider-aws/tools/kompat v0.0.0-20240410220356-6b868db24881
diff --git a/go.sum b/go.sum
index d374e4f47acc..5cc1abdef1eb 100644
--- a/go.sum
+++ b/go.sum
@@ -12,8 +12,8 @@ github.com/aws/aws-sdk-go v1.55.5 h1:KKUZBfBoyqy5d3swXyiC7Q76ic40rYcbqH7qjh59kzU
 github.com/aws/aws-sdk-go v1.55.5/go.mod h1:eRwEWoyTWFMVYVQzKMNHWP5/RV4xIUGMQfXQHfHkpNU=
 github.com/aws/aws-sdk-go-v2 v1.32.2 h1:AkNLZEyYMLnx/Q/mSKkcMqwNFXMAvFto9bNsHqcTduI=
 github.com/aws/aws-sdk-go-v2 v1.32.2/go.mod h1:2SK5n0a2karNTv5tbP1SjsX0uhttou00v/HpXKM1ZUo=
-github.com/aws/aws-sdk-go-v2/config v1.27.43 h1:p33fDDihFC390dhhuv8nOmX419wjOSDQRb+USt20RrU=
-github.com/aws/aws-sdk-go-v2/config v1.27.43/go.mod h1:pYhbtvg1siOOg8h5an77rXle9tVG8T+BWLWAo7cOukc=
+github.com/aws/aws-sdk-go-v2/config v1.28.0 h1:FosVYWcqEtWNxHn8gB/Vs6jOlNwSoyOCA/g/sxyySOQ=
+github.com/aws/aws-sdk-go-v2/config v1.28.0/go.mod h1:pYhbtvg1siOOg8h5an77rXle9tVG8T+BWLWAo7cOukc=
 github.com/aws/aws-sdk-go-v2/credentials v1.17.41 h1:7gXo+Axmp+R4Z+AK8YFQO0ZV3L0gizGINCOWxSLY9W8=
 github.com/aws/aws-sdk-go-v2/credentials v1.17.41/go.mod h1:u4Eb8d3394YLubphT4jLEwN1rLNq2wFOlT6OuxFwPzU=
 github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.17 h1:TMH3f/SCAWdNtXXVPPu5D6wrr4G5hI1rAxbcocKfC7Q=
@@ -24,8 +24,8 @@ github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21 h1:6jZVETqmYCadGFvrYE
 github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.21/go.mod h1:1SR0GbLlnN3QUmYaflZNiH1ql+1qrSiB2vwcJ+4UM60=
 github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1 h1:VaRN3TlFdd6KxX1x3ILT5ynH6HvKgqdiXoTxAF4HQcQ=
 github.com/aws/aws-sdk-go-v2/internal/ini v1.8.1/go.mod h1:FbtygfRFze9usAadmnGJNc8KsP346kEe+y2/oyhGAGc=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.182.0 h1:LaeziEhHZ/SJZYBK223QVzl3ucHvA9IP4tQMcxGrc9I=
-github.com/aws/aws-sdk-go-v2/service/ec2 v1.182.0/go.mod h1:kYXaB4FzyhEJjvrJ84oPnMElLiEAjGxxUunVW2tBSng=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.183.0 h1:LgwYvo4kycfT/UD7vjQhSVZSatxHAI41/54q9O6jljI=
+github.com/aws/aws-sdk-go-v2/service/ec2 v1.183.0/go.mod h1:kYXaB4FzyhEJjvrJ84oPnMElLiEAjGxxUunVW2tBSng=
 github.com/aws/aws-sdk-go-v2/service/fis v1.30.2 h1:qw7ZkSCy0akQJbJdIgRQaqXEHe7PrA3DHvE4VvemFJw=
 github.com/aws/aws-sdk-go-v2/service/fis v1.30.2/go.mod h1:CArS66NFuL1fBiSLVfWZV6oQjicsdViLm7Ic9Lte7x4=
 github.com/aws/aws-sdk-go-v2/service/iam v1.37.2 h1:E7vCDUFeDN8uOk8Nb2d4E1howWS1TR4HrKABXsvttIs=
diff --git a/pkg/providers/instance/instance.go b/pkg/providers/instance/instance.go
index 54799c00d123..ffdd907e96a7 100644
--- a/pkg/providers/instance/instance.go
+++ b/pkg/providers/instance/instance.go
@@ -97,6 +97,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass
 	}
 	instanceTypes, err := cloudprovider.InstanceTypes(instanceTypes).Truncate(schedulingRequirements, maxInstanceTypes)
 	if err != nil {
+		log.FromContext(ctx).Error(err, "truncating instance types")
 		return nil, fmt.Errorf("truncating instance types, %w", err)
 	}
 	tags := getTags(ctx, nodeClass, nodeClaim)
@@ -107,6 +108,7 @@ func (p *DefaultProvider) Create(ctx context.Context, nodeClass *v1.EC2NodeClass
 		fleetInstance, err = p.launchInstance(ctx, nodeClass, nodeClaim, instanceTypes, tags)
 	}
 	if err != nil {
+		log.FromContext(ctx).Error(err, "launching instance")
 		return nil, err
 	}
 	efaEnabled := lo.Contains(lo.Keys(nodeClaim.Spec.Resources.Requests), v1.ResourceEFA)
diff --git a/test/suites/integration/extended_resources_test.go b/test/suites/integration/extended_resources_test.go
index 48c1ad0116fe..4f47f1b3ec16 100644
--- a/test/suites/integration/extended_resources_test.go
+++ b/test/suites/integration/extended_resources_test.go
@@ -44,8 +44,6 @@ var _ = Describe("Extended Resources", func() {
 	})
 	It("should provision nodes for a deployment that requests nvidia.com/gpu", func() {
 		ExpectNvidiaDevicePluginCreated()
-		// TODO: jmdeal@ remove AL2 pin once AL2023 accelerated AMIs are available
-		nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: "al2@latest"}}
 		numPods := 1
 		dep := test.Deployment(test.DeploymentOptions{
 			Replicas: int32(numPods),
@@ -224,8 +222,6 @@ var _ = Describe("Extended Resources", func() {
 		}
 		// Only select private subnets since instances with multiple network instances at launch won't get a public IP.
 		nodeClass.Spec.SubnetSelectorTerms[0].Tags["Name"] = "*Private*"
-		// TODO: jmdeal@ remove AL2 pin once AL2023 accelerated AMIs are available
-		nodeClass.Spec.AMISelectorTerms = []v1.AMISelectorTerm{{Alias: "al2@latest"}}
 		numPods := 1
 		dep := test.Deployment(test.DeploymentOptions{
diff --git a/website/content/en/docs/troubleshooting.md b/website/content/en/docs/troubleshooting.md
index ee9d4e572672..6819a794bd4c 100644
--- a/website/content/en/docs/troubleshooting.md
+++ b/website/content/en/docs/troubleshooting.md
@@ -202,7 +202,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
diff --git a/website/content/en/preview/troubleshooting.md b/website/content/en/preview/troubleshooting.md
index 89f48766e8d5..7589d4ff503e 100644
--- a/website/content/en/preview/troubleshooting.md
+++ b/website/content/en/preview/troubleshooting.md
@@ -202,7 +202,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
diff --git a/website/content/en/v0.32/troubleshooting.md b/website/content/en/v0.32/troubleshooting.md
index b4744d02da9c..cbdc94dbac41 100644
--- a/website/content/en/v0.32/troubleshooting.md
+++ b/website/content/en/v0.32/troubleshooting.md
@@ -285,7 +285,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
diff --git a/website/content/en/v0.36/troubleshooting.md b/website/content/en/v0.36/troubleshooting.md
index ed3dc4a21c6d..461a01a4e6fa 100644
--- a/website/content/en/v0.36/troubleshooting.md
+++ b/website/content/en/v0.36/troubleshooting.md
@@ -297,7 +297,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
diff --git a/website/content/en/v0.37/troubleshooting.md b/website/content/en/v0.37/troubleshooting.md
index ed3dc4a21c6d..461a01a4e6fa 100644
--- a/website/content/en/v0.37/troubleshooting.md
+++ b/website/content/en/v0.37/troubleshooting.md
@@ -297,7 +297,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
diff --git a/website/content/en/v1.0/troubleshooting.md b/website/content/en/v1.0/troubleshooting.md
index ee9d4e572672..6819a794bd4c 100644
--- a/website/content/en/v1.0/troubleshooting.md
+++ b/website/content/en/v1.0/troubleshooting.md
@@ -202,7 +202,7 @@ Karpenter does not support [in-tree storage plugins](https://kubernetes.io/blog/
 
 #### Pods were scheduled due to a race condition in Kubernetes
 
-Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `toplogySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
+Due to [this race condition in Kubernetes](https://github.com/kubernetes/kubernetes/issues/95911), it's possible that the scheduler and the CSINode can race during node registration such that the scheduler assumes that a node can mount more volumes than the node attachments support. There is currently no universal solve for this problem other than enforcing `topologySpreadConstraints` and `podAntiAffinity` on your workloads that use PVCs such that you attempt to reduce the number of PVCs that schedule to a given node.
 
 The following is a list of known CSI drivers which support a startupTaint to eliminate this issue:
 - [aws-ebs-csi-driver](https://github.com/kubernetes-sigs/aws-ebs-csi-driver/blob/master/docs/install.md#configure-node-startup-taint)
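For the troubleshooting guidance patched above, the suggested mitigation is to spread PVC-backed pods across nodes so no single node is asked to attach more volumes than it supports. A rough sketch of what that could look like with topologySpreadConstraints on a StatefulSet (the workload name, labels, image, and storage size are all illustrative, not taken from Karpenter's docs):

apiVersion: apps/v1
kind: StatefulSet
metadata:
  name: pvc-workload                 # illustrative name
spec:
  serviceName: pvc-workload
  replicas: 3
  selector:
    matchLabels:
      app: pvc-workload
  template:
    metadata:
      labels:
        app: pvc-workload
    spec:
      # Spread pods (and therefore their volume attachments) across nodes so a
      # freshly registered node is not scheduled with more volumes than it can attach.
      topologySpreadConstraints:
        - maxSkew: 1
          topologyKey: kubernetes.io/hostname
          whenUnsatisfiable: DoNotSchedule
          labelSelector:
            matchLabels:
              app: pvc-workload
      containers:
        - name: app
          image: public.ecr.aws/docker/library/busybox:stable   # placeholder image
          command: ["sleep", "infinity"]
          volumeMounts:
            - name: data
              mountPath: /data
  volumeClaimTemplates:
    - metadata:
        name: data
      spec:
        accessModes: ["ReadWriteOnce"]
        resources:
          requests:
            storage: 1Gi

A podAntiAffinity rule keyed on the same label achieves a stricter version of the same spread, at the cost of refusing to co-locate any two replicas.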