From dbe3f323a6e61ec31b9c4bb26e43e314a6221d0d Mon Sep 17 00:00:00 2001 From: Cameron McAvoy Date: Thu, 27 Jun 2024 16:49:55 -0500 Subject: [PATCH] Expose nodeclaim disruption candidates through nodeclaim condition, add eviction message from condition Signed-off-by: Cameron McAvoy --- .../karpenter.kwok.sh_kwoknodeclasses.yaml | 9 +- kwok/charts/crds/karpenter.sh_nodeclaims.yaml | 13 +- kwok/charts/crds/karpenter.sh_nodepools.yaml | 13 +- pkg/apis/crds/karpenter.sh_nodeclaims.yaml | 688 +++++++------- pkg/apis/crds/karpenter.sh_nodepools.yaml | 891 +++++++++--------- pkg/apis/v1/nodeclaim_status.go | 1 + pkg/controllers/disruption/controller.go | 12 + pkg/controllers/disruption/emptiness.go | 4 + pkg/controllers/disruption/eventual.go | 5 + .../disruption/multinodeconsolidation.go | 4 + .../disruption/singlenodeconsolidation.go | 4 + pkg/controllers/disruption/suite_test.go | 18 +- pkg/controllers/disruption/types.go | 1 + .../termination/terminator/events/events.go | 4 +- .../node/termination/terminator/eviction.go | 36 +- pkg/events/suite_test.go | 18 +- pkg/operator/operator.go | 3 + pkg/scheduling/zz_generated.deepcopy.go | 8 +- .../karpenter.test.sh_testnodeclasses.yaml | 9 +- 19 files changed, 902 insertions(+), 839 deletions(-) diff --git a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml index d02b382821..2ed31b19e9 100644 --- a/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml +++ b/kwok/apis/crds/karpenter.kwok.sh_kwoknodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: kwoknodeclasses.karpenter.kwok.sh spec: group: karpenter.kwok.sh @@ -96,7 +96,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml index 8d2c262d1b..af2a4a1fd0 100644 --- a/kwok/charts/crds/karpenter.sh_nodeclaims.yaml +++ b/kwok/charts/crds/karpenter.sh_nodeclaims.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: nodeclaims.karpenter.sh spec: group: karpenter.sh @@ -262,15 +262,19 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -346,7 +350,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/kwok/charts/crds/karpenter.sh_nodepools.yaml b/kwok/charts/crds/karpenter.sh_nodepools.yaml index b261418df8..31a98cb390 100644 --- a/kwok/charts/crds/karpenter.sh_nodepools.yaml +++ b/kwok/charts/crds/karpenter.sh_nodepools.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: nodepools.karpenter.sh spec: group: karpenter.sh @@ -393,15 +393,19 @@ spec: description: |- TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. If left undefined, the controller will wait indefinitely for pods to be drained. pattern: ^([0-9]+(s|m|h))+$ @@ -473,7 +477,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string diff --git a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml index 7ffef2e73a..9e88fd8557 100644 --- a/pkg/apis/crds/karpenter.sh_nodeclaims.yaml +++ b/pkg/apis/crds/karpenter.sh_nodeclaims.yaml @@ -3,378 +3,368 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: nodeclaims.karpenter.sh spec: group: karpenter.sh names: categories: - - karpenter + - karpenter kind: NodeClaim listKind: NodeClaimList plural: nodeclaims singular: nodeclaim scope: Cluster versions: - - additionalPrinterColumns: - - jsonPath: .metadata.labels.node\.kubernetes\.io/instance-type - name: Type - type: string - - jsonPath: .metadata.labels.karpenter\.sh/capacity-type - name: Capacity - type: string - - jsonPath: .metadata.labels.topology\.kubernetes\.io/zone - name: Zone - type: string - - jsonPath: .status.nodeName - name: Node - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].status - name: Ready - type: string - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - - jsonPath: .status.providerID - name: ID - priority: 1 - type: string - - jsonPath: .metadata.labels.karpenter\.sh/nodepool - name: NodePool - priority: 1 - type: string - - jsonPath: .spec.nodeClassRef.name - name: NodeClass - priority: 1 - type: string - name: v1 - schema: - openAPIV3Schema: - description: NodeClaim is the Schema for the NodeClaims API - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: NodeClaimSpec describes the desired state of the NodeClaim - properties: - expireAfter: - default: 720h + - additionalPrinterColumns: + - jsonPath: .metadata.labels.node\.kubernetes\.io/instance-type + name: Type + type: string + - jsonPath: .metadata.labels.karpenter\.sh/capacity-type + name: Capacity + type: string + - jsonPath: .metadata.labels.topology\.kubernetes\.io/zone + name: Zone + type: string + - jsonPath: .status.nodeName + name: Node + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .status.providerID + name: ID + priority: 1 + type: string + - jsonPath: .metadata.labels.karpenter\.sh/nodepool + name: NodePool + priority: 1 + type: string + - jsonPath: .spec.nodeClassRef.name + name: NodeClass + priority: 1 + type: string + name: v1 + schema: + openAPIV3Schema: + description: NodeClaim is the Schema for the NodeClaims API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: NodeClaimSpec describes the desired state of the NodeClaim + properties: + expireAfter: + default: 720h + description: |- + ExpireAfter is the duration the controller will wait + before terminating a node, measured from when the node is created. This + is useful to implement features like eventually consistent node upgrade, + memory leak protection, and disruption testing. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + nodeClassRef: + description: NodeClassRef is a reference to an object that defines + provider specific configuration + properties: + group: + description: API version of the referent + pattern: ^[^/]*$ + type: string + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + type: string + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + type: string + required: + - group + - kind + - name + type: object + requirements: + description: Requirements are layered with GetLabels and applied to + every node. + items: description: |- - ExpireAfter is the duration the controller will wait - before terminating a node, measured from when the node is created. This - is useful to implement features like eventually consistent node upgrade, - memory leak protection, and disruption testing. - pattern: ^(([0-9]+(s|m|h))+)|(Never)$ - type: string - nodeClassRef: - description: NodeClassRef is a reference to an object that defines provider specific configuration + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. properties: - group: - description: API version of the referent - pattern: ^[^/]*$ + key: + description: The label key that the selector applies to. type: string - kind: - description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' - type: string - name: - description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + minValues: + description: |- + This field is ALPHA and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic required: - - group - - kind - - name + - key + - operator type: object - requirements: - description: Requirements are layered with GetLabels and applied to every node. - items: - description: |- - A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values - and minValues that represent the requirement to have at least that many values. - properties: - key: - description: The label key that the selector applies to. - type: string - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - x-kubernetes-validations: - - message: label domain "kubernetes.io" is restricted - rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") - - message: label domain "k8s.io" is restricted - rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") - - message: label domain "karpenter.sh" is restricted - rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") - - message: label "kubernetes.io/hostname" is restricted - rule: self != "kubernetes.io/hostname" - minValues: - description: |- - This field is ALPHA and can be dropped or replaced at any time - MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. - maximum: 50 - minimum: 1 - type: integer - operator: - description: |- - Represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - enum: - - In - - NotIn - - Exists - - DoesNotExist - - Gt - - Lt - values: - description: |- - An array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. If the operator is Gt or Lt, the values - array must have a single element, which will be interpreted as an integer. - This array is replaced during a strategic merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - maxLength: 63 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - required: - - key - - operator + maxItems: 100 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : + true)' + - message: requirements operator 'Gt' or 'Lt' must have a single positive + integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') + ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' + - message: requirements with 'minValues' must have at least that many + values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? + x.values.size() >= x.minValues : true)' + resources: + description: Resources models the resource requirements for the NodeClaim + to launch + properties: + requests: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Requests describes the minimum required resources + for the NodeClaim to launch type: object - maxItems: 100 - type: array - x-kubernetes-validations: - - message: requirements with operator 'In' must have a value defined - rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' - - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value - rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' - - message: requirements with 'minValues' must have at least that many values specified in the 'values' field - rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? x.values.size() >= x.minValues : true)' - resources: - description: Resources models the resource requirements for the NodeClaim to launch + type: object + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: + description: |- + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. properties: - requests: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: Requests describes the minimum required resources for the NodeClaim to launch - type: object + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key type: object - startupTaints: + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: description: |- - StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically - within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by - daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning - purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. - items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. - properties: - effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. - type: string - enum: - - NoSchedule - - PreferNoSchedule - - NoExecute - key: - description: Required. The taint key to be applied to a node. - type: string - minLength: 1 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. - format: date-time - type: string - value: - description: The taint value corresponding to the taint key. - type: string - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - required: - - effect - - key - type: object - type: array - taints: - description: Taints will be applied to the NodeClaim's node. - items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. - properties: - effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. - type: string - enum: - - NoSchedule - - PreferNoSchedule - - NoExecute - key: - description: Required. The taint key to be applied to a node. - type: string - minLength: 1 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. - format: date-time - type: string - value: - description: The taint value corresponding to the taint key. - type: string - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - required: - - effect - - key - type: object - type: array - terminationGracePeriod: - description: |- - TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint key. + type: string + required: + - effect + - key + type: object + type: array + terminationGracePeriod: + description: |- + TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. - When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. - If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, - that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. - If left undefined, the controller will wait indefinitely for pods to be drained. - pattern: ^([0-9]+(s|m|h))+$ - type: string - required: - - nodeClassRef - - requirements - type: object - x-kubernetes-validations: - - message: spec is immutable - rule: self == oldSelf - status: - description: NodeClaimStatus defines the observed state of NodeClaim - properties: - allocatable: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: Allocatable is the estimated allocatable capacity of the node - type: object - capacity: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: Capacity is the estimated full capacity of the node + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. + When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + + + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. + If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, + that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + + + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. + If left undefined, the controller will wait indefinitely for pods to be drained. + pattern: ^([0-9]+(s|m|h))+$ + type: string + required: + - nodeClassRef + - requirements + type: object + x-kubernetes-validations: + - message: spec is immutable + rule: self == oldSelf + status: + description: NodeClaimStatus defines the observed state of NodeClaim + properties: + allocatable: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Allocatable is the estimated allocatable capacity of + the node + type: object + capacity: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Capacity is the estimated full capacity of the node + type: object + conditions: + description: Conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional + helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type type: object - conditions: - description: Conditions contains signals for health and readiness - items: - description: Condition aliases the upstream type and adds additional helper methods - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - pattern: ^([A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?|)$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ - type: string - required: - - lastTransitionTime - - status - - type - type: object - type: array - imageID: - description: ImageID is an identifier for the image that runs on the node - type: string - lastPodEventTime: - description: |- - LastPodEventTime is updated with the last time a pod was scheduled - or removed from the node. A pod going terminal or terminating - is also considered as removed. - format: date-time - type: string - nodeName: - description: NodeName is the name of the corresponding node object - type: string - providerID: - description: ProviderID of the corresponding node object - type: string - type: object - required: - - spec - type: object - served: true - storage: true - subresources: - status: {} + type: array + imageID: + description: ImageID is an identifier for the image that runs on the + node + type: string + lastPodEventTime: + description: |- + LastPodEventTime is updated with the last time a pod was scheduled + or removed from the node. A pod going terminal or terminating + is also considered as removed. + format: date-time + type: string + nodeName: + description: NodeName is the name of the corresponding node object + type: string + providerID: + description: ProviderID of the corresponding node object + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/pkg/apis/crds/karpenter.sh_nodepools.yaml b/pkg/apis/crds/karpenter.sh_nodepools.yaml index fdc0c671e4..3faa54adcf 100644 --- a/pkg/apis/crds/karpenter.sh_nodepools.yaml +++ b/pkg/apis/crds/karpenter.sh_nodepools.yaml @@ -3,499 +3,474 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: nodepools.karpenter.sh spec: group: karpenter.sh names: categories: - - karpenter + - karpenter kind: NodePool listKind: NodePoolList plural: nodepools singular: nodepool scope: Cluster versions: - - additionalPrinterColumns: - - jsonPath: .spec.template.spec.nodeClassRef.name - name: NodeClass - type: string - - jsonPath: .status.resources.nodes - name: Nodes - type: string - - jsonPath: .status.conditions[?(@.type=="Ready")].status - name: Ready - type: string - - jsonPath: .metadata.creationTimestamp - name: Age - type: date - - jsonPath: .spec.weight - name: Weight - priority: 1 - type: integer - - jsonPath: .status.resources.cpu - name: CPU - priority: 1 - type: string - - jsonPath: .status.resources.memory - name: Memory - priority: 1 - type: string - name: v1 - schema: - openAPIV3Schema: - description: NodePool is the Schema for the NodePools API - properties: - apiVersion: - description: |- - APIVersion defines the versioned schema of this representation of an object. - Servers should convert recognized schemas to the latest internal value, and - may reject unrecognized values. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources - type: string - kind: - description: |- - Kind is a string value representing the REST resource this object represents. - Servers may infer this from the endpoint the client submits requests to. - Cannot be updated. - In CamelCase. - More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds - type: string - metadata: - type: object - spec: - description: |- - NodePoolSpec is the top level nodepool specification. Nodepools - launch nodes in response to pods that are unschedulable. A single nodepool - is capable of managing a diverse set of nodes. Node properties are determined - from a combination of nodepool and pod scheduling constraints. - properties: - disruption: - description: Disruption contains the parameters that relate to Karpenter's disruption logic - properties: - budgets: - default: - - nodes: 10% + - additionalPrinterColumns: + - jsonPath: .spec.template.spec.nodeClassRef.name + name: NodeClass + type: string + - jsonPath: .status.resources.nodes + name: Nodes + type: string + - jsonPath: .status.conditions[?(@.type=="Ready")].status + name: Ready + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + - jsonPath: .spec.weight + name: Weight + priority: 1 + type: integer + - jsonPath: .status.resources.cpu + name: CPU + priority: 1 + type: string + - jsonPath: .status.resources.memory + name: Memory + priority: 1 + type: string + name: v1 + schema: + openAPIV3Schema: + description: NodePool is the Schema for the NodePools API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: |- + NodePoolSpec is the top level nodepool specification. Nodepools + launch nodes in response to pods that are unschedulable. A single nodepool + is capable of managing a diverse set of nodes. Node properties are determined + from a combination of nodepool and pod scheduling constraints. + properties: + disruption: + description: Disruption contains the parameters that relate to Karpenter's + disruption logic + properties: + budgets: + default: + - nodes: 10% + description: |- + Budgets is a list of Budgets. + If there are multiple active budgets, Karpenter uses + the most restrictive value. If left undefined, + this will default to one budget with a value to 10%. + items: description: |- - Budgets is a list of Budgets. - If there are multiple active budgets, Karpenter uses - the most restrictive value. If left undefined, - this will default to one budget with a value to 10%. - items: + Budget defines when Karpenter will restrict the + number of Node Claims that can be terminating simultaneously. + properties: + duration: + description: |- + Duration determines how long a Budget is active since each Schedule hit. + Only minutes and hours are accepted, as cron does not work in seconds. + If omitted, the budget is always active. + This is required if Schedule is set. + This regex has an optional 0s at the end since the duration.String() always adds + a 0s at the end. + pattern: ^((([0-9]+(h|m))|([0-9]+h[0-9]+m))(0s)?)$ + type: string + nodes: + default: 10% + description: |- + Nodes dictates the maximum number of NodeClaims owned by this NodePool + that can be terminating at once. This is calculated by counting nodes that + have a deletion timestamp set, or are actively being deleted by Karpenter. + This field is required when specifying a budget. + This cannot be of type intstr.IntOrString since kubebuilder doesn't support pattern + checking for int nodes for IntOrString nodes. + Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388 + pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + type: string + reasons: + description: |- + Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods. + Otherwise, this will apply to each reason defined. + allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons. + items: + description: |- + DisruptionReason defines valid reasons for disruption budgets. + CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons + enum: + - Underutilized + - Empty + - Drifted + type: string + type: array + schedule: + description: |- + Schedule specifies when a budget begins being active, following + the upstream cronjob syntax. If omitted, the budget is always active. + Timezones are not supported. + This field is required if Duration is set. + pattern: ^(@(annually|yearly|monthly|weekly|daily|midnight|hourly))|((.+)\s(.+)\s(.+)\s(.+)\s(.+))$ + type: string + required: + - nodes + type: object + maxItems: 50 + type: array + x-kubernetes-validations: + - message: '''schedule'' must be set with ''duration''' + rule: self.all(x, has(x.schedule) == has(x.duration)) + consolidateAfter: + description: |- + ConsolidateAfter is the duration the controller will wait + before attempting to terminate nodes that are underutilized. + Refer to ConsolidationPolicy for how underutilization is considered. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + consolidationPolicy: + default: WhenEmptyOrUnderutilized + description: |- + ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation + algorithm. This policy defaults to "WhenEmptyOrUnderutilized" if not specified + enum: + - WhenEmpty + - WhenEmptyOrUnderutilized + type: string + required: + - consolidateAfter + type: object + limits: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Limits define a set of bounds for provisioning capacity. + type: object + template: + description: |- + Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. + NodeClaims launched from this NodePool will often be further constrained than the template specifies. + properties: + metadata: + properties: + annotations: + additionalProperties: + type: string + description: |- + Annotations is an unstructured key value map stored with a resource that may be + set by external tools to store and retrieve arbitrary metadata. They are not + queryable and should be preserved when modifying objects. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations + type: object + labels: + additionalProperties: + type: string description: |- - Budget defines when Karpenter will restrict the - number of Node Claims that can be terminating simultaneously. + Map of string keys and values that can be used to organize and categorize + (scope and select) objects. May match selectors of replication controllers + and services. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels + type: object + type: object + spec: + description: |- + NodeClaimTemplateSpec describes the desired state of the NodeClaim in the Nodepool + NodeClaimTemplateSpec is used in the NodePool's NodeClaimTemplate, with the resource requests omitted since + users are not able to set resource requests in the NodePool. + properties: + expireAfter: + default: 720h + description: |- + ExpireAfter is the duration the controller will wait + before terminating a node, measured from when the node is created. This + is useful to implement features like eventually consistent node upgrade, + memory leak protection, and disruption testing. + pattern: ^(([0-9]+(s|m|h))+)|(Never)$ + type: string + nodeClassRef: + description: NodeClassRef is a reference to an object that + defines provider specific configuration properties: - duration: - description: |- - Duration determines how long a Budget is active since each Schedule hit. - Only minutes and hours are accepted, as cron does not work in seconds. - If omitted, the budget is always active. - This is required if Schedule is set. - This regex has an optional 0s at the end since the duration.String() always adds - a 0s at the end. - pattern: ^((([0-9]+(h|m))|([0-9]+h[0-9]+m))(0s)?)$ + group: + description: API version of the referent + pattern: ^[^/]*$ type: string - nodes: - default: 10% - description: |- - Nodes dictates the maximum number of NodeClaims owned by this NodePool - that can be terminating at once. This is calculated by counting nodes that - have a deletion timestamp set, or are actively being deleted by Karpenter. - This field is required when specifying a budget. - This cannot be of type intstr.IntOrString since kubebuilder doesn't support pattern - checking for int nodes for IntOrString nodes. - Ref: https://github.com/kubernetes-sigs/controller-tools/blob/55efe4be40394a288216dab63156b0a64fb82929/pkg/crd/markers/validation.go#L379-L388 - pattern: ^((100|[0-9]{1,2})%|[0-9]+)$ + kind: + description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' type: string - reasons: - description: |- - Reasons is a list of disruption methods that this budget applies to. If Reasons is not set, this budget applies to all methods. - Otherwise, this will apply to each reason defined. - allowed reasons are Underutilized, Empty, and Drifted and additional CloudProvider-specific reasons. - items: - description: |- - DisruptionReason defines valid reasons for disruption budgets. - CloudProviders will need to append to the list of enums when implementing cloud provider disruption reasons - enum: - - Underutilized - - Empty - - Drifted - type: string - type: array - schedule: - description: |- - Schedule specifies when a budget begins being active, following - the upstream cronjob syntax. If omitted, the budget is always active. - Timezones are not supported. - This field is required if Duration is set. - pattern: ^(@(annually|yearly|monthly|weekly|daily|midnight|hourly))|((.+)\s(.+)\s(.+)\s(.+)\s(.+))$ + name: + description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' type: string required: - - nodes + - group + - kind + - name type: object - maxItems: 50 - type: array - x-kubernetes-validations: - - message: '''schedule'' must be set with ''duration''' - rule: self.all(x, has(x.schedule) == has(x.duration)) - consolidateAfter: - description: |- - ConsolidateAfter is the duration the controller will wait - before attempting to terminate nodes that are underutilized. - Refer to ConsolidationPolicy for how underutilization is considered. - pattern: ^(([0-9]+(s|m|h))+)|(Never)$ - type: string - consolidationPolicy: - default: WhenEmptyOrUnderutilized - description: |- - ConsolidationPolicy describes which nodes Karpenter can disrupt through its consolidation - algorithm. This policy defaults to "WhenEmptyOrUnderutilized" if not specified - enum: - - WhenEmpty - - WhenEmptyOrUnderutilized - type: string - required: - - consolidateAfter - type: object - limits: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: Limits define a set of bounds for provisioning capacity. - type: object - template: - description: |- - Template contains the template of possibilities for the provisioning logic to launch a NodeClaim with. - NodeClaims launched from this NodePool will often be further constrained than the template specifies. - properties: - metadata: - properties: - annotations: - additionalProperties: - type: string + requirements: + description: Requirements are layered with GetLabels and applied + to every node. + items: description: |- - Annotations is an unstructured key value map stored with a resource that may be - set by external tools to store and retrieve arbitrary metadata. They are not - queryable and should be preserved when modifying objects. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations + A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values + and minValues that represent the requirement to have at least that many values. + properties: + key: + description: The label key that the selector applies + to. + type: string + minValues: + description: |- + This field is ALPHA and can be dropped or replaced at any time + MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. + maximum: 50 + minimum: 1 + type: integer + operator: + description: |- + Represents a key's relationship to a set of values. + Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. + type: string + values: + description: |- + An array of string values. If the operator is In or NotIn, + the values array must be non-empty. If the operator is Exists or DoesNotExist, + the values array must be empty. If the operator is Gt or Lt, the values + array must have a single element, which will be interpreted as an integer. + This array is replaced during a strategic merge patch. + items: + type: string + type: array + x-kubernetes-list-type: atomic + required: + - key + - operator type: object - labels: - additionalProperties: - type: string - maxLength: 63 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ + maxItems: 100 + type: array + x-kubernetes-validations: + - message: requirements with operator 'In' must have a value + defined + rule: 'self.all(x, x.operator == ''In'' ? x.values.size() + != 0 : true)' + - message: requirements operator 'Gt' or 'Lt' must have a + single positive integer value + rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == + ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= + 0) : true)' + - message: requirements with 'minValues' must have at least + that many values specified in the 'values' field + rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) + ? x.values.size() >= x.minValues : true)' + startupTaints: + description: |- + StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically + within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by + daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning + purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. + items: description: |- - Map of string keys and values that can be used to organize and categorize - (scope and select) objects. May match selectors of replication controllers - and services. - More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/labels + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. + properties: + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to + a node. + type: string + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time + type: string + value: + description: The taint value corresponding to the taint + key. + type: string + required: + - effect + - key type: object - maxProperties: 100 - x-kubernetes-validations: - - message: label domain "kubernetes.io" is restricted - rule: self.all(x, x in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || x.find("^([^/]+)").endsWith("node.kubernetes.io") || x.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !x.find("^([^/]+)").endsWith("kubernetes.io")) - - message: label domain "k8s.io" is restricted - rule: self.all(x, x.find("^([^/]+)").endsWith("kops.k8s.io") || !x.find("^([^/]+)").endsWith("k8s.io")) - - message: label domain "karpenter.sh" is restricted - rule: self.all(x, x in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !x.find("^([^/]+)").endsWith("karpenter.sh")) - - message: label "karpenter.sh/nodepool" is restricted - rule: self.all(x, x != "karpenter.sh/nodepool") - - message: label "kubernetes.io/hostname" is restricted - rule: self.all(x, x != "kubernetes.io/hostname") - type: object - spec: - description: |- - NodeClaimTemplateSpec describes the desired state of the NodeClaim in the Nodepool - NodeClaimTemplateSpec is used in the NodePool's NodeClaimTemplate, with the resource requests omitted since - users are not able to set resource requests in the NodePool. - properties: - expireAfter: - default: 720h + type: array + taints: + description: Taints will be applied to the NodeClaim's node. + items: description: |- - ExpireAfter is the duration the controller will wait - before terminating a node, measured from when the node is created. This - is useful to implement features like eventually consistent node upgrade, - memory leak protection, and disruption testing. - pattern: ^(([0-9]+(s|m|h))+)|(Never)$ - type: string - nodeClassRef: - description: NodeClassRef is a reference to an object that defines provider specific configuration + The node this Taint is attached to has the "effect" on + any pod that does not tolerate the Taint. properties: - group: - description: API version of the referent - pattern: ^[^/]*$ + effect: + description: |- + Required. The effect of the taint on pods + that do not tolerate the taint. + Valid effects are NoSchedule, PreferNoSchedule and NoExecute. + type: string + key: + description: Required. The taint key to be applied to + a node. type: string - kind: - description: 'Kind of the referent; More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds"' + timeAdded: + description: |- + TimeAdded represents the time at which the taint was added. + It is only written for NoExecute taints. + format: date-time type: string - name: - description: 'Name of the referent; More info: http://kubernetes.io/docs/user-guide/identifiers#names' + value: + description: The taint value corresponding to the taint + key. type: string required: - - group - - kind - - name + - effect + - key type: object - requirements: - description: Requirements are layered with GetLabels and applied to every node. - items: - description: |- - A node selector requirement with min values is a selector that contains values, a key, an operator that relates the key and values - and minValues that represent the requirement to have at least that many values. - properties: - key: - description: The label key that the selector applies to. - type: string - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - x-kubernetes-validations: - - message: label domain "kubernetes.io" is restricted - rule: self in ["beta.kubernetes.io/instance-type", "failure-domain.beta.kubernetes.io/region", "beta.kubernetes.io/os", "beta.kubernetes.io/arch", "failure-domain.beta.kubernetes.io/zone", "topology.kubernetes.io/zone", "topology.kubernetes.io/region", "node.kubernetes.io/instance-type", "kubernetes.io/arch", "kubernetes.io/os", "node.kubernetes.io/windows-build"] || self.find("^([^/]+)").endsWith("node.kubernetes.io") || self.find("^([^/]+)").endsWith("node-restriction.kubernetes.io") || !self.find("^([^/]+)").endsWith("kubernetes.io") - - message: label domain "k8s.io" is restricted - rule: self.find("^([^/]+)").endsWith("kops.k8s.io") || !self.find("^([^/]+)").endsWith("k8s.io") - - message: label domain "karpenter.sh" is restricted - rule: self in ["karpenter.sh/capacity-type", "karpenter.sh/nodepool"] || !self.find("^([^/]+)").endsWith("karpenter.sh") - - message: label "karpenter.sh/nodepool" is restricted - rule: self != "karpenter.sh/nodepool" - - message: label "kubernetes.io/hostname" is restricted - rule: self != "kubernetes.io/hostname" - minValues: - description: |- - This field is ALPHA and can be dropped or replaced at any time - MinValues is the minimum number of unique values required to define the flexibility of the specific requirement. - maximum: 50 - minimum: 1 - type: integer - operator: - description: |- - Represents a key's relationship to a set of values. - Valid operators are In, NotIn, Exists, DoesNotExist. Gt, and Lt. - type: string - enum: - - In - - NotIn - - Exists - - DoesNotExist - - Gt - - Lt - values: - description: |- - An array of string values. If the operator is In or NotIn, - the values array must be non-empty. If the operator is Exists or DoesNotExist, - the values array must be empty. If the operator is Gt or Lt, the values - array must have a single element, which will be interpreted as an integer. - This array is replaced during a strategic merge patch. - items: - type: string - type: array - x-kubernetes-list-type: atomic - maxLength: 63 - pattern: ^(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])?$ - required: - - key - - operator - type: object - maxItems: 100 - type: array - x-kubernetes-validations: - - message: requirements with operator 'In' must have a value defined - rule: 'self.all(x, x.operator == ''In'' ? x.values.size() != 0 : true)' - - message: requirements operator 'Gt' or 'Lt' must have a single positive integer value - rule: 'self.all(x, (x.operator == ''Gt'' || x.operator == ''Lt'') ? (x.values.size() == 1 && int(x.values[0]) >= 0) : true)' - - message: requirements with 'minValues' must have at least that many values specified in the 'values' field - rule: 'self.all(x, (x.operator == ''In'' && has(x.minValues)) ? x.values.size() >= x.minValues : true)' - startupTaints: - description: |- - StartupTaints are taints that are applied to nodes upon startup which are expected to be removed automatically - within a short period of time, typically by a DaemonSet that tolerates the taint. These are commonly used by - daemonsets to allow initialization and enforce startup ordering. StartupTaints are ignored for provisioning - purposes in that pods are not required to tolerate a StartupTaint in order to have nodes provisioned for them. - items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. - properties: - effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. - type: string - enum: - - NoSchedule - - PreferNoSchedule - - NoExecute - key: - description: Required. The taint key to be applied to a node. - type: string - minLength: 1 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. - format: date-time - type: string - value: - description: The taint value corresponding to the taint key. - type: string - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - required: - - effect - - key - type: object - type: array - taints: - description: Taints will be applied to the NodeClaim's node. - items: - description: |- - The node this Taint is attached to has the "effect" on - any pod that does not tolerate the Taint. - properties: - effect: - description: |- - Required. The effect of the taint on pods - that do not tolerate the taint. - Valid effects are NoSchedule, PreferNoSchedule and NoExecute. - type: string - enum: - - NoSchedule - - PreferNoSchedule - - NoExecute - key: - description: Required. The taint key to be applied to a node. - type: string - minLength: 1 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - timeAdded: - description: |- - TimeAdded represents the time at which the taint was added. - It is only written for NoExecute taints. - format: date-time - type: string - value: - description: The taint value corresponding to the taint key. - type: string - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*(\/))?([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9]$ - required: - - effect - - key - type: object - type: array - terminationGracePeriod: - description: |- - TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. + type: array + terminationGracePeriod: + description: |- + TerminationGracePeriod is the maximum duration the controller will wait before forcefully deleting the pods on a node, measured from when deletion is first initiated. - Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. - When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + Warning: this feature takes precedence over a Pod's terminationGracePeriodSeconds value, and bypasses any blocked PDBs or the karpenter.sh/do-not-disrupt annotation. - Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. - If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, - that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. - The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. - If left undefined, the controller will wait indefinitely for pods to be drained. - pattern: ^([0-9]+(s|m|h))+$ - type: string - required: - - nodeClassRef - - requirements - type: object - required: - - spec - type: object - weight: - description: |- - Weight is the priority given to the nodepool during scheduling. A higher - numerical weight indicates that this nodepool will be ordered - ahead of other nodepools with lower weights. A nodepool with no weight - will be treated as if it is a nodepool with a weight of 0. - format: int32 - maximum: 100 - minimum: 1 - type: integer - required: - - template - type: object - status: - description: NodePoolStatus defines the observed state of NodePool - properties: - conditions: - description: Conditions contains signals for health and readiness - items: - description: Condition aliases the upstream type and adds additional helper methods - properties: - lastTransitionTime: - description: |- - lastTransitionTime is the last time the condition transitioned from one status to another. - This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. - format: date-time - type: string - message: - description: |- - message is a human readable message indicating details about the transition. - This may be an empty string. - maxLength: 32768 - type: string - observedGeneration: - description: |- - observedGeneration represents the .metadata.generation that the condition was set based upon. - For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date - with respect to the current state of the instance. - format: int64 - minimum: 0 - type: integer - reason: - description: |- - reason contains a programmatic identifier indicating the reason for the condition's last transition. - Producers of specific condition types may define expected values and meanings for this field, - and whether the values are considered a guaranteed API. - The value should be a CamelCase string. - This field may not be empty. - maxLength: 1024 - minLength: 1 - pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ - type: string - status: - description: status of the condition, one of True, False, Unknown. - enum: - - "True" - - "False" - - Unknown - type: string - type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. - maxLength: 316 - pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + This field is intended to be used by cluster administrators to enforce that nodes can be cycled within a given time period. + When set, drifted nodes will begin draining even if there are pods blocking eviction. Draining will respect PDBs and the do-not-disrupt annotation until the TGP is reached. + + + Karpenter will preemptively delete pods so their terminationGracePeriodSeconds align with the node's terminationGracePeriod. + If a pod would be terminated without being granted its full terminationGracePeriodSeconds prior to the node timeout, + that pod will be deleted at T = node timeout - pod terminationGracePeriodSeconds. + + + The feature can also be used to allow maximum time limits for long-running jobs which can delay node termination with preStop hooks. + If left undefined, the controller will wait indefinitely for pods to be drained. + pattern: ^([0-9]+(s|m|h))+$ type: string required: - - lastTransitionTime - - message - - reason - - status - - type + - nodeClassRef + - requirements type: object - type: array - resources: - additionalProperties: - anyOf: - - type: integer - - type: string - pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ - x-kubernetes-int-or-string: true - description: Resources is the list of resources that have been provisioned. + required: + - spec + type: object + weight: + description: |- + Weight is the priority given to the nodepool during scheduling. A higher + numerical weight indicates that this nodepool will be ordered + ahead of other nodepools with lower weights. A nodepool with no weight + will be treated as if it is a nodepool with a weight of 0. + format: int32 + maximum: 100 + minimum: 1 + type: integer + required: + - template + type: object + status: + description: NodePoolStatus defines the observed state of NodePool + properties: + conditions: + description: Conditions contains signals for health and readiness + items: + description: Condition aliases the upstream type and adds additional + helper methods + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type type: object - type: object - required: - - spec - type: object - served: true - storage: true - subresources: - status: {} + type: array + resources: + additionalProperties: + anyOf: + - type: integer + - type: string + pattern: ^(\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))(([KMGTPE]i)|[numkMGTPE]|([eE](\+|-)?(([0-9]+(\.[0-9]*)?)|(\.[0-9]+))))?$ + x-kubernetes-int-or-string: true + description: Resources is the list of resources that have been provisioned. + type: object + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/pkg/apis/v1/nodeclaim_status.go b/pkg/apis/v1/nodeclaim_status.go index aca25a4e77..e7f918bcee 100644 --- a/pkg/apis/v1/nodeclaim_status.go +++ b/pkg/apis/v1/nodeclaim_status.go @@ -30,6 +30,7 @@ const ( ConditionTypeDrifted = "Drifted" ConditionTypeInstanceTerminating = "InstanceTerminating" ConditionTypeConsistentStateFound = "ConsistentStateFound" + ConditionTypeDisruptionCandidate = "DisruptionCandidate" ) // NodeClaimStatus defines the observed state of NodeClaim diff --git a/pkg/controllers/disruption/controller.go b/pkg/controllers/disruption/controller.go index 6cb7556346..f155c51c16 100644 --- a/pkg/controllers/disruption/controller.go +++ b/pkg/controllers/disruption/controller.go @@ -230,9 +230,21 @@ func (c *Controller) executeCommand(ctx context.Context, m Method, cmd Command, // We have the new NodeClaims created at the API server so mark the old NodeClaims for deletion c.cluster.MarkForDeletion(providerIDs...) + // Set the status of the nodeclaims to reflect that they are disruption candidates + err = multierr.Combine(lo.Map(cmd.candidates, func(candidate *Candidate, _ int) error { + candidate.NodeClaim.StatusConditions().SetTrueWithReason(v1.ConditionTypeDisruptionCandidate, v1.ConditionTypeDisruptionCandidate, m.EvictionReason(candidate.NodeClaim)) + return c.kubeClient.Status().Update(ctx, candidate.NodeClaim) + })...) + if err != nil { + return multierr.Append(fmt.Errorf("updating nodeclaim status: %w", err), state.RequireNoScheduleTaint(ctx, c.kubeClient, false, stateNodes...)) + } + if err := c.queue.Add(orchestration.NewCommand(nodeClaimNames, lo.Map(cmd.candidates, func(c *Candidate, _ int) *state.StateNode { return c.StateNode }), commandID, m.Reason(), m.ConsolidationType())); err != nil { c.cluster.UnmarkForDeletion(providerIDs...) + err = multierr.Combine(err, multierr.Combine(lo.Map(cmd.candidates, func(candidate *Candidate, _ int) error { + return multierr.Append(candidate.NodeClaim.StatusConditions().Clear(v1.ConditionTypeDisruptionCandidate), c.kubeClient.Status().Update(ctx, candidate.NodeClaim)) + })...)) return fmt.Errorf("adding command to queue (command-id: %s), %w", commandID, multierr.Append(err, state.RequireNoScheduleTaint(ctx, c.kubeClient, false, stateNodes...))) } diff --git a/pkg/controllers/disruption/emptiness.go b/pkg/controllers/disruption/emptiness.go index 0b68474b6a..846aefd9dd 100644 --- a/pkg/controllers/disruption/emptiness.go +++ b/pkg/controllers/disruption/emptiness.go @@ -134,3 +134,7 @@ func (e *Emptiness) Class() string { func (e *Emptiness) ConsolidationType() string { return "empty" } + +func (e *Emptiness) EvictionReason(nodeClaim *v1.NodeClaim) string { + return fmt.Sprintf("node %s was empty", nodeClaim.Status.NodeName) +} diff --git a/pkg/controllers/disruption/eventual.go b/pkg/controllers/disruption/eventual.go index f4bd8b779c..22da6b6a31 100644 --- a/pkg/controllers/disruption/eventual.go +++ b/pkg/controllers/disruption/eventual.go @@ -19,6 +19,7 @@ package disruption import ( "context" "errors" + "fmt" "sort" "sigs.k8s.io/controller-runtime/pkg/client" @@ -125,3 +126,7 @@ func (d *EventualDisruption) Class() string { func (d *EventualDisruption) ConsolidationType() string { return "" } + +func (d *EventualDisruption) EvictionReason(nodeClaim *v1.NodeClaim) string { + return fmt.Sprintf("node %s drifted", nodeClaim.Status.NodeName) +} diff --git a/pkg/controllers/disruption/multinodeconsolidation.go b/pkg/controllers/disruption/multinodeconsolidation.go index fc65951ea0..d026497d0c 100644 --- a/pkg/controllers/disruption/multinodeconsolidation.go +++ b/pkg/controllers/disruption/multinodeconsolidation.go @@ -224,3 +224,7 @@ func (m *MultiNodeConsolidation) Class() string { func (m *MultiNodeConsolidation) ConsolidationType() string { return "multi" } + +func (m *MultiNodeConsolidation) EvictionReason(nodeClaim *v1.NodeClaim) string { + return fmt.Sprintf("node %s was %s node consolidated", nodeClaim.Status.NodeName, m.ConsolidationType()) +} diff --git a/pkg/controllers/disruption/singlenodeconsolidation.go b/pkg/controllers/disruption/singlenodeconsolidation.go index 41af43d41f..e52823a370 100644 --- a/pkg/controllers/disruption/singlenodeconsolidation.go +++ b/pkg/controllers/disruption/singlenodeconsolidation.go @@ -110,3 +110,7 @@ func (s *SingleNodeConsolidation) Class() string { func (s *SingleNodeConsolidation) ConsolidationType() string { return "single" } + +func (s *SingleNodeConsolidation) EvictionReason(nodeClaim *v1.NodeClaim) string { + return fmt.Sprintf("node %s was %s node consolidated", nodeClaim.Status.NodeName, s.ConsolidationType()) +} diff --git a/pkg/controllers/disruption/suite_test.go b/pkg/controllers/disruption/suite_test.go index 9cdca5e863..ca0ddb519c 100644 --- a/pkg/controllers/disruption/suite_test.go +++ b/pkg/controllers/disruption/suite_test.go @@ -24,10 +24,6 @@ import ( "testing" "time" - "sigs.k8s.io/karpenter/pkg/metrics" - - "sigs.k8s.io/karpenter/pkg/test/v1alpha1" - . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" "github.com/samber/lo" @@ -39,9 +35,6 @@ import ( clock "k8s.io/utils/clock/testing" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/karpenter/pkg/utils/pdb" - . "sigs.k8s.io/karpenter/pkg/utils/testing" - coreapis "sigs.k8s.io/karpenter/pkg/apis" v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/cloudprovider" @@ -51,11 +44,15 @@ import ( "sigs.k8s.io/karpenter/pkg/controllers/provisioning" "sigs.k8s.io/karpenter/pkg/controllers/state" "sigs.k8s.io/karpenter/pkg/controllers/state/informer" + "sigs.k8s.io/karpenter/pkg/metrics" "sigs.k8s.io/karpenter/pkg/operator/options" "sigs.k8s.io/karpenter/pkg/scheduling" "sigs.k8s.io/karpenter/pkg/test" . "sigs.k8s.io/karpenter/pkg/test/expectations" + "sigs.k8s.io/karpenter/pkg/test/v1alpha1" disruptionutils "sigs.k8s.io/karpenter/pkg/utils/disruption" + "sigs.k8s.io/karpenter/pkg/utils/pdb" + . "sigs.k8s.io/karpenter/pkg/utils/testing" ) var ctx context.Context @@ -538,6 +535,7 @@ var _ = Describe("Disruption Taints", func() { ExpectSingletonReconciled(ctx, disruptionController) node = ExpectNodeExists(ctx, env.Client, node.Name) Expect(node.Spec.Taints).ToNot(ContainElement(v1.DisruptedNoScheduleTaint)) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDisruptionCandidate)).To(BeNil()) }) It("should add and remove taints from NodeClaims that fail to disrupt", func() { nodePool.Spec.Disruption.ConsolidationPolicy = v1.ConsolidationPolicyWhenEmptyOrUnderutilized @@ -575,6 +573,11 @@ var _ = Describe("Disruption Taints", func() { node = ExpectNodeExists(ctx, env.Client, node.Name) Expect(node.Spec.Taints).To(ContainElement(v1.DisruptedNoScheduleTaint)) + existingNodeClaim := lo.Filter(ExpectNodeClaims(ctx, env.Client), func(nc *v1.NodeClaim, _ int) bool { + return nc.Name == nodeClaim.Name + }) + Expect(existingNodeClaim[0].StatusConditions().Get(v1.ConditionTypeDisruptionCandidate)).ToNot(BeNil()) + Expect(existingNodeClaim[0].StatusConditions().Get(v1.ConditionTypeDisruptionCandidate).IsTrue()).To(BeTrue()) createdNodeClaim := lo.Reject(ExpectNodeClaims(ctx, env.Client), func(nc *v1.NodeClaim, _ int) bool { return nc.Name == nodeClaim.Name @@ -591,6 +594,7 @@ var _ = Describe("Disruption Taints", func() { node = ExpectNodeExists(ctx, env.Client, node.Name) Expect(node.Spec.Taints).ToNot(ContainElement(v1.DisruptedNoScheduleTaint)) + Expect(nodeClaim.StatusConditions().Get(v1.ConditionTypeDisruptionCandidate)).To(BeNil()) }) }) diff --git a/pkg/controllers/disruption/types.go b/pkg/controllers/disruption/types.go index 4d4f201754..1580d2e476 100644 --- a/pkg/controllers/disruption/types.go +++ b/pkg/controllers/disruption/types.go @@ -50,6 +50,7 @@ type Method interface { Reason() v1.DisruptionReason Class() string ConsolidationType() string + EvictionReason(nodeClaim *v1.NodeClaim) string } type CandidateFilter func(context.Context, *Candidate) bool diff --git a/pkg/controllers/node/termination/terminator/events/events.go b/pkg/controllers/node/termination/terminator/events/events.go index d626173d56..0e790978f3 100644 --- a/pkg/controllers/node/termination/terminator/events/events.go +++ b/pkg/controllers/node/termination/terminator/events/events.go @@ -26,12 +26,12 @@ import ( "sigs.k8s.io/karpenter/pkg/events" ) -func EvictPod(pod *corev1.Pod) events.Event { +func EvictPod(pod *corev1.Pod, message string) events.Event { return events.Event{ InvolvedObject: pod, Type: corev1.EventTypeNormal, Reason: "Evicted", - Message: "Evicted pod", + Message: "Evicted pod: " + message, DedupeValues: []string{pod.Name}, } } diff --git a/pkg/controllers/node/termination/terminator/eviction.go b/pkg/controllers/node/termination/terminator/eviction.go index 16e54d3e93..ec238a9d92 100644 --- a/pkg/controllers/node/termination/terminator/eviction.go +++ b/pkg/controllers/node/termination/terminator/eviction.go @@ -39,6 +39,7 @@ import ( "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" + v1 "sigs.k8s.io/karpenter/pkg/apis/v1" "sigs.k8s.io/karpenter/pkg/operator/injection" terminatorevents "sigs.k8s.io/karpenter/pkg/controllers/node/termination/terminator/events" @@ -68,13 +69,15 @@ func IsNodeDrainError(err error) bool { type QueueKey struct { types.NamespacedName - UID types.UID + UID types.UID + nodeName string } func NewQueueKey(pod *corev1.Pod) QueueKey { return QueueKey{ NamespacedName: client.ObjectKeyFromObject(pod), UID: pod.UID, + nodeName: pod.Spec.NodeName, } } @@ -164,6 +167,10 @@ func (q *Queue) Reconcile(ctx context.Context) (reconcile.Result, error) { // Evict returns true if successful eviction call, and false if there was an eviction-related error func (q *Queue) Evict(ctx context.Context, key QueueKey) bool { ctx = log.IntoContext(ctx, log.FromContext(ctx).WithValues("Pod", klog.KRef(key.Namespace, key.Name))) + evictionMessage, err := evictionReason(ctx, key, q.kubeClient) + if err != nil { + log.FromContext(ctx).Error(err, "failed looking up pod eviction reason") + } if err := q.kubeClient.SubResource("eviction").Create(ctx, &corev1.Pod{ObjectMeta: metav1.ObjectMeta{Namespace: key.Namespace, Name: key.Name}}, &policyv1.Eviction{ @@ -192,10 +199,35 @@ func (q *Queue) Evict(ctx context.Context, key QueueKey) bool { log.FromContext(ctx).Error(err, "failed evicting pod") return false } - q.recorder.Publish(terminatorevents.EvictPod(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}})) + q.recorder.Publish(terminatorevents.EvictPod(&corev1.Pod{ObjectMeta: metav1.ObjectMeta{Name: key.Name, Namespace: key.Namespace}}, evictionMessage)) return true } +func getNodeClaims(ctx context.Context, key QueueKey, kubeClient client.Client) ([]*v1.NodeClaim, error) { + nodeClaimList := &v1.NodeClaimList{} + if err := kubeClient.List(ctx, nodeClaimList, client.MatchingFields{"status.nodeName": key.nodeName}); err != nil { + return nil, fmt.Errorf("listing nodeClaims, %w", err) + } + return lo.ToSlicePtr(nodeClaimList.Items), nil +} + +func evictionReason(ctx context.Context, key QueueKey, kubeClient client.Client) (string, error) { + nodeClaims, err := getNodeClaims(ctx, key, kubeClient) + if err != nil { + return "", err + } + for _, nodeClaim := range nodeClaims { + terminationCondition := nodeClaim.StatusConditions().Get(v1.ConditionTypeDisruptionCandidate) + if terminationCondition.IsTrue() { + return terminationCondition.Message, nil + } + if !nodeClaim.DeletionTimestamp.IsZero() { + return fmt.Sprintf("node %s/%s is marked for deletion", nodeClaim.Name, nodeClaim.Status.NodeName), nil + } + } + return "", nil +} + func (q *Queue) Reset() { q.mu.Lock() defer q.mu.Unlock() diff --git a/pkg/events/suite_test.go b/pkg/events/suite_test.go index fec8d1462a..076862a690 100644 --- a/pkg/events/suite_test.go +++ b/pkg/events/suite_test.go @@ -88,8 +88,8 @@ var _ = Describe("Event Creation", func() { Expect(internalRecorder.Calls(schedulingevents.NominatePodEvent(PodWithUID(), NodeWithUID(), NodeClaimWithUID()).Reason)).To(Equal(1)) }) It("should create a EvictPod event", func() { - eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID())) - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID(), "")) + Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID(), "").Reason)).To(Equal(1)) }) It("should create a PodFailedToSchedule event", func() { eventRecorder.Publish(schedulingevents.PodFailedToScheduleEvent(PodWithUID(), fmt.Errorf(""))) @@ -105,31 +105,31 @@ var _ = Describe("Dedupe", func() { It("should only create a single event when many events are created quickly", func() { pod := PodWithUID() for i := 0; i < 100; i++ { - eventRecorder.Publish(terminatorevents.EvictPod(pod)) + eventRecorder.Publish(terminatorevents.EvictPod(pod, "")) } - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID(), "").Reason)).To(Equal(1)) }) It("should allow the dedupe timeout to be overridden", func() { pod := PodWithUID() - evt := terminatorevents.EvictPod(pod) + evt := terminatorevents.EvictPod(pod, "") evt.DedupeTimeout = time.Second * 2 // Generate a set of events within the dedupe timeout for i := 0; i < 10; i++ { eventRecorder.Publish(evt) } - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(1)) + Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID(), "").Reason)).To(Equal(1)) // Wait until after the overridden dedupe timeout time.Sleep(time.Second * 3) eventRecorder.Publish(evt) - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(2)) + Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID(), "").Reason)).To(Equal(2)) }) It("should allow events with different entities to be created", func() { for i := 0; i < 100; i++ { - eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID())) + eventRecorder.Publish(terminatorevents.EvictPod(PodWithUID(), "")) } - Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID()).Reason)).To(Equal(100)) + Expect(internalRecorder.Calls(terminatorevents.EvictPod(PodWithUID(), "").Reason)).To(Equal(100)) }) }) diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index b912a4ae75..40ca5a8ecd 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -179,6 +179,9 @@ func NewOperator() (context.Context, *Operator) { lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.NodeClaim{}, "status.providerID", func(o client.Object) []string { return []string{o.(*v1.NodeClaim).Status.ProviderID} }), "failed to setup nodeclaim provider id indexer") + lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.NodeClaim{}, "status.nodeName", func(o client.Object) []string { + return []string{o.(*v1.NodeClaim).Status.NodeName} + }), "failed to setup nodeclaim nodeName indexer") lo.Must0(mgr.GetFieldIndexer().IndexField(ctx, &v1.NodeClaim{}, "spec.nodeClassRef.group", func(o client.Object) []string { return []string{o.(*v1.NodeClaim).Spec.NodeClassRef.Group} }), "failed to setup nodeclaim nodeclassref apiversion indexer") diff --git a/pkg/scheduling/zz_generated.deepcopy.go b/pkg/scheduling/zz_generated.deepcopy.go index d710e56811..bed8d079cf 100644 --- a/pkg/scheduling/zz_generated.deepcopy.go +++ b/pkg/scheduling/zz_generated.deepcopy.go @@ -92,7 +92,7 @@ func (in *VolumeUsage) DeepCopyInto(out *VolumeUsage) { } else { inVal := (*in)[key] in, out := &inVal, &outVal - *out = make(sets.Set[string], len(*in)) + *out = make(sets.Set, len(*in)) for key, val := range *in { (*out)[key] = val } @@ -104,7 +104,7 @@ func (in *VolumeUsage) DeepCopyInto(out *VolumeUsage) { in, out := &in.podVolumes, &out.podVolumes *out = make(map[types.NamespacedName]Volumes, len(*in)) for key, val := range *in { - var outVal map[string]sets.Set[string] + var outVal map[string]sets.Set if val == nil { (*out)[key] = nil } else { @@ -118,7 +118,7 @@ func (in *VolumeUsage) DeepCopyInto(out *VolumeUsage) { } else { inVal := (*in)[key] in, out := &inVal, &outVal - *out = make(sets.Set[string], len(*in)) + *out = make(sets.Set, len(*in)) for key, val := range *in { (*out)[key] = val } @@ -160,7 +160,7 @@ func (in Volumes) DeepCopyInto(out *Volumes) { } else { inVal := (*in)[key] in, out := &inVal, &outVal - *out = make(sets.Set[string], len(*in)) + *out = make(sets.Set, len(*in)) for key, val := range *in { (*out)[key] = val } diff --git a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml index ae6d14b27d..3907c2cafd 100644 --- a/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml +++ b/pkg/test/v1alpha1/crds/karpenter.test.sh_testnodeclasses.yaml @@ -3,7 +3,7 @@ apiVersion: apiextensions.k8s.io/v1 kind: CustomResourceDefinition metadata: annotations: - controller-gen.kubebuilder.io/version: v0.16.1 + controller-gen.kubebuilder.io/version: v0.15.0 name: testnodeclasses.karpenter.test.sh spec: group: karpenter.test.sh @@ -84,7 +84,12 @@ spec: - Unknown type: string type: - description: type of condition in CamelCase or in foo.example.com/CamelCase. + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) maxLength: 316 pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ type: string