Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[WIP]feat: Add Repair Policy CP interface #7345

Draft
wants to merge 2 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions cmd/controller/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
package main

import (
"github.com/aws/karpenter-provider-aws/pkg/cloudprovider"

Check failure on line 18 in cmd/controller/main.go

View workflow job for this annotation

GitHub Actions / ci

could not import github.com/aws/karpenter-provider-aws/pkg/cloudprovider (-: # github.com/aws/karpenter-provider-aws/pkg/cloudprovider
"github.com/aws/karpenter-provider-aws/pkg/controllers"
"github.com/aws/karpenter-provider-aws/pkg/operator"

Expand All @@ -39,11 +39,12 @@

op.
WithControllers(ctx, corecontrollers.NewControllers(
ctx,
op.Manager,
op.Clock,
op.GetClient(),
op.EventRecorder,
cloudProvider,

Check failure on line 47 in cmd/controller/main.go

View workflow job for this annotation

GitHub Actions / ci

too many arguments in call to corecontrollers.NewControllers
)...).
WithControllers(ctx, controllers.NewControllers(
ctx,
Expand Down
21 changes: 21 additions & 0 deletions pkg/cloudprovider/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,27 @@
return []status.Object{&v1.EC2NodeClass{}}
}

// RepairPolicy returns the repair statements declared by the AWS cloud provider.
// Each statement names a node condition Type, the Status that is matched for it,
// and a TolerationDuration. Three statements are returned: Ready=False,
// DiskPressure=True and MemoryPressure=True, each with a 30 minute TolerationDuration.
// NOTE(review): how the caller interprets TolerationDuration is not visible here;
// presumably it is how long the condition may persist before repair — confirm
// against the core cloudprovider interface.
func (c *CloudProvider) RepairPolicy() []cloudprovider.RepairStatement {

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / Analyze Go

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.30.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.28.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.26.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.27.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.25.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.29.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.31.x)

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: cloudprovider.RepairStatement

Check failure on line 230 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: cloudprovider.RepairStatement
return []cloudprovider.RepairStatement{

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / Analyze Go

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.30.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.28.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.26.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.27.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.25.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.29.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.31.x)

undefined: cloudprovider.RepairStatement

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: cloudprovider.RepairStatement (typecheck)

Check failure on line 231 in pkg/cloudprovider/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: cloudprovider.RepairStatement) (typecheck)
// Supported Kubelet fields: each entry pairs a node condition type with the
// status that is matched for it.
{
Type: "Ready",
Status: corev1.ConditionFalse,
TolerationDuration: 30 * time.Minute,
},
{
Type: "DiskPressure",
Status: corev1.ConditionTrue,
TolerationDuration: 30 * time.Minute,
},
{
Type: "MemoryPressure",
Status: corev1.ConditionTrue,
TolerationDuration: 30 * time.Minute,
},
}
}

func (c *CloudProvider) resolveNodeClassFromNodeClaim(ctx context.Context, nodeClaim *karpv1.NodeClaim) (*v1.EC2NodeClass, error) {
nodeClass := &v1.EC2NodeClass{}
if err := c.kubeClient.Get(ctx, types.NamespacedName{Name: nodeClaim.Spec.NodeClassRef.Name}, nodeClass); err != nil {
Expand Down
4 changes: 4 additions & 0 deletions pkg/fake/cloudprovider.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,7 @@
// GetSupportedNodeClasses lists the node class kinds this fake cloud provider
// reports as supported: a single, empty EC2NodeClass.
func (c *CloudProvider) GetSupportedNodeClasses() []status.Object {
	nodeClasses := make([]status.Object, 0, 1)
	nodeClasses = append(nodeClasses, &v1.EC2NodeClass{})
	return nodeClasses
}

// RepairPolicy returns an empty, non-nil slice of repair statements: the fake
// cloud provider declares no conditions of its own.
func (c *CloudProvider) RepairPolicy() []corecloudprovider.RepairStatement {

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / Analyze Go

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.30.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.28.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.26.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.27.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.25.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.29.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.31.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: corecloudprovider.RepairStatement

Check failure on line 93 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: corecloudprovider.RepairStatement
return []corecloudprovider.RepairStatement{}

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / Analyze Go

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.30.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.28.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.26.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.27.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.25.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.29.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci-test (1.31.x)

undefined: corecloudprovider.RepairStatement

Check failure on line 94 in pkg/fake/cloudprovider.go

View workflow job for this annotation

GitHub Actions / ci

undefined: corecloudprovider.RepairStatement (typecheck)
}
21 changes: 21 additions & 0 deletions test/pkg/environment/common/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,27 @@ func (env *Environment) ExpectUpdated(objects ...client.Object) {
}
}

// ExpectStatusUpdated writes each object through the client's status writer
// (env.Client.Status().Update), retrying for up to ten seconds per object.
// WARNING: The resource version check is bypassed: before every attempt the
// live object is fetched and its resource version is copied onto the input,
// so changes made concurrently by other controllers in the cluster can be
// overwritten. A message is logged when the input's resource version was
// already stale. Fetch the object right before mutating it to keep that
// window small.
func (env *Environment) ExpectStatusUpdated(objects ...client.Object) {
	GinkgoHelper()
	for _, obj := range objects {
		attempt := func(g Gomega) {
			latest := obj.DeepCopyObject().(client.Object)
			key := client.ObjectKeyFromObject(latest)
			g.Expect(env.Client.Get(env.Context, key, latest)).To(Succeed())

			latestVersion := latest.GetResourceVersion()
			if latestVersion != obj.GetResourceVersion() {
				gvk := lo.Must(apiutil.GVKForObject(obj, env.Client.Scheme()))
				log.FromContext(env).Info(fmt.Sprintf("detected an update to an object (%s) with an outdated resource version, did you get the latest version of the object before patching?", gvk))
			}

			// Adopt the live resource version so the status write is not
			// rejected as a conflict.
			obj.SetResourceVersion(latestVersion)
			g.Expect(env.Client.Status().Update(env.Context, obj)).To(Succeed())
		}
		Eventually(attempt).WithTimeout(10 * time.Second).Should(Succeed())
	}
}

// ExpectCreatedOrUpdated can update objects in the cluster to match the inputs.
// WARNING: ExpectUpdated ignores the resource version check, which can result in
// overwriting changes made by other controllers in the cluster.
Expand Down
85 changes: 85 additions & 0 deletions test/suites/integration/repair_policy_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
/*
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package integration_test

import (
"time"

appsv1 "k8s.io/api/apps/v1"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/labels"
karpenterv1 "sigs.k8s.io/karpenter/pkg/apis/v1"
coretest "sigs.k8s.io/karpenter/pkg/test"

. "github.com/onsi/ginkgo/v2"
"github.com/samber/lo"
)

// Repair Policy exercises node repair end to end: a node is provisioned for a
// one-replica deployment, an unhealthy condition is written to the node's
// status, and the spec then expects both the original pod and the node to
// disappear and the deployment to return to its healthy pod count.
var _ = Describe("Repair Policy", func() {
	var selector labels.Selector
	var dep *appsv1.Deployment
	var numPods int

	BeforeEach(func() {
		numPods = 1
		// The pods carry the do-not-disrupt annotation.
		// NOTE(review): presumably this keeps voluntary disruption from removing
		// the node, so that its removal can only come from repair — confirm.
		dep = coretest.Deployment(coretest.DeploymentOptions{
			Replicas: int32(numPods),
			PodOptions: coretest.PodOptions{
				ObjectMeta: metav1.ObjectMeta{
					Labels: map[string]string{
						"app": "my-app",
					},
					Annotations: map[string]string{
						karpenterv1.DoNotDisruptAnnotationKey: "true",
					},
				},
				TerminationGracePeriodSeconds: lo.ToPtr[int64](0),
			},
		})
		selector = labels.SelectorFromSet(dep.Spec.Selector.MatchLabels)
	})

	// DescribeTable, not FDescribeTable: a focused table would skip every other
	// spec in the suite and make Ginkgo report the run as failed.
	DescribeTable("Conditions", func(unhealthyCondition corev1.NodeCondition) {
		env.ExpectCreated(nodeClass, nodePool, dep)
		pod := env.EventuallyExpectHealthyPodCount(selector, numPods)[0]
		node := env.ExpectCreatedNodeCount("==", 1)[0]
		env.EventuallyExpectInitializedNodeCount("==", 1)

		// Replace any existing condition of the same type rather than appending.
		// Node conditions are identified by type, and an initialized node
		// normally already reports these types, so appending would leave two
		// entries for one type and a reader that takes the first match would
		// still see the healthy one.
		node.Status.Conditions = append(
			lo.Reject(node.Status.Conditions, func(existing corev1.NodeCondition, _ int) bool {
				return existing.Type == unhealthyCondition.Type
			}),
			unhealthyCondition,
		)

		env.ExpectStatusUpdated(node)

		env.EventuallyExpectNotFound(pod, node)
		env.EventuallyExpectHealthyPodCount(selector, numPods)
	},
		// LastTransitionTime is back-dated by 30 minutes in every entry.
		// NOTE(review): presumably chosen to match the provider's 30 minute
		// toleration so the condition counts as already expired — confirm.
		Entry("Kubelet Readiness", corev1.NodeCondition{
			Type:               corev1.NodeReady,
			Status:             corev1.ConditionFalse,
			LastTransitionTime: metav1.Time{Time: time.Now().Add(-30 * time.Minute)},
		}),
		Entry("Kubelet DiskPressure", corev1.NodeCondition{
			Type:               corev1.NodeDiskPressure,
			Status:             corev1.ConditionTrue,
			LastTransitionTime: metav1.Time{Time: time.Now().Add(-30 * time.Minute)},
		}),
		Entry("Kubelet MemoryPressure", corev1.NodeCondition{
			Type:               corev1.NodeMemoryPressure,
			Status:             corev1.ConditionTrue,
			LastTransitionTime: metav1.Time{Time: time.Now().Add(-30 * time.Minute)},
		}),
	)
})
Loading