From 8912b9b5834aa97e8e1ab933dbf4229a62ee7a2b Mon Sep 17 00:00:00 2001
From: Jonathan Innis
Date: Wed, 28 Aug 2024 13:32:38 -0700
Subject: [PATCH] test: Add E2E tests to validate pod grouping and waiting fix (#6838)

---
 test/pkg/environment/common/expectations.go |  11 ++
 test/suites/termination/termination_test.go | 148 ++++++++++++++++++++
 2 files changed, 159 insertions(+)

diff --git a/test/pkg/environment/common/expectations.go b/test/pkg/environment/common/expectations.go
index f7de4e3046cc..1c3f19e0402c 100644
--- a/test/pkg/environment/common/expectations.go
+++ b/test/pkg/environment/common/expectations.go
@@ -329,6 +329,17 @@ func (env *Environment) ConsistentlyExpectTerminatingPods(duration time.Duration
 	}, duration.String()).Should(Succeed())
 }
 
+func (env *Environment) ConsistentlyExpectActivePods(duration time.Duration, pods ...*corev1.Pod) {
+	GinkgoHelper()
+	By(fmt.Sprintf("expecting %d pods to be live for %s", len(pods), duration))
+	Consistently(func(g Gomega) {
+		for _, pod := range pods {
+			g.Expect(env.Client.Get(env, client.ObjectKeyFromObject(pod), pod)).To(Succeed())
+			g.Expect(pod.DeletionTimestamp.IsZero()).To(BeTrue())
+		}
+	}, duration.String()).Should(Succeed())
+}
+
 func (env *Environment) ConsistentlyExpectHealthyPods(duration time.Duration, pods ...*corev1.Pod) {
 	GinkgoHelper()
 	By(fmt.Sprintf("expecting %d pods to be ready for %s", len(pods), duration))
diff --git a/test/suites/termination/termination_test.go b/test/suites/termination/termination_test.go
index 7a48fdeee27d..2790a63ae21c 100644
--- a/test/suites/termination/termination_test.go
+++ b/test/suites/termination/termination_test.go
@@ -20,6 +20,10 @@ import (
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"github.com/samber/lo"
+	corev1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/labels"
 
 	"sigs.k8s.io/karpenter/pkg/test"
 )
@@ -47,4 +51,148 @@ var _ = Describe("Termination", func() {
 			g.Expect(lo.FromPtr(env.GetInstanceByID(instanceID).State.Name)).To(BeElementOf("terminated", "shutting-down"))
 		}, time.Second*10).Should(Succeed())
 	})
+	// Pods from Karpenter nodes are expected to drain in the following order:
+	//   1. Non-Critical Non-Daemonset pods
+	//   2. Non-Critical Daemonset pods
+	//   3. Critical Non-Daemonset pods
+	//   4. Critical Daemonset pods
+	// Pods in one group are expected to be fully removed before the next group begins draining
+	It("should drain pods on a node in order", func() {
+		daemonSet := test.DaemonSet(test.DaemonSetOptions{
+			Selector: map[string]string{"app": "non-critical-daemonset"},
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "daemonset",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(60)),
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(60)),
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		nodeCriticalDaemonSet := test.DaemonSet(test.DaemonSetOptions{
+			Selector: map[string]string{"app": "critical-daemonset"},
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "node-critical-daemonset",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(10)), // shorter terminationGracePeriod since it's in the last drain group
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(10)), // shorter preStopSleep since it's in the last drain group
+				PriorityClassName:             "system-node-critical",
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		clusterCriticalDaemonSet := test.DaemonSet(test.DaemonSetOptions{
+			Selector: map[string]string{"app": "critical-daemonset"},
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "cluster-critical-daemonset",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(10)), // shorter terminationGracePeriod since it's in the last drain group
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(10)), // shorter preStopSleep since it's in the last drain group
+				PriorityClassName:             "system-cluster-critical",
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		deployment := test.Deployment(test.DeploymentOptions{
+			Replicas: int32(1),
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "deployment",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(60)),
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(60)),
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		nodeCriticalDeployment := test.Deployment(test.DeploymentOptions{
+			Replicas: int32(1),
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "node-critical-deployment",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(60)),
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(60)),
+				PriorityClassName:             "system-node-critical",
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		clusterCriticalDeployment := test.Deployment(test.DeploymentOptions{
+			Replicas: int32(1),
+			PodOptions: test.PodOptions{
+				ObjectMeta: metav1.ObjectMeta{
+					Labels: map[string]string{
+						"drain-test": "true",
+						"app":        "cluster-critical-deployment",
+					},
+				},
+				TerminationGracePeriodSeconds: lo.ToPtr(int64(60)),
+				Image:                         "alpine:3.20.2",
+				Command:                       []string{"/bin/sh", "-c", "sleep 1000"},
+				PreStopSleep:                  lo.ToPtr(int64(60)),
+				PriorityClassName:             "system-cluster-critical",
+				ResourceRequirements:          corev1.ResourceRequirements{Limits: corev1.ResourceList{corev1.ResourceMemory: resource.MustParse("1Gi")}},
+			},
+		})
+		env.ExpectCreated(nodeClass, nodePool, daemonSet, nodeCriticalDaemonSet, clusterCriticalDaemonSet, deployment, nodeCriticalDeployment, clusterCriticalDeployment)
+
+		nodeClaim := env.EventuallyExpectCreatedNodeClaimCount("==", 1)[0]
+		_ = env.EventuallyExpectCreatedNodeCount("==", 1)[0]
+		env.EventuallyExpectHealthyPodCount(labels.SelectorFromSet(map[string]string{"drain-test": "true"}), 6)
+
+		daemonsetPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "daemonset"}))[0]
+		nodeCriticalDaemonsetPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "node-critical-daemonset"}))[0]
+		clusterCriticalDaemonsetPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "cluster-critical-daemonset"}))[0]
+		deploymentPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "deployment"}))[0]
+		nodeCriticalDeploymentPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "node-critical-deployment"}))[0]
+		clusterCriticalDeploymentPod := env.ExpectPodsMatchingSelector(labels.SelectorFromSet(map[string]string{"app": "cluster-critical-deployment"}))[0]
+
+		env.ExpectDeleted(nodeClaim)
+
+		// Wait for the non-critical deployment pod to drain and be deleted
+		env.EventuallyExpectTerminating(deploymentPod)
+		// Check that the other pods stay live for 30s, since the draining pod's pre-stop sleep and terminationGracePeriod are 60s
+		env.ConsistentlyExpectActivePods(time.Second*30, daemonsetPod, nodeCriticalDeploymentPod, nodeCriticalDaemonsetPod, clusterCriticalDeploymentPod, clusterCriticalDaemonsetPod)
+		env.EventuallyExpectNotFound(deploymentPod)
+
+		// Wait for the non-critical daemonset pod to drain and be deleted
+		env.EventuallyExpectTerminating(daemonsetPod)
+		// Check that the other pods stay live for 30s, since the draining pod's pre-stop sleep and terminationGracePeriod are 60s
+		env.ConsistentlyExpectActivePods(time.Second*30, nodeCriticalDeploymentPod, nodeCriticalDaemonsetPod, clusterCriticalDeploymentPod, clusterCriticalDaemonsetPod)
+		env.EventuallyExpectNotFound(daemonsetPod)
+
+		// Wait for the critical deployment pods to drain and be deleted
+		env.EventuallyExpectTerminating(nodeCriticalDeploymentPod, clusterCriticalDeploymentPod)
+		// Check that the critical daemonset pods stay live for 30s, since the draining pods' pre-stop sleep and terminationGracePeriod are 60s
+		env.ConsistentlyExpectActivePods(time.Second*30, nodeCriticalDaemonsetPod, clusterCriticalDaemonsetPod)
+		env.EventuallyExpectNotFound(nodeCriticalDeploymentPod, clusterCriticalDeploymentPod)
+
+		// Wait for the critical daemonset pods to drain and be deleted
+		env.EventuallyExpectTerminating(nodeCriticalDaemonsetPod, clusterCriticalDaemonsetPod)
+		env.EventuallyExpectNotFound(nodeCriticalDaemonsetPod, clusterCriticalDaemonsetPod)
+	})
 })
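
A note on the polling primitive behind the new helper: ConsistentlyExpectActivePods is built on Gomega's Consistently, the inverse of the Eventually used elsewhere in the suite. Consistently re-runs the assertion on every poll and only passes if it holds for the entire window, which is what lets the test prove that later drain groups remain untouched while an earlier group terminates. The sketch below shows that pattern in isolation, assuming only Gomega and the standard library; the package name, test name, and the deletedAt stand-in for the client.Get/DeletionTimestamp.IsZero() checks are hypothetical:

	package drain_test

	import (
		"testing"
		"time"

		"github.com/onsi/gomega"
	)

	func TestPodStaysActive(t *testing.T) {
		g := gomega.NewWithT(t)

		// Hypothetical stand-in for a pod's deletion: 45s out, past the 30s window.
		deletedAt := time.Now().Add(45 * time.Second)

		// Consistently polls every second for the full 30s window and fails
		// immediately if the condition is ever false -- the inverse of Eventually,
		// which passes as soon as the condition first becomes true.
		g.Consistently(func() bool {
			return time.Now().Before(deletedAt) // stand-in for pod.DeletionTimestamp.IsZero()
		}, 30*time.Second, 1*time.Second).Should(gomega.BeTrue())
	}

The 30-second window in the test is a deliberate safety margin: the group currently draining uses a 60-second PreStopSleep and terminationGracePeriodSeconds, so asserting liveness of the remaining pods for only half that time validates ordering without racing the grace-period expiry.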