Skip to content

Commit

Permalink
test: fix consolidation test races (#7180)
Browse files Browse the repository at this point in the history
  • Loading branch information
njtran authored Oct 10, 2024
1 parent 06f7acc commit cdb346d
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 81 deletions.
69 changes: 56 additions & 13 deletions test/pkg/environment/common/expectations.go
Original file line number Diff line number Diff line change
Expand Up @@ -565,34 +565,77 @@ func (env *Environment) ConsistentlyExpectNodeCount(comparator string, count int
return lo.ToSlicePtr(nodeList.Items)
}

func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) (taintedNodes []*corev1.Node) {
// ConsistentlyExpectNoDisruptions asserts that the number of tainted nodes remains the same.
// And that the number of nodeclaims remains the same.
func (env *Environment) ConsistentlyExpectNoDisruptions(nodeCount int, duration time.Duration) {
GinkgoHelper()
return env.ConsistentlyExpectDisruptionsWithNodeCount(0, nodeCount, duration)
Consistently(func(g Gomega) {
nodeClaimList := &karpv1.NodeClaimList{}
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeClaimList.Items).To(HaveLen(nodeCount))
nodeList := &corev1.NodeList{}
g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeList.Items).To(HaveLen(nodeCount))
nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
})
return ok
})
g.Expect(nodeList.Items).To(HaveLen(0))
}, duration).Should(Succeed())
}

// ConsistentlyExpectDisruptionsWithNodeCount will continually ensure that there are exactly disruptingNodes with totalNodes (including replacements and existing nodes)
func (env *Environment) ConsistentlyExpectDisruptionsWithNodeCount(disruptingNodes, totalNodes int, duration time.Duration) (taintedNodes []*corev1.Node) {
// ConsistentlyExpectDisruptionsUntilNoneLeft consistently ensures a max on number of concurrently disrupting and non-terminating nodes.
// This actually uses an Eventually() under the hood so that when we reach 0 tainted nodes we exit early.
// We use the StopTrying() so that we can exit the Eventually() if we've breached an assertion on total concurrency of disruptions.
// For example: if we have 5 nodes, with a budget of 2 nodes, we ensure that `disruptingNodes <= maxNodesDisrupting=2`
// We use nodesAtStart+maxNodesDisrupting to assert that we're not creating too many instances in replacement.
func (env *Environment) ConsistentlyExpectDisruptionsUntilNoneLeft(nodesAtStart, maxNodesDisrupting int, timeout time.Duration) {
GinkgoHelper()
nodes := []corev1.Node{}
Consistently(func(g Gomega) {
// Ensure we don't change our NodeClaims
// We use an eventually to exit when we detect the number of tainted/disrupted nodes matches our target.
Eventually(func(g Gomega) {
// Grab Nodes and NodeClaims
nodeClaimList := &karpv1.NodeClaimList{}
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeClaimList.Items).To(HaveLen(totalNodes))

nodeList := &corev1.NodeList{}
g.Expect(env.Client.List(env, nodeClaimList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(env.Client.List(env, nodeList, client.HasLabels{test.DiscoveryLabel})).To(Succeed())
g.Expect(nodeList.Items).To(HaveLen(totalNodes))

// Don't include NodeClaims with the `Terminating` status condition, as they're not included in budgets
removedProviderIDs := sets.Set[string]{}
nodeClaimList.Items = lo.Filter(nodeClaimList.Items, func(nc karpv1.NodeClaim, _ int) bool {
if !nc.StatusConditions().IsTrue(karpv1.ConditionTypeInstanceTerminating) {
return true
}
removedProviderIDs.Insert(nc.Status.ProviderID)
return false
})
if len(nodeClaimList.Items) > nodesAtStart+maxNodesDisrupting {
StopTrying(fmt.Sprintf("Too many nodeclaims created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeClaimList.Items))).Now()
}

// Don't include Nodes whose NodeClaims have been ignored
nodeList.Items = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
return !removedProviderIDs.Has(n.Spec.ProviderID)
})
if len(nodeList.Items) > nodesAtStart+maxNodesDisrupting {
StopTrying(fmt.Sprintf("Too many nodes created. Expected no more than %d, got %d", nodesAtStart+maxNodesDisrupting, len(nodeList.Items))).Now()
}

// Filter further by the number of tainted nodes to get the number of nodes that are disrupting
nodes = lo.Filter(nodeList.Items, func(n corev1.Node, _ int) bool {
_, ok := lo.Find(n.Spec.Taints, func(t corev1.Taint) bool {
return t.MatchTaint(&karpv1.DisruptedNoScheduleTaint)
})
return ok
})
g.Expect(nodes).To(HaveLen(disruptingNodes))
}, duration).Should(Succeed())
return lo.ToSlicePtr(nodes)
if len(nodes) > maxNodesDisrupting {
StopTrying(fmt.Sprintf("Too many disruptions detected. Expected no more than %d, got %d", maxNodesDisrupting, len(nodeList.Items))).Now()
}

g.Expect(nodes).To(HaveLen(0))
}).WithTimeout(timeout).WithPolling(5 * time.Second).Should(Succeed())
}

func (env *Environment) EventuallyExpectTaintedNodeCount(comparator string, count int) []*corev1.Node {
Expand Down
42 changes: 7 additions & 35 deletions test/suites/consolidation/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -239,30 +239,7 @@ var _ = Describe("Consolidation", func() {
// Update the deployment to only contain 1 replica.
env.ExpectUpdated(dep)

// Ensure that we get two nodes tainted, and they have overlap during consolidation
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 5, 5*time.Second)

// Remove the finalizer from each node so that we can terminate
for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}

// After the deletion timestamp is set and all pods are drained
// the node should be gone
env.EventuallyExpectNotFound(nodes[0], nodes[1])

// This check ensures that we are consolidating nodes at the same time
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
env.EventuallyExpectNotFound(nodes[0], nodes[1])

// Expect there to only be one node remaining for the last replica
env.ExpectNodeCount("==", 1)
env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 2, 10*time.Minute)
})
It("should respect budgets for non-empty delete consolidation", func() {
// This test will hold consolidation until we are ready to execute it
Expand Down Expand Up @@ -322,15 +299,7 @@ var _ = Describe("Consolidation", func() {
nodePool.Spec.Disruption.ConsolidateAfter = karpv1.MustParseNillableDuration("0s")
env.ExpectUpdated(nodePool)

// Ensure that we get two nodes tainted, and they have overlap during consolidation
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
env.EventuallyExpectNotFound(nodes[0], nodes[1])
env.ExpectNodeCount("==", 1)
env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 10*time.Minute)
})
It("should respect budgets for non-empty replace consolidation", func() {
appLabels := map[string]string{"app": "large-app"}
Expand Down Expand Up @@ -430,12 +399,15 @@ var _ = Describe("Consolidation", func() {
env.EventuallyExpectTaintedNodeCount("==", 3)
env.EventuallyExpectNodeClaimCount("==", 8)
env.EventuallyExpectNodeCount("==", 8)
env.ConsistentlyExpectDisruptionsWithNodeCount(3, 8, 5*time.Second)

env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute)

for _, node := range originalNodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}

for _, nodeClaim := range originalNodeClaims {
Expect(env.ExpectTestingFinalizerRemoved(nodeClaim)).To(Succeed())
}
// Eventually expect all the nodes to be rolled and completely removed
// Since this completes the disruption operation, this also ensures that we aren't leaking nodes into subsequent
// tests since nodeclaims that are actively replacing but haven't brought-up nodes yet can register nodes later
Expand Down
39 changes: 6 additions & 33 deletions test/suites/drift/suite_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,22 +155,7 @@ var _ = Describe("Drift", func() {

env.EventuallyExpectDrifted(nodeClaims...)

// Ensure that we get two nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 5*time.Second)

// Remove the finalizer from each node so that we can terminate
for _, node := range nodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}

// After the deletion timestamp is set and all pods are drained
// the node should be gone
env.EventuallyExpectNotFound(nodes[0], nodes[1])

nodes = env.EventuallyExpectTaintedNodeCount("==", 1)
Expect(env.ExpectTestingFinalizerRemoved(nodes[0])).To(Succeed())
env.EventuallyExpectNotFound(nodes[0])
env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute)
})
It("should respect budgets for non-empty delete drift", func() {
nodePool = coretest.ReplaceRequirements(nodePool,
Expand Down Expand Up @@ -244,17 +229,7 @@ var _ = Describe("Drift", func() {
env.ExpectUpdated(pod)
}

// Ensure that we get two nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 2)
nodes = env.ConsistentlyExpectDisruptionsWithNodeCount(2, 3, 30*time.Second)

By("removing the finalizer from the nodes")
Expect(env.ExpectTestingFinalizerRemoved(nodes[0])).To(Succeed())
Expect(env.ExpectTestingFinalizerRemoved(nodes[1])).To(Succeed())

// After the deletion timestamp is set and all pods are drained
// the node should be gone
env.EventuallyExpectNotFound(nodes[0], nodes[1])
env.ConsistentlyExpectDisruptionsUntilNoneLeft(3, 2, 5*time.Minute)
})
It("should respect budgets for non-empty replace drift", func() {
appLabels := map[string]string{"app": "large-app"}
Expand Down Expand Up @@ -301,16 +276,14 @@ var _ = Describe("Drift", func() {
By("drifting the nodepool")
nodePool.Spec.Template.Annotations = lo.Assign(nodePool.Spec.Template.Annotations, map[string]string{"test-annotation": "drift"})
env.ExpectUpdated(nodePool)

// Ensure that we get three nodes tainted, and they have overlap during the drift
env.EventuallyExpectTaintedNodeCount("==", 3)
env.EventuallyExpectNodeClaimCount("==", 8)
env.EventuallyExpectNodeCount("==", 8)
env.ConsistentlyExpectDisruptionsWithNodeCount(3, 8, 5*time.Second)
env.ConsistentlyExpectDisruptionsUntilNoneLeft(5, 3, 10*time.Minute)

for _, node := range originalNodes {
Expect(env.ExpectTestingFinalizerRemoved(node)).To(Succeed())
}
for _, nodeClaim := range originalNodeClaims {
Expect(env.ExpectTestingFinalizerRemoved(nodeClaim)).To(Succeed())
}

// Eventually expect all the nodes to be rolled and completely removed
// Since this completes the disruption operation, this also ensures that we aren't leaking nodes into subsequent
Expand Down

0 comments on commit cdb346d

Please sign in to comment.