From 7c5b32590f8a5b5c9a89d2d2de1767ba2c39fbdb Mon Sep 17 00:00:00 2001
From: Craig Condit
Date: Thu, 15 Aug 2024 14:17:22 -0500
Subject: [PATCH] [YUNIKORN-2805] Remove invalid health checks regarding negative capacity (#947)

Now that we properly handle cases where node resources can shrink while
existing allocations remain, remove the assumption from the health checker
that allocated resources are always <= capacity.

Closes: #947
---
 pkg/scheduler/health_checker.go      | 57 ++++++++++------------------
 pkg/scheduler/health_checker_test.go |  8 ++--
 2 files changed, 24 insertions(+), 41 deletions(-)

diff --git a/pkg/scheduler/health_checker.go b/pkg/scheduler/health_checker.go
index 84c7947af..177d9b8d4 100644
--- a/pkg/scheduler/health_checker.go
+++ b/pkg/scheduler/health_checker.go
@@ -233,21 +233,16 @@ func checkFailedNodes(metrics *metrics.SchedulerMetrics) dao.HealthCheckInfo {
 }
 
 func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
-	// 1. check resources
-	// 1.1 check for negative resources
+	// check for negative resources
 	var partitionsWithNegResources []string
 	var nodesWithNegResources []string
-	// 1.3 node allocated resource <= total resource of the node
-	var allocationMismatch []string
-	// 1.2 total partition resource = sum of node resources
+	// total partition resource = sum of node resources
 	var totalResourceMismatch []string
-	// 1.4 node total resource = allocated resource + occupied resource + available resource
+	// node total resource = allocated resource + occupied resource + available resource
 	var nodeTotalMismatch []string
-	// 1.5 node capacity >= allocated resources on the node
-	var nodeCapacityMismatch []string
-	// 2. check reservation/node ration
+	// check reservation/node ration
 	var partitionReservationRatio []float32
-	// 3. check for orphan allocations
+	// check for orphan allocations
 	orphanAllocationsOnNode := make([]*objects.Allocation, 0)
 	orphanAllocationsOnApp := make([]*objects.Allocation, 0)
 
@@ -282,9 +277,6 @@
 			if node.GetOccupiedResource().HasNegativeValue() {
 				nodesWithNegResources = append(nodesWithNegResources, node.NodeID)
 			}
-			if !resources.StrictlyGreaterThanOrEquals(node.GetCapacity(), node.GetAllocatedResource()) {
-				nodeCapacityMismatch = append(nodeCapacityMismatch, node.NodeID)
-			}
 			orphanAllocationsOnNode = append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
 		}
 		// check if there are allocations assigned to an app but there are missing from the nodes
@@ -292,42 +284,33 @@
 			orphanAllocationsOnApp = append(orphanAllocationsOnApp, checkAppAllocations(app, part.nodes)...)
 		}
 		partitionReservationRatio = append(partitionReservationRatio, float32(sumReservation)/(float32(part.GetTotalNodeCount())))
-		if !resources.Equals(sumNodeAllocatedResources, part.GetAllocatedResource()) {
-			allocationMismatch = append(allocationMismatch, part.Name)
-		}
 		if !resources.EqualsOrEmpty(sumNodeResources, part.GetTotalPartitionResource()) {
 			totalResourceMismatch = append(totalResourceMismatch, part.Name)
 		}
 	}
-	var info = make([]dao.HealthCheckInfo, 9)
-	info[0] = CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
+	var info = make([]dao.HealthCheckInfo, 0)
+	info = append(info, CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
 		"Check for negative resources in the partitions",
-		fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources))
-	info[1] = CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
+		fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources)))
+	info = append(info, CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
 		"Check for negative resources in the nodes",
-		fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources))
-	info[2] = CreateCheckInfo(len(allocationMismatch) == 0, "Consistency of data",
-		"Check if a partition's allocated resource <= total resource of the partition",
-		fmt.Sprintf("Partitions with inconsistent data: %q", allocationMismatch))
-	info[3] = CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
+		fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources)))
+	info = append(info, CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
 		"Check if total partition resource == sum of the node resources from the partition",
-		fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch))
-	info[4] = CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
+		fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch)))
+	info = append(info, CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
 		"Check if node total resource = allocated resource + occupied resource + available resource",
-		fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch))
-	info[5] = CreateCheckInfo(len(nodeCapacityMismatch) == 0, "Consistency of data",
-		"Check if node capacity >= allocated resources on the node",
-		fmt.Sprintf("Nodes with inconsistent data: %q", nodeCapacityMismatch))
+		fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch)))
 	// mark it as succeeded for a while until we will know what is not considered a normal value anymore
-	info[6] = CreateCheckInfo(true, "Reservation check",
+	info = append(info, CreateCheckInfo(true, "Reservation check",
 		"Check the reservation nr compared to the number of nodes",
-		fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio))
-	info[7] = CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
+		fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio)))
+	info = append(info, CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
 		"Check if there are orphan allocations on the nodes",
-		fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode))
-	info[8] = CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
+		fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)))
+	info = append(info, CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
 		"Check if there are orphan allocations on the applications",
-		fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp))
+		fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)))
 	return info
 }
 
diff --git a/pkg/scheduler/health_checker_test.go b/pkg/scheduler/health_checker_test.go
index 6d3e50adb..407f3f225 100644
--- a/pkg/scheduler/health_checker_test.go
+++ b/pkg/scheduler/health_checker_test.go
@@ -209,7 +209,7 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
 	node.AddAllocation(alloc)
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
 	assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
-	assert.Assert(t, !healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should not be successful")
+	assert.Assert(t, !healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should not be successful")
 
 	// add the allocation to the app as well
 	part := schedulerContext.partitions[partName]
@@ -218,13 +218,13 @@
 	err = part.AddApplication(app)
 	assert.NilError(t, err, "Could not add application")
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
-	assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
+	assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
 
 	// remove the allocation from the node, so we will have an orphan allocation assigned to the app
 	node.RemoveAllocation("key")
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
-	assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
-	assert.Assert(t, !healthInfo.HealthChecks[10].Succeeded, "The orphan allocation check on the app should not be successful")
+	assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
+	assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful")
 }
 
 func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
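
Note on the test index changes above: checkSchedulingContext previously filled a fixed-size slice of nine checks, and the HealthChecks list the tests inspect evidently carries two additional checks ahead of these (the failed-nodes check visible in the diff context is one candidate), which is why the orphan-allocation asserts used HealthChecks[9] and HealthChecks[10]. With the two capacity checks dropped and the slice now built via append, every later entry moves up by two, giving HealthChecks[7] and HealthChecks[8]. Below is a minimal, hypothetical sketch of that shift; it uses simplified stand-in types and shortened check labels, not the real dao.HealthCheckInfo type or CreateCheckInfo signature.

package main

import "fmt"

// checkInfo is a simplified stand-in for dao.HealthCheckInfo; the real type
// also carries a description and a diagnosis message.
type checkInfo struct {
	Name      string
	Succeeded bool
}

// contextChecks mirrors the append-based construction the patch switches to.
// The two entries guarded by includeCapacityChecks stand for the checks this
// patch removes; dropping them shifts every later entry up by two.
func contextChecks(includeCapacityChecks bool) []checkInfo {
	info := make([]checkInfo, 0)
	info = append(info, checkInfo{Name: "Negative resources (partitions)", Succeeded: true})
	info = append(info, checkInfo{Name: "Negative resources (nodes)", Succeeded: true})
	if includeCapacityChecks {
		info = append(info, checkInfo{Name: "Partition allocated <= partition total", Succeeded: true})
	}
	info = append(info, checkInfo{Name: "Partition total == sum of node totals", Succeeded: true})
	info = append(info, checkInfo{Name: "Node total == allocated + occupied + available", Succeeded: true})
	if includeCapacityChecks {
		info = append(info, checkInfo{Name: "Node capacity >= node allocated", Succeeded: true})
	}
	info = append(info, checkInfo{Name: "Reservation check", Succeeded: true})
	info = append(info, checkInfo{Name: "Orphan allocation on node check", Succeeded: true})
	info = append(info, checkInfo{Name: "Orphan allocation on app check", Succeeded: true})
	return info
}

// indexOf returns the position of a check by name, or -1 if absent.
func indexOf(checks []checkInfo, name string) int {
	for i, c := range checks {
		if c.Name == name {
			return i
		}
	}
	return -1
}

func main() {
	before := contextChecks(true)  // nine checks, as in the old fixed-size slice
	after := contextChecks(false)  // seven checks, as built after this patch
	// Within checkSchedulingContext the orphan-on-node check moves from 7 to 5;
	// the full HealthChecks list seen by the tests has two more checks in front,
	// which is why the asserts change from index 9 to index 7 (and 10 to 8).
	fmt.Println(indexOf(before, "Orphan allocation on node check")) // 7
	fmt.Println(indexOf(after, "Orphan allocation on node check"))  // 5
}

Looking checks up by name, as indexOf does here, is one way to keep such tests stable if the check list changes again; the upstream tests continue to use fixed indices.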