[YUNIKORN-2805] Remove invalid health checks regarding negative capacity (apache#947)

Now that we properly handle cases where node resources can shrink while
existing allocations remain, remove the assumption from the health
checker that allocated resources are always <= capacity.

Closes: apache#947
craigcondit committed Aug 15, 2024
1 parent 1ee2774 commit 7c5b325
Showing 2 changed files with 24 additions and 41 deletions.
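
To see why the removed invariant no longer holds, here is a minimal sketch of the scenario the commit message describes (plain Go with illustrative types and numbers, not YuniKorn code): a node's reported capacity can drop below what is already allocated, for example when a kubelet is reconfigured to reserve more resources, and the existing allocations legitimately keep running.

package main

import "fmt"

// node is a simplified stand-in for a scheduler node; values are vcores.
type node struct {
	capacity  int64
	allocated int64
}

func main() {
	// A node registers with 16 vcores and the scheduler fills it.
	n := node{capacity: 16, allocated: 16}

	// The node's reported capacity then shrinks (for example, the kubelet
	// is reconfigured to reserve more for system daemons) while the
	// existing allocations keep running.
	n.capacity = 8

	// The removed health checks treated this legal state as a failure.
	if n.allocated > n.capacity {
		fmt.Println("old check would report the node as unhealthy")
	}
}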
pkg/scheduler/health_checker.go: 57 changes (20 additions, 37 deletions)
@@ -233,21 +233,16 @@ func checkFailedNodes(metrics *metrics.SchedulerMetrics) dao.HealthCheckInfo {
 }
 
 func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
-	// 1. check resources
-	// 1.1 check for negative resources
+	// check for negative resources
 	var partitionsWithNegResources []string
 	var nodesWithNegResources []string
-	// 1.3 node allocated resource <= total resource of the node
-	var allocationMismatch []string
-	// 1.2 total partition resource = sum of node resources
+	// total partition resource = sum of node resources
 	var totalResourceMismatch []string
-	// 1.4 node total resource = allocated resource + occupied resource + available resource
+	// node total resource = allocated resource + occupied resource + available resource
 	var nodeTotalMismatch []string
-	// 1.5 node capacity >= allocated resources on the node
-	var nodeCapacityMismatch []string
-	// 2. check reservation/node ration
+	// check reservation/node ration
 	var partitionReservationRatio []float32
-	// 3. check for orphan allocations
+	// check for orphan allocations
 	orphanAllocationsOnNode := make([]*objects.Allocation, 0)
 	orphanAllocationsOnApp := make([]*objects.Allocation, 0)

@@ -282,52 +277,40 @@ func checkSchedulingContext(schedulerContext *ClusterContext) []dao.HealthCheckInfo {
 			if node.GetOccupiedResource().HasNegativeValue() {
 				nodesWithNegResources = append(nodesWithNegResources, node.NodeID)
 			}
-			if !resources.StrictlyGreaterThanOrEquals(node.GetCapacity(), node.GetAllocatedResource()) {
-				nodeCapacityMismatch = append(nodeCapacityMismatch, node.NodeID)
-			}
 			orphanAllocationsOnNode = append(orphanAllocationsOnNode, checkNodeAllocations(node, part)...)
 		}
 		// check if there are allocations assigned to an app but there are missing from the nodes
 		for _, app := range part.GetApplications() {
 			orphanAllocationsOnApp = append(orphanAllocationsOnApp, checkAppAllocations(app, part.nodes)...)
 		}
 		partitionReservationRatio = append(partitionReservationRatio, float32(sumReservation)/(float32(part.GetTotalNodeCount())))
-		if !resources.Equals(sumNodeAllocatedResources, part.GetAllocatedResource()) {
-			allocationMismatch = append(allocationMismatch, part.Name)
-		}
 		if !resources.EqualsOrEmpty(sumNodeResources, part.GetTotalPartitionResource()) {
 			totalResourceMismatch = append(totalResourceMismatch, part.Name)
 		}
 	}
-	var info = make([]dao.HealthCheckInfo, 9)
-	info[0] = CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
+	var info = make([]dao.HealthCheckInfo, 0)
+	info = append(info, CreateCheckInfo(len(partitionsWithNegResources) == 0, "Negative resources",
 		"Check for negative resources in the partitions",
-		fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources))
-	info[1] = CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
+		fmt.Sprintf("Partitions with negative resources: %q", partitionsWithNegResources)))
+	info = append(info, CreateCheckInfo(len(nodesWithNegResources) == 0, "Negative resources",
 		"Check for negative resources in the nodes",
-		fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources))
-	info[2] = CreateCheckInfo(len(allocationMismatch) == 0, "Consistency of data",
-		"Check if a partition's allocated resource <= total resource of the partition",
-		fmt.Sprintf("Partitions with inconsistent data: %q", allocationMismatch))
-	info[3] = CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
+		fmt.Sprintf("Nodes with negative resources: %q", nodesWithNegResources)))
+	info = append(info, CreateCheckInfo(len(totalResourceMismatch) == 0, "Consistency of data",
 		"Check if total partition resource == sum of the node resources from the partition",
-		fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch))
-	info[4] = CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
+		fmt.Sprintf("Partitions with inconsistent data: %q", totalResourceMismatch)))
+	info = append(info, CreateCheckInfo(len(nodeTotalMismatch) == 0, "Consistency of data",
 		"Check if node total resource = allocated resource + occupied resource + available resource",
-		fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch))
-	info[5] = CreateCheckInfo(len(nodeCapacityMismatch) == 0, "Consistency of data",
-		"Check if node capacity >= allocated resources on the node",
-		fmt.Sprintf("Nodes with inconsistent data: %q", nodeCapacityMismatch))
+		fmt.Sprintf("Nodes with inconsistent data: %q", nodeTotalMismatch)))
 	// mark it as succeeded for a while until we will know what is not considered a normal value anymore
-	info[6] = CreateCheckInfo(true, "Reservation check",
+	info = append(info, CreateCheckInfo(true, "Reservation check",
 		"Check the reservation nr compared to the number of nodes",
-		fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio))
-	info[7] = CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
+		fmt.Sprintf("Reservation/node nr ratio: %f", partitionReservationRatio)))
+	info = append(info, CreateCheckInfo(len(orphanAllocationsOnNode) == 0, "Orphan allocation on node check",
 		"Check if there are orphan allocations on the nodes",
-		fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode))
-	info[8] = CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
+		fmt.Sprintf("Orphan allocations: %v", orphanAllocationsOnNode)))
+	info = append(info, CreateCheckInfo(len(orphanAllocationsOnApp) == 0, "Orphan allocation on app check",
 		"Check if there are orphan allocations on the applications",
-		fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp))
+		fmt.Sprintf("OrphanAllocations: %v", orphanAllocationsOnApp)))
 	return info
 }

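Beyond dropping the two capacity checks, the diff above also switches from a fixed-size slice with hardcoded indices to an append-built one. A small sketch of why that matters (healthCheckInfo is a simplified stand-in for dao.HealthCheckInfo, keeping only the fields needed here):

package main

import "fmt"

// healthCheckInfo is a simplified stand-in for dao.HealthCheckInfo.
type healthCheckInfo struct {
	Name      string
	Succeeded bool
}

func main() {
	// Before: fixed-size slice plus hardcoded indices. Removing a check
	// forces renumbering every later index, and a missed slot ships as a
	// zero value in the health report.
	fixed := make([]healthCheckInfo, 3)
	fixed[0] = healthCheckInfo{Name: "Negative resources", Succeeded: true}
	fixed[2] = healthCheckInfo{Name: "Orphan allocation on node check", Succeeded: true}
	fmt.Printf("%+v\n", fixed) // index 1 is a silent zero-value gap

	// After: an append-built slice always matches exactly the checks
	// that were actually run, with no gaps to renumber.
	checks := make([]healthCheckInfo, 0)
	checks = append(checks, healthCheckInfo{Name: "Negative resources", Succeeded: true})
	checks = append(checks, healthCheckInfo{Name: "Orphan allocation on node check", Succeeded: true})
	fmt.Printf("%+v\n", checks)
}
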
pkg/scheduler/health_checker_test.go: 8 changes (4 additions, 4 deletions)
@@ -209,7 +209,7 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
 	node.AddAllocation(alloc)
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
 	assert.Assert(t, !healthInfo.Healthy, "Scheduler should not be healthy")
-	assert.Assert(t, !healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should not be successful")
+	assert.Assert(t, !healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should not be successful")
 
 	// add the allocation to the app as well
 	part := schedulerContext.partitions[partName]
@@ -218,13 +218,13 @@ func TestGetSchedulerHealthStatusContext(t *testing.T) {
 	err = part.AddApplication(app)
 	assert.NilError(t, err, "Could not add application")
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
-	assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
+	assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
 
 	// remove the allocation from the node, so we will have an orphan allocation assigned to the app
 	node.RemoveAllocation("key")
 	healthInfo = GetSchedulerHealthStatus(schedulerMetrics, schedulerContext)
-	assert.Assert(t, healthInfo.HealthChecks[9].Succeeded, "The orphan allocation check on the node should be successful")
-	assert.Assert(t, !healthInfo.HealthChecks[10].Succeeded, "The orphan allocation check on the app should not be successful")
+	assert.Assert(t, healthInfo.HealthChecks[7].Succeeded, "The orphan allocation check on the node should be successful")
+	assert.Assert(t, !healthInfo.HealthChecks[8].Succeeded, "The orphan allocation check on the app should not be successful")
 }
 
 func TestGetSchedulerHealthStatusMetrics(t *testing.T) {
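A note on the index shift in these tests: the context check list shrank from nine entries to seven, and the offsets imply that two metrics-based checks (see checkFailedNodes in the hunk header above) precede the context checks in the combined report, so the orphan checks move from positions 9 and 10 to 7 and 8. A hypothetical hardening, not part of this commit, would be to resolve checks by name so tests survive future renumbering; a self-contained sketch, again using a simplified mirror of dao.HealthCheckInfo:

package main

import "fmt"

// healthCheckInfo mirrors the Name/Succeeded fields of dao.HealthCheckInfo.
type healthCheckInfo struct {
	Name      string
	Succeeded bool
}

// findCheck returns the first check with the given name. Names are not
// unique in the real list (two checks are called "Negative resources"),
// so callers needing those would still have to disambiguate.
func findCheck(checks []healthCheckInfo, name string) (healthCheckInfo, bool) {
	for _, c := range checks {
		if c.Name == name {
			return c, true
		}
	}
	return healthCheckInfo{}, false
}

func main() {
	checks := []healthCheckInfo{
		{Name: "Orphan allocation on node check", Succeeded: true},
		{Name: "Orphan allocation on app check", Succeeded: false},
	}
	if c, ok := findCheck(checks, "Orphan allocation on app check"); ok {
		fmt.Println(c.Name, c.Succeeded)
	}
}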
