From a3c7954586b5092831f8912bc3e95fe1867288a8 Mon Sep 17 00:00:00 2001
From: lvhaodong
Date: Thu, 12 Aug 2021 20:58:22 +0800
Subject: [PATCH] feat: expose detailed scheduling reason of pending tasks
 based on last transaction statement

Signed-off-by: lvhaodong
---
 docs/design/scheduling-reason.md     |  46 +++++++++++
 pkg/scheduler/api/job_info.go        | 114 +++++++++++++++++++++++----
 pkg/scheduler/api/job_info_test.go   |  97 +++++++++++++++++++++++
 pkg/scheduler/api/unschedule_info.go |  13 +++
 pkg/scheduler/cache/cache.go         |  15 ++--
 pkg/scheduler/framework/statement.go |   2 +
 pkg/scheduler/plugins/gang/gang.go   |  16 +++-
 7 files changed, 279 insertions(+), 24 deletions(-)
 create mode 100644 docs/design/scheduling-reason.md

diff --git a/docs/design/scheduling-reason.md b/docs/design/scheduling-reason.md
new file mode 100644
index 0000000000..bb71126910
--- /dev/null
+++ b/docs/design/scheduling-reason.md
@@ -0,0 +1,46 @@
+# Scheduling Reason
+
+[@eggiter](https://github.com/eggiter); Aug 13, 2021
+
+## Table of Contents
+
+* [Table of Contents](#table-of-contents)
+* [Problem](#problem)
+* [Solution](#solution)
+
+## Problem
+
+- Currently, the reason and message in `pod.status.conditions["PodScheduled"]` and `podgroup.status.conditions["Unschedulable"]` are **NOT** informative, for example `x/x tasks in gang unschedulable: pod group is not ready, x Pending, x minAvailable`.
+- Users want to know which task breaks the scheduling cycle and why.
+
+## Solution
+
+- Introduce two extra scheduling reasons in the PodScheduled PodCondition:
+  + `Schedulable`: means that the scheduler can schedule the pod right now, but has not bound it yet.
+  + `Undetermined`: means that the scheduler skipped scheduling the pod and left it `Undetermined`, for example because an unschedulable pod was already found.
+
+- Case:
+  + 3 nodes: node1, node2, node3;
+  + 6 tasks in the PodGroup (`minAvailable = 6`);
+  + only 1 task (Task6) is unschedulable and the other 5 tasks (Task1 ~ 5) can be scheduled, thus the whole job is unschedulable. The mapping from each pending task to its new reason is sketched right below, before the message comparison.
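+
+- The sketch below is a simplified, self-contained illustration with hypothetical names (`reasonFor`, `hasFitError`); the actual logic introduced by this patch lives in `JobInfo.TaskSchedulingReason` in `pkg/scheduler/api/job_info.go` and uses the scheduler's own types:
+
+  ```go
+  package main
+
+  import "fmt"
+
+  // TaskStatus and TransactionContext are simplified stand-ins for the scheduler API types.
+  type TaskStatus int
+
+  const (
+      Pending TaskStatus = iota
+      Allocated
+      Pipelined
+  )
+
+  type TransactionContext struct {
+      NodeName string
+      Status   TaskStatus
+  }
+
+  // reasonFor maps a pending task to one of the PodScheduled reasons.
+  func reasonFor(last *TransactionContext, hasFitError bool) string {
+      if last != nil && (last.Status == Allocated || last.Status == Pipelined) {
+          return "Schedulable" // a node was found in the last transaction, but the pod is not bound yet
+      }
+      if hasFitError {
+          return "Unschedulable" // predicates failed on every candidate node
+      }
+      return "Undetermined" // scheduling was skipped, e.g. after an earlier unschedulable task
+  }
+
+  func main() {
+      fmt.Println(reasonFor(&TransactionContext{NodeName: "node1", Status: Allocated}, false)) // Schedulable
+      fmt.Println(reasonFor(nil, true))                                                        // Unschedulable
+      fmt.Println(reasonFor(nil, false))                                                       // Undetermined
+  }
+  ```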
+
+- Current information:
+
+  |Tasks|Reason|Message|
+  |-|-|-|
+  |PodGroup |NotEnoughResources|6/6 tasks in gang unschedulable: pod group is not ready, 6 Pending, 6 minAvailable |
+  |Task1 ~ 5|Unschedulable|6/6 tasks in gang unschedulable: pod group is not ready, 6 Pending, 6 minAvailable|
+  |Task6|Unschedulable| all nodes are unavailable: 1 plugin InterPodAffinity predicates failed node(s) didn't match pod affinity/anti-affinity, node(s) didn't match pod affinity rules, 2 node(s) resource fit failed.|
+
+- Improved information:
+
+  |Tasks|Reason|Message|
+  |-|-|-|
+  |PodGroup |(same)| **3/6** tasks in gang unschedulable: pod group is not ready, 6 Pending, 6 minAvailable; **Pending: 1 Unschedulable, 2 Undetermined, 3 Schedulable** |
+  |Task1 ~ 2|**Undetermined**| **3/6** tasks in gang unschedulable: pod group is not ready, 6 Pending, 6 minAvailable; **Pending: 1 Unschedulable, 2 Undetermined, 3 Schedulable** |
+  |Task3|**Schedulable**| **Pod ns1/task-1 can possibly be assigned to node1** |
+  |Task4|**Schedulable**| **Pod ns1/task-2 can possibly be assigned to node2** |
+  |Task5|**Schedulable**| **Pod ns1/task-3 can possibly be assigned to node3** |
+  |Task6|Unschedulable| all nodes are unavailable: 1 plugin InterPodAffinity predicates failed node(s) didn't match pod affinity/anti-affinity, node(s) didn't match pod affinity rules, 2 node(s) resource fit failed.|
+
+  - Note: Task1 & 2 are `Undetermined`, possibly because these two tasks are ordered after Task6 by `TaskOrderFn`;
+
+- With the improved information, we can easily find the task that breaks the whole scheduling cycle and why it happens. Additionally, we get a histogram of the reasons why some tasks are still pending.
diff --git a/pkg/scheduler/api/job_info.go b/pkg/scheduler/api/job_info.go
index d8ec20b7c0..1a9ac9ee79 100644
--- a/pkg/scheduler/api/job_info.go
+++ b/pkg/scheduler/api/job_info.go
@@ -65,6 +65,21 @@ const JobWaitingTime = "sla-waiting-time"
 // TaskID is UID type for Task
 type TaskID types.UID
 
+// TransactionContext holds all the fields needed by a scheduling transaction
+type TransactionContext struct {
+    NodeName string
+    Status   TaskStatus
+}
+
+// Clone returns a clone of TransactionContext
+func (ctx *TransactionContext) Clone() *TransactionContext {
+    if ctx == nil {
+        return nil
+    }
+    clone := *ctx
+    return &clone
+}
+
 // TaskInfo will have all infos about the task
 type TaskInfo struct {
     UID TaskID
@@ -78,8 +93,10 @@ type TaskInfo struct {
     // InitResreq is the resource that used to launch a task.
     InitResreq *Resource
 
-    NodeName string
-    Status   TaskStatus
+    TransactionContext
+    // LastTransaction holds the context of the last scheduling transaction
+    LastTransaction *TransactionContext
+
     Priority    int32
     VolumeReady bool
     Preemptable bool
@@ -128,8 +145,6 @@ func NewTaskInfo(pod *v1.Pod) *TaskInfo {
         Job:            jobID,
         Name:           pod.Name,
         Namespace:      pod.Namespace,
-        NodeName:       pod.Spec.NodeName,
-        Status:         getTaskStatus(pod),
         Priority:       1,
         Pod:            pod,
         Resreq:         resReq,
@@ -137,6 +152,11 @@ func NewTaskInfo(pod *v1.Pod) *TaskInfo {
         Preemptable:    preemptable,
         RevocableZone:  revocableZone,
         TopologyPolicy: topologyPolicy,
+
+        TransactionContext: TransactionContext{
+            NodeName: pod.Spec.NodeName,
+            Status:   getTaskStatus(pod),
+        },
     }
 
     if pod.Spec.Priority != nil {
@@ -146,6 +166,22 @@ func NewTaskInfo(pod *v1.Pod) *TaskInfo {
     return ti
 }
 
+// GetTransactionContext returns the transaction context of a task
+func (ti *TaskInfo) GetTransactionContext() TransactionContext {
+    return ti.TransactionContext
+}
+
+// GenerateLastTxContext generates and sets the context of the last transaction for a task
+func (ti *TaskInfo) GenerateLastTxContext() {
+    ctx := ti.GetTransactionContext()
+    ti.LastTransaction = &ctx
+}
+
+// ClearLastTxContext clears the context of the last transaction for a task
+func (ti *TaskInfo) ClearLastTxContext() {
+    ti.LastTransaction = nil
+}
+
 // Clone is used for cloning a task
 func (ti *TaskInfo) Clone() *TaskInfo {
     return &TaskInfo{
@@ -153,8 +189,6 @@ func (ti *TaskInfo) Clone() *TaskInfo {
         Job:            ti.Job,
         Name:           ti.Name,
         Namespace:      ti.Namespace,
-        NodeName:       ti.NodeName,
-        Status:         ti.Status,
         Priority:       ti.Priority,
         Pod:            ti.Pod,
         Resreq:         ti.Resreq.Clone(),
@@ -163,6 +197,12 @@ func (ti *TaskInfo) Clone() *TaskInfo {
         Preemptable:    ti.Preemptable,
         RevocableZone:  ti.RevocableZone,
         TopologyPolicy: ti.TopologyPolicy,
+
+        TransactionContext: TransactionContext{
+            NodeName: ti.NodeName,
+            Status:   ti.Status,
+        },
+        LastTransaction: ti.LastTransaction.Clone(),
     }
 }
 
@@ -485,13 +525,7 @@ func (ji JobInfo) String() string {
 // FitError returns detailed information on why a job's task failed to fit on
 // each available node
 func (ji *JobInfo) FitError() string {
-    reasons := make(map[string]int)
-    for status, taskMap := range ji.TaskStatusIndex {
-        reasons[status.String()] += len(taskMap)
-    }
-    reasons["minAvailable"] = int(ji.MinAvailable)
-
-    sortReasonsHistogram := func() []string {
+    sortReasonsHistogram := func(reasons map[string]int) []string {
         reasonStrings := []string{}
         for k, v := range reasons {
             reasonStrings = append(reasonStrings, fmt.Sprintf("%v %v", v, k))
@@ -499,10 +533,62 @@ func (ji *JobInfo) FitError() string {
         sort.Strings(reasonStrings)
         return reasonStrings
     }
-    reasonMsg := fmt.Sprintf("%v, %v.", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(), ", "))
+
+    // Stat histogram for all tasks of the job
+    reasons := make(map[string]int)
+    for status, taskMap := range ji.TaskStatusIndex {
+        reasons[status.String()] += len(taskMap)
+    }
+    reasons["minAvailable"] = int(ji.MinAvailable)
+    reasonMsg := fmt.Sprintf("%v, %v", scheduling.PodGroupNotReady, strings.Join(sortReasonsHistogram(reasons), ", "))
+
+    // Stat histogram for pending tasks only
+    reasons = make(map[string]int)
+    for uid := range ji.TaskStatusIndex[Pending] {
+        reason, _ := ji.TaskSchedulingReason(uid)
+        reasons[reason]++
+    }
+    if len(reasons) > 0 {
+        reasonMsg += "; " + fmt.Sprintf("%s: %s", Pending.String(), strings.Join(sortReasonsHistogram(reasons), ", "))
+    }
     return reasonMsg
 }
 
+// TaskSchedulingReason returns the detailed scheduling reason and message of the given task.
+// It is derived from the context of the task's last scheduling transaction.
+func (ji *JobInfo) TaskSchedulingReason(tid TaskID) (reason string, msg string) {
+    taskInfo, exists := ji.Tasks[tid]
+    if !exists {
+        return "", ""
+    }
+
+    // Get detailed scheduling reason based on LastTransaction
+    ctx := taskInfo.GetTransactionContext()
+    if taskInfo.LastTransaction != nil {
+        ctx = *taskInfo.LastTransaction
+    }
+
+    msg = ji.JobFitErrors
+    switch status := ctx.Status; status {
+    case Allocated, Pipelined:
+        // Pod is schedulable
+        msg = fmt.Sprintf("Pod %s/%s can possibly be assigned to %s", taskInfo.Namespace, taskInfo.Name, ctx.NodeName)
+        if status == Pipelined {
+            msg += " once resource is released"
+        }
+        return PodReasonSchedulable, msg
+    case Pending:
+        if fe := ji.NodesFitErrors[tid]; fe != nil {
+            // Pod is not schedulable
+            return PodReasonUnschedulable, fe.Error()
+        }
+        // Pod is not scheduled yet
+        return PodReasonUndetermined, msg
+    default:
+        return status.String(), msg
+    }
+}
+
 // ReadyTaskNum returns the number of tasks that are ready or that is best-effort.
 func (ji *JobInfo) ReadyTaskNum() int32 {
     var occupied int32
diff --git a/pkg/scheduler/api/job_info_test.go b/pkg/scheduler/api/job_info_test.go
index e8ec84ce1d..c2cad5cf90 100644
--- a/pkg/scheduler/api/job_info_test.go
+++ b/pkg/scheduler/api/job_info_test.go
@@ -22,6 +22,10 @@ import (
 
     v1 "k8s.io/api/core/v1"
     metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+    "k8s.io/apimachinery/pkg/types"
+
+    "volcano.sh/apis/pkg/apis/scheduling"
+    schedulingv2 "volcano.sh/apis/pkg/apis/scheduling/v1beta1"
 )
 
 func jobInfoEqual(l, r *JobInfo) bool {
@@ -191,3 +195,96 @@ func TestDeleteTaskInfo(t *testing.T) {
         }
     }
 }
+
+func TestTaskSchedulingReason(t *testing.T) {
+    t1 := buildPod("ns1", "task-1", "", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+    t2 := buildPod("ns1", "task-2", "", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+    t3 := buildPod("ns1", "task-3", "node1", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+    t4 := buildPod("ns1", "task-4", "node2", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+    t5 := buildPod("ns1", "task-5", "node3", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+    t6 := buildPod("ns1", "task-6", "", v1.PodPending, buildResourceList("1", "1G"), nil, make(map[string]string))
+
+    tests := []struct {
+        desc     string
+        pods     []*v1.Pod
+        jobid    JobID
+        nodefes  map[TaskID]*FitErrors
+        expected map[types.UID]string
+    }{
+        {
+            desc:  "task3 ~ 5 are schedulable",
+            pods:  []*v1.Pod{t1, t2, t3, t4, t5, t6},
+            jobid: JobID("case1"),
+            nodefes: map[TaskID]*FitErrors{
+                TaskID(t6.UID): {
+                    nodes: map[string]*FitError{
+                        "node1": {Reasons: []string{NodePodNumberExceeded}},
+                        "node2": {Reasons: []string{NodeResourceFitFailed}},
+                        "node3": {Reasons: []string{NodeResourceFitFailed}},
+                    },
+                },
+            },
+            expected: map[types.UID]string{
+                "pg":   "pod group is not ready, 6 Pending, 6 minAvailable; Pending: 1 Unschedulable, 2 Undetermined, 3 Schedulable",
+                t1.UID: "pod group is not ready, 6 Pending, 6 minAvailable; Pending: 1 Unschedulable, 2 Undetermined, 3 Schedulable",
+                t2.UID: "pod group is not ready, 6 Pending, 6 minAvailable; Pending: 1 Unschedulable, 2 Undetermined, 3 Schedulable",
+                t3.UID: "Pod ns1/task-3 can possibly be assigned to node1",
+                t4.UID: "Pod ns1/task-4 can possibly be assigned to node2",
+                t5.UID: "Pod ns1/task-5 can possibly be assigned to node3",
+                t6.UID: "all nodes are unavailable: 1 node(s) pod number exceeded, 2 node(s) resource fit failed.",
+            },
+        },
+    }
+
+    for i, test := range tests {
+        job := NewJobInfo(test.jobid)
+        pg := scheduling.PodGroup{
+            ObjectMeta: metav1.ObjectMeta{
+                Namespace: "ns1",
+                Name:      "pg1",
+            },
+            Spec: scheduling.PodGroupSpec{
+                MinMember: int32(len(test.pods)),
+            },
+        }
+        for _, pod := range test.pods {
+            // set pod group
+            pod.Annotations = map[string]string{
+                schedulingv2.KubeGroupNameAnnotationKey: pg.Name,
+            }
+
+            // add TaskInfo
+            ti := NewTaskInfo(pod)
+            job.AddTaskInfo(ti)
+
+            // pod is schedulable
+            if len(pod.Spec.NodeName) > 0 {
+                ti.LastTransaction = &TransactionContext{
+                    NodeName: pod.Spec.NodeName,
+                    Status:   Allocated,
+                }
+            }
+        }
+        // complete job
+        job.SetPodGroup(&PodGroup{PodGroup: pg})
+        job.NodesFitErrors = test.nodefes
+        job.TaskStatusIndex = map[TaskStatus]tasksMap{Pending: {}}
+        for _, task := range job.Tasks {
+            task.Status = Pending
+            job.TaskStatusIndex[Pending][task.UID] = task
+        }
+        job.JobFitErrors = job.FitError()
+
+        // assert
+        for uid, exp := range test.expected {
+            msg := job.JobFitErrors
+            if uid != "pg" {
+                _, msg = job.TaskSchedulingReason(TaskID(uid))
+            }
+            t.Logf("case #%d, task %v, result: %s", i, uid, msg)
+            if msg != exp {
+                t.Errorf("[x] case #%d, task %v, expected: %s, got: %s", i, uid, exp, msg)
+            }
+        }
+    }
+}
diff --git a/pkg/scheduler/api/unschedule_info.go b/pkg/scheduler/api/unschedule_info.go
index fdec80ef40..d71c3a5e4f 100644
--- a/pkg/scheduler/api/unschedule_info.go
+++ b/pkg/scheduler/api/unschedule_info.go
@@ -16,6 +16,19 @@ const (
     AllNodeUnavailableMsg = "all nodes are unavailable"
 )
 
+// These are reasons for a pod's transition to a condition.
+const (
+    // PodReasonUnschedulable reason in PodScheduled PodCondition means that the scheduler
+    // can't schedule the pod right now, for example due to insufficient resources in the cluster.
+    PodReasonUnschedulable = "Unschedulable"
+    // PodReasonSchedulable reason in PodScheduled PodCondition means that the scheduler
+    // can schedule the pod right now, but has not bound it yet.
+    PodReasonSchedulable = "Schedulable"
+    // PodReasonUndetermined reason in PodScheduled PodCondition means that the scheduler
+    // skipped scheduling the pod and left it `Undetermined`, for example because an unschedulable pod was already found.
+    PodReasonUndetermined = "Undetermined"
+)
+
 // FitErrors is set of FitError on many nodes
 type FitErrors struct {
     nodes map[string]*FitError
diff --git a/pkg/scheduler/cache/cache.go b/pkg/scheduler/cache/cache.go
index b91b0c50c5..8fa5ba5668 100644
--- a/pkg/scheduler/cache/cache.go
+++ b/pkg/scheduler/cache/cache.go
@@ -706,13 +706,13 @@ func (sc *SchedulerCache) UpdateSchedulerNumaInfo(AllocatedSets map[string]sched
 }
 
 // taskUnschedulable updates pod status of pending task
-func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, message string) error {
+func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, reason, message string) error {
     pod := task.Pod
 
     condition := &v1.PodCondition{
         Type:    v1.PodScheduled,
         Status:  v1.ConditionFalse,
-        Reason:  v1.PodReasonUnschedulable,
+        Reason:  reason, // Add more reasons in order to distinguish more specific scenarios of pending tasks
         Message: message,
     }
 
@@ -721,7 +721,7 @@ func (sc *SchedulerCache) taskUnschedulable(task *schedulingapi.TaskInfo, messag
     // The reason field in 'Events' should be "FailedScheduling", there is not constants defined for this in
     // k8s core, so using the same string here.
-    // The reason field in PodCondition should be "Unschedulable"
+    // The reason field in PodCondition can be "Unschedulable"
     sc.Recorder.Eventf(pod, v1.EventTypeWarning, "FailedScheduling", message)
     if _, err := sc.StatusUpdater.UpdatePodCondition(pod, condition); err != nil {
         return err
     }
@@ -950,12 +950,11 @@ func (sc *SchedulerCache) RecordJobStatusEvent(job *schedulingapi.JobInfo) {
     // Update podCondition for tasks Allocated and Pending before job discarded
     for _, status := range []schedulingapi.TaskStatus{schedulingapi.Allocated, schedulingapi.Pending, schedulingapi.Pipelined} {
         for _, taskInfo := range job.TaskStatusIndex[status] {
-            msg := baseErrorMessage
-            fitError := job.NodesFitErrors[taskInfo.UID]
-            if fitError != nil {
-                msg = fitError.Error()
+            reason, msg := job.TaskSchedulingReason(taskInfo.UID)
+            if len(msg) == 0 {
+                msg = baseErrorMessage
             }
-            if err := sc.taskUnschedulable(taskInfo, msg); err != nil {
+            if err := sc.taskUnschedulable(taskInfo, reason, msg); err != nil {
                 klog.Errorf("Failed to update unschedulable task status <%s/%s>: %v",
                     taskInfo.Namespace, taskInfo.Name, err)
             }
diff --git a/pkg/scheduler/framework/statement.go b/pkg/scheduler/framework/statement.go
index f114f791ed..a28c2d38d3 100644
--- a/pkg/scheduler/framework/statement.go
+++ b/pkg/scheduler/framework/statement.go
@@ -352,6 +352,7 @@ func (s *Statement) Discard() {
     klog.V(3).Info("Discarding operations ...")
     for i := len(s.operations) - 1; i >= 0; i-- {
         op := s.operations[i]
+        op.task.GenerateLastTxContext()
         switch op.name {
         case Evict:
             err := s.unevict(op.task)
@@ -376,6 +377,7 @@ func (s *Statement) Discard() {
 func (s *Statement) Commit() {
     klog.V(3).Info("Committing operations ...")
     for _, op := range s.operations {
+        op.task.ClearLastTxContext()
         switch op.name {
         case Evict:
             err := s.evict(op.task, op.reason)
diff --git a/pkg/scheduler/plugins/gang/gang.go b/pkg/scheduler/plugins/gang/gang.go
index 9d70dd6981..2121d657ca 100644
--- a/pkg/scheduler/plugins/gang/gang.go
+++ b/pkg/scheduler/plugins/gang/gang.go
@@ -162,9 +162,21 @@ func (gp *gangPlugin) OnSessionClose(ssn *framework.Session) {
     var unScheduleJobCount int
     for _, job := range ssn.Jobs {
         if !job.Ready() {
-            unreadyTaskCount = job.MinAvailable - job.ReadyTaskNum()
+            schedulableTaskNum := func() (num int32) {
+                for _, task := range job.TaskStatusIndex[api.Pending] {
+                    ctx := task.GetTransactionContext()
+                    if task.LastTransaction != nil {
+                        ctx = *task.LastTransaction
+                    }
+                    if api.AllocatedStatus(ctx.Status) {
+                        num++
+                    }
+                }
+                return num + job.ReadyTaskNum()
+            }
+            unreadyTaskCount = job.MinAvailable - schedulableTaskNum()
             msg := fmt.Sprintf("%v/%v tasks in gang unschedulable: %v",
-                job.MinAvailable-job.ReadyTaskNum(), len(job.Tasks), job.FitError())
+                unreadyTaskCount, len(job.Tasks), job.FitError())
             job.JobFitErrors = msg
             unScheduleJobCount++