Implemented UnschedulablePodsCount metric (#1698)
edibble21 authored Oct 30, 2024
1 parent c2a8448 commit f7abd62
Showing 4 changed files with 56 additions and 2 deletions.
3 changes: 3 additions & 0 deletions pkg/controllers/provisioning/provisioner.go
@@ -349,6 +349,9 @@ func (p *Provisioner) Schedule(ctx context.Context) (scheduler.Results, error) {
return scheduler.Results{}, fmt.Errorf("creating scheduler, %w", err)
}
results := s.Solve(ctx, pods).TruncateInstanceTypes(scheduler.MaxInstanceTypes)
scheduler.UnschedulablePodsCount.With(
prometheus.Labels{scheduler.ControllerLabel: injection.GetControllerName(ctx)},
).Set(float64(len(results.PodErrors)))
if len(results.NewNodeClaims) > 0 {
log.FromContext(ctx).WithValues("Pods", pretty.Slice(lo.Map(pods, func(p *corev1.Pod, _ int) string { return klog.KRef(p.Namespace, p.Name).String() }), 5), "duration", time.Since(start)).Info("found provisionable pod(s)")
}
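The three added lines above are the whole behavioural change in this file: after each scheduling pass, a per-controller gauge is set to the number of pods the scheduler could not place (len(results.PodErrors)). A minimal standalone sketch of that pattern follows; the metric and label names mirror the diff, while the package layout, recordUnschedulable helper, and example values are illustrative, not Karpenter's actual wiring.

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

var unschedulablePodsCount = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Namespace: "karpenter",
		Subsystem: "scheduler",
		Name:      "unschedulable_pods_count",
		Help:      "The number of unschedulable Pods.",
	},
	[]string{"controller"},
)

// recordUnschedulable overwrites the controller's gauge with the latest count of scheduling errors.
func recordUnschedulable(controller string, podErrors map[string]error) {
	unschedulablePodsCount.With(
		prometheus.Labels{"controller": controller},
	).Set(float64(len(podErrors)))
}

func main() {
	prometheus.MustRegister(unschedulablePodsCount)
	recordUnschedulable("provisioner", map[string]error{
		"default/pod-a": fmt.Errorf("no compatible instance type"),
	})
}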
13 changes: 12 additions & 1 deletion pkg/controllers/provisioning/scheduling/metrics.go
@@ -24,7 +24,7 @@ import (
)

func init() {
- crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount)
+ crmetrics.Registry.MustRegister(SchedulingDurationSeconds, QueueDepth, IgnoredPodCount, UnschedulablePodsCount)
}

const (
@@ -65,4 +65,15 @@ var (
Help: "Number of pods ignored during scheduling by Karpenter",
},
)
+ UnschedulablePodsCount = prometheus.NewGaugeVec(
+ prometheus.GaugeOpts{
+ Namespace: metrics.Namespace,
+ Subsystem: schedulerSubsystem,
+ Name: "unschedulable_pods_count",
+ Help: "The number of unschedulable Pods.",
+ },
+ []string{
+ ControllerLabel,
+ },
+ )
)
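With the Namespace, Subsystem, and Name above, the gauge is exported as karpenter_scheduler_unschedulable_pods_count (Prometheus joins the three parts with underscores), one time series per value of the controller label; that is the name the new test below looks up.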
4 changes: 3 additions & 1 deletion pkg/controllers/provisioning/scheduling/scheduler.go
@@ -207,7 +207,9 @@ func (s *Scheduler) Solve(ctx context.Context, pods []*corev1.Pod) Results {
// had 5xA pods and 5xB pods where they have a zonal topology spread, but A can only go in one zone and B in another.
// We need to schedule them alternating, A, B, A, B, ..., and this solution solves that as well.
errors := map[*corev1.Pod]error{}
- QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)}) // Reset the metric for the controller, so we don't keep old ids around
+ // Reset the metrics for this controller, so we don't keep old ids around
+ UnschedulablePodsCount.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
+ QueueDepth.DeletePartialMatch(prometheus.Labels{ControllerLabel: injection.GetControllerName(ctx)})
q := NewQueue(pods...)

startTime := s.clock.Now()
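The added reset relies on DeletePartialMatch: before the gauges are filled in for a new run, every existing series that carries this controller's label is dropped, even if its other labels (such as a previous run's scheduling id, per the "old ids" comment) will never be written again. A small self-contained sketch of that behaviour, with illustrative metric and label names:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

var queueDepth = prometheus.NewGaugeVec(
	prometheus.GaugeOpts{
		Name: "example_scheduler_queue_depth",
		Help: "Pods waiting to be scheduled, per controller and scheduling run.",
	},
	[]string{"controller", "scheduling_id"},
)

func main() {
	// Two series left over from earlier runs of the same controller.
	queueDepth.With(prometheus.Labels{"controller": "provisioner", "scheduling_id": "run-1"}).Set(12)
	queueDepth.With(prometheus.Labels{"controller": "provisioner", "scheduling_id": "run-2"}).Set(7)

	// DeletePartialMatch removes every series whose labels include
	// controller="provisioner", regardless of the other label values,
	// and returns how many series it deleted.
	deleted := queueDepth.DeletePartialMatch(prometheus.Labels{"controller": "provisioner"})
	fmt.Println("series deleted:", deleted) // 2
}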
38 changes: 38 additions & 0 deletions pkg/controllers/provisioning/scheduling/suite_test.go
@@ -112,6 +112,7 @@ var _ = AfterEach(func() {
cluster.Reset()
scheduling.QueueDepth.Reset()
scheduling.SchedulingDurationSeconds.Reset()
scheduling.UnschedulablePodsCount.Reset()
})

var _ = Context("Scheduling", func() {
@@ -3676,6 +3677,43 @@ var _ = Context("Scheduling", func() {
s.Solve(injection.WithControllerName(ctx, "provisioner"), pods)
wg.Wait()
})
It("should surface the UnschedulablePodsCount metric while executing the scheduling loop", func() {
nodePool := test.NodePool(v1.NodePool{
Spec: v1.NodePoolSpec{
Template: v1.NodeClaimTemplate{
Spec: v1.NodeClaimTemplateSpec{
Requirements: []v1.NodeSelectorRequirementWithMinValues{
{
NodeSelectorRequirement: corev1.NodeSelectorRequirement{
Key: corev1.LabelInstanceTypeStable,
Operator: corev1.NodeSelectorOpIn,
Values: []string{
"default-instance-type",
},
},
},
},
},
},
},
})
ExpectApplied(ctx, env.Client, nodePool)
// Creates 15 pods: 5 schedulable and 10 unschedulable
podsUnschedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "unknown"}}, 10)
podsSchedulable := test.UnschedulablePods(test.PodOptions{NodeSelector: map[string]string{corev1.LabelInstanceTypeStable: "default-instance-type"}}, 5)
pods := append(podsUnschedulable, podsSchedulable...)
ExpectApplied(ctx, env.Client, nodePool)
// Applying the pods gives each one a UID, which the queue in Solve needs: Solve pushes any
// unschedulable pod back onto the queue and maps the current queue length to that pod by its UID.
for _, i := range pods {
ExpectApplied(ctx, env.Client, i)
}
_, err := prov.Schedule(injection.WithControllerName(ctx, "provisioner"))
m, ok := FindMetricWithLabelValues("karpenter_scheduler_unschedulable_pods_count", map[string]string{"controller": "provisioner"})
Expect(ok).To(BeTrue())
Expect(lo.FromPtr(m.Gauge.Value)).To(BeNumerically("==", 10))
Expect(err).To(BeNil())
})
It("should surface the schedulingDuration metric after executing a scheduling loop", func() {
nodePool := test.NodePool()
ExpectApplied(ctx, env.Client, nodePool)
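The new UnschedulablePodsCount test above reads the gauge back through Karpenter's FindMetricWithLabelValues expectation helper. Outside that harness, the same check can be sketched with client_golang's own testutil package; the gauge constructed below is illustrative and stands in for the registered scheduler metric:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/testutil"
)

func main() {
	unschedulable := prometheus.NewGaugeVec(
		prometheus.GaugeOpts{Name: "karpenter_scheduler_unschedulable_pods_count"},
		[]string{"controller"},
	)
	unschedulable.WithLabelValues("provisioner").Set(10)

	// testutil.ToFloat64 reads the current value of a collector that exposes exactly one series.
	got := testutil.ToFloat64(unschedulable.WithLabelValues("provisioner"))
	fmt.Println(got == 10) // true
}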
