Skip to content

Commit

Permalink
Add flags for initial and max backoff time, and cache size
Browse files Browse the repository at this point in the history
  • Loading branch information
yaroslava-serdiuk committed Sep 9, 2024
1 parent 03ff085 commit 54e520c
Show file tree
Hide file tree
Showing 4 changed files with 38 additions and 25 deletions.
6 changes: 6 additions & 0 deletions cluster-autoscaler/config/autoscaling_options.go
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,12 @@ type AutoscalingOptions struct {
ProvisioningRequestEnabled bool
// AsyncNodeGroupsEnabled tells if CA creates/deletes node groups asynchronously.
AsyncNodeGroupsEnabled bool
// ProvisioningRequestInitialBackoffTime is the initial backoff time before a ProvisioningRequest is considered again by CA after a failed ScaleUp request.
ProvisioningRequestInitialBackoffTime time.Duration
// ProvisioningRequestMaxBackoffTime is the max backoff time before a ProvisioningRequest is considered again by CA after a failed ScaleUp request.
ProvisioningRequestMaxBackoffTime time.Duration
// ProvisioningRequestMaxCacheSize is the max size for ProvisioningRequest cache that is stored for retry backoff.
ProvisioningRequestMaxCacheSize int
}

// KubeClientOptions specify options for kube client
Expand Down
18 changes: 12 additions & 6 deletions cluster-autoscaler/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -269,11 +269,14 @@ var (
"--max-graceful-termination-sec flag should not be set when this flag is set. Not setting this flag will use unordered evictor by default."+
"Priority evictor reuses the concepts of drain logic in kubelet(https://github.com/kubernetes/enhancements/tree/master/keps/sig-node/2712-pod-priority-based-graceful-node-shutdown#migration-from-the-node-graceful-shutdown-feature)."+
"Eg. flag usage: '10000:20,1000:100,0:60'")
provisioningRequestsEnabled = flag.Bool("enable-provisioning-requests", false, "Whether the clusterautoscaler will be handling the ProvisioningRequest CRs.")
frequentLoopsEnabled = flag.Bool("frequent-loops-enabled", false, "Whether clusterautoscaler triggers new iterations more frequently when it's needed")
asyncNodeGroupsEnabled = flag.Bool("async-node-groups", false, "Whether clusterautoscaler creates and deletes node groups asynchronously. Experimental: requires cloud provider supporting async node group operations, enable at your own risk.")
proactiveScaleupEnabled = flag.Bool("enable-proactive-scaleup", false, "Whether to enable/disable proactive scale-ups, defaults to false")
podInjectionLimit = flag.Int("pod-injection-limit", 5000, "Limits total number of pods while injecting fake pods. If unschedulable pods already exceeds the limit, pod injection is disabled but pods are not truncated.")
provisioningRequestsEnabled = flag.Bool("enable-provisioning-requests", false, "Whether the clusterautoscaler will be handling the ProvisioningRequest CRs.")
provisioningRequestInitialBackoffTime = flag.Duration("provisioning-request-initial-backoff-time", 1*time.Minute, "Initial backoff time for ProvisioningRequest retry after failed ScaleUp.")
provisioningRequestMaxBackoffTime = flag.Duration("provisioning-request-max-backoff-time", 10*time.Minute, "Max backoff time for ProvisioningRequest retry after failed ScaleUp.")
provisioningRequestMaxCacheSize = flag.Int("provisioning-request-max-cache-size", 1000, "Max size for ProvisioningRequest cache size used for retry backoff mechanism.")
frequentLoopsEnabled = flag.Bool("frequent-loops-enabled", false, "Whether clusterautoscaler triggers new iterations more frequently when it's needed")
asyncNodeGroupsEnabled = flag.Bool("async-node-groups", false, "Whether clusterautoscaler creates and deletes node groups asynchronously. Experimental: requires cloud provider supporting async node group operations, enable at your own risk.")
proactiveScaleupEnabled = flag.Bool("enable-proactive-scaleup", false, "Whether to enable/disable proactive scale-ups, defaults to false")
podInjectionLimit = flag.Int("pod-injection-limit", 5000, "Limits total number of pods while injecting fake pods. If unschedulable pods already exceeds the limit, pod injection is disabled but pods are not truncated.")
)

func isFlagPassed(name string) bool {
Expand Down Expand Up @@ -446,6 +449,9 @@ func createAutoscalingOptions() config.AutoscalingOptions {
BypassedSchedulers: scheduler_util.GetBypassedSchedulersMap(*bypassedSchedulers),
ProvisioningRequestEnabled: *provisioningRequestsEnabled,
AsyncNodeGroupsEnabled: *asyncNodeGroupsEnabled,
ProvisioningRequestInitialBackoffTime: *provisioningRequestInitialBackoffTime,
ProvisioningRequestMaxBackoffTime: *provisioningRequestMaxBackoffTime,
ProvisioningRequestMaxCacheSize: *provisioningRequestMaxCacheSize,
}
}

Expand Down Expand Up @@ -524,7 +530,7 @@ func buildAutoscaler(debuggingSnapshotter debuggingsnapshot.DebuggingSnapshotter
return nil, err
}
opts.LoopStartNotifier = loopstart.NewObserversList([]loopstart.Observer{provreqProcesor})
injector, err := provreq.NewProvisioningRequestPodsInjector(restConfig)
injector, err := provreq.NewProvisioningRequestPodsInjector(restConfig, opts.ProvisioningRequestInitialBackoffTime, opts.ProvisioningRequestMaxBackoffTime, opts.ProvisioningRequestMaxCacheSize)
if err != nil {
return nil, err
}
Expand Down
27 changes: 12 additions & 15 deletions cluster-autoscaler/processors/provreq/injector.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ import (
apiv1 "k8s.io/api/core/v1"
apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1"
v1 "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1"
"k8s.io/autoscaler/cluster-autoscaler/context"
"k8s.io/autoscaler/cluster-autoscaler/processors/pods"
"k8s.io/autoscaler/cluster-autoscaler/provisioningrequest"
Expand All @@ -35,15 +35,12 @@ import (
"k8s.io/utils/clock"
)

const (
defaultRetryTime = 1 * time.Minute
maxBackoffTime = 10 * time.Minute
// TODO: replace with timeout for element rather than max size of cache.
maxCacheSize = 1000
)

// ProvisioningRequestPodsInjector creates in-memory pods from ProvisioningRequest and inject them to unscheduled pods list.
type ProvisioningRequestPodsInjector struct {
initialRetryTime time.Duration
maxBackoffTime time.Duration
// TODO: replace with timeout for element rather than max size of cache.
maxCacheSize int
clock clock.PassiveClock
client *provreqclient.ProvisioningRequestClient
backoffDuration map[string]time.Duration
Expand All @@ -57,15 +54,15 @@ func (p *ProvisioningRequestPodsInjector) IsAvailableForProvisioning(pr *provreq
}
provisioned := apimeta.FindStatusCondition(conditions, v1.Provisioned)
if provisioned != nil {
if provisioned.Status == metav1.ConditionFalse {
return true
if provisioned.Status != metav1.ConditionFalse {
return false
}
retryTime, found := p.backoffDuration[key(pr)]
if !found {
retryTime = defaultRetryTime
retryTime = p.initialRetryTime
}
if provisioned.LastTransitionTime.Add(retryTime).Before(p.clock.Now()) {
p.backoffDuration[key(pr)] = max(2*retryTime, maxBackoffTime)
// Double the backoff for the next retry, capped at the configured maximum.
// min (not max) is required here: max would make maxBackoffTime a lower
// bound and let the backoff grow without limit.
p.backoffDuration[key(pr)] = min(2*retryTime, p.maxBackoffTime)
return true
}
return false
Expand Down Expand Up @@ -95,7 +92,7 @@ func (p *ProvisioningRequestPodsInjector) MarkAsFailed(pr *provreqwrapper.Provis
func (p *ProvisioningRequestPodsInjector) GetPodsFromNextRequest(
isSupportedClass func(*provreqwrapper.ProvisioningRequest) bool,
) ([]*apiv1.Pod, error) {
if len(p.backoffDuration) >= maxCacheSize {
if len(p.backoffDuration) >= p.maxCacheSize {
p.backoffDuration = make(map[string]time.Duration)
}
provReqs, err := p.client.ProvisioningRequests()
Expand Down Expand Up @@ -154,12 +151,12 @@ func (p *ProvisioningRequestPodsInjector) Process(
func (p *ProvisioningRequestPodsInjector) CleanUp() {}

// NewProvisioningRequestPodsInjector creates a ProvisioningRequest filter processor.
func NewProvisioningRequestPodsInjector(kubeConfig *rest.Config) (pods.PodListProcessor, error) {
func NewProvisioningRequestPodsInjector(kubeConfig *rest.Config, initialBackoffTime, maxBackoffTime time.Duration, maxCacheSize int) (pods.PodListProcessor, error) {
client, err := provreqclient.NewProvisioningRequestClient(kubeConfig)
if err != nil {
return nil, err
}
return &ProvisioningRequestPodsInjector{client: client, clock: clock.RealClock{}}, nil
return &ProvisioningRequestPodsInjector{initialRetryTime: initialBackoffTime, maxBackoffTime: maxBackoffTime, maxCacheSize: maxCacheSize, client: client, clock: clock.RealClock{}}, nil
}

func key(pr *provreqwrapper.ProvisioningRequest) string {
Expand Down
12 changes: 8 additions & 4 deletions cluster-autoscaler/processors/provreq/injector_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (

apimeta "k8s.io/apimachinery/pkg/api/meta"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1"
v1 "k8s.io/autoscaler/cluster-autoscaler/apis/provisioningrequest/autoscaling.x-k8s.io/v1"
"k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/provreqclient"
"k8s.io/autoscaler/cluster-autoscaler/provisioningrequest/provreqwrapper"
clock "k8s.io/utils/clock/testing"
Expand Down Expand Up @@ -104,11 +104,15 @@ func TestProvisioningRequestPodsInjector(t *testing.T) {
},
{
name: "Provisioned=True, no pods are injected",
provReqs: []*provreqwrapper.ProvisioningRequest{provisionedAcceptedProvReqB, failedProvReq, notProvisionedRecentlyProvReqB},
provReqs: []*provreqwrapper.ProvisioningRequest{provisionedAcceptedProvReqB, failedProvReq},
},
{
name: "Provisioned=False, ProvReq is backed off, no pods are injected",
provReqs: []*provreqwrapper.ProvisioningRequest{notProvisionedRecentlyProvReqB},
},
{
name: "Provisioned=Unknown, no pods are injected",
provReqs: []*provreqwrapper.ProvisioningRequest{unknownProvisionedProvReqB, failedProvReq, notProvisionedRecentlyProvReqB},
provReqs: []*provreqwrapper.ProvisioningRequest{unknownProvisionedProvReqB, failedProvReq},
},
{
name: "ProvisionedClass is unknown, no pods are injected",
Expand All @@ -125,7 +129,7 @@ func TestProvisioningRequestPodsInjector(t *testing.T) {
for _, tc := range testCases {
client := provreqclient.NewFakeProvisioningRequestClient(context.Background(), t, tc.provReqs...)
backoffTime := map[string]time.Duration{key(notProvisionedRecentlyProvReqB): 2 * time.Minute}
injector := ProvisioningRequestPodsInjector{client, clock.NewFakePassiveClock(now), client, backoffTime}
injector := ProvisioningRequestPodsInjector{1 * time.Minute, 10 * time.Minute, 1000, clock.NewFakePassiveClock(now), client, backoffTime}
getUnscheduledPods, err := injector.Process(nil, provreqwrapper.BuildTestPods("ns", "pod", tc.existingUnsUnschedulablePodCount))
if err != nil {
t.Errorf("%s failed: injector.Process return error %v", tc.name, err)
Expand Down

0 comments on commit 54e520c

Please sign in to comment.