diff --git a/api/v1/checks.go b/api/v1/checks.go index e3ff54b6c..66abfc7f0 100644 --- a/api/v1/checks.go +++ b/api/v1/checks.go @@ -788,6 +788,72 @@ type KubernetesResourceChecks struct { CanarySpec `yaml:",inline" json:",inline"` } +type KubernetesResourceCheckRetries struct { + // Delay is the initial delay + Delay string `json:"delay,omitempty"` + Timeout string `json:"timeout,omitempty"` + Interval string `json:"interval,omitempty"` + MaxRetries int `json:"maxRetries,omitempty"` + + parsedDelay *time.Duration `json:"-"` + parsedTimeout *time.Duration `json:"-"` + parsedInterval *time.Duration `json:"-"` +} + +func (t *KubernetesResourceCheckRetries) GetInitialDelay() (time.Duration, error) { + if t.parsedDelay != nil { + return *t.parsedDelay, nil + } + + if t.Delay == "" { + return time.Duration(0), nil + } + + tt, err := duration.ParseDuration(t.Delay) + if err != nil { + return time.Duration(0), err + } + t.parsedDelay = lo.ToPtr(time.Duration(tt)) + + return *t.parsedDelay, nil +} + +func (t *KubernetesResourceCheckRetries) GetTimeout() (time.Duration, error) { + if t.parsedTimeout != nil { + return *t.parsedTimeout, nil + } + + if t.Timeout == "" { + return time.Duration(0), nil + } + + tt, err := duration.ParseDuration(t.Timeout) + if err != nil { + return time.Duration(0), err + } + t.parsedTimeout = lo.ToPtr(time.Duration(tt)) + + return *t.parsedTimeout, nil +} + +func (t *KubernetesResourceCheckRetries) GetInterval() (time.Duration, error) { + if t.parsedInterval != nil { + return *t.parsedInterval, nil + } + + if t.Interval == "" { + return time.Duration(0), nil + } + + tt, err := duration.ParseDuration(t.Interval) + if err != nil { + return time.Duration(0), err + } + t.parsedInterval = lo.ToPtr(time.Duration(tt)) + + return *t.parsedInterval, nil +} + type KubernetesResourceCheckWaitFor struct { // Expr is a cel expression that determines whether all the resources // are in their desired state before running checks on them. @@ -805,6 +871,8 @@ type KubernetesResourceCheckWaitFor struct { // Default: 30s Interval string `json:"interval,omitempty"` + MaxRetries int `json:"maxRetries,omitempty"` + parsedTimeout *time.Duration `json:"-"` parsedInterval *time.Duration `json:"-"` } @@ -865,6 +933,9 @@ type KubernetesResourceCheck struct { // +kubebuilder:validation:XPreserveUnknownFields Checks []KubernetesResourceChecks `json:"checks,omitempty"` + // Set initial delays and retry intervals for checks. + CheckRetries KubernetesResourceCheckRetries `json:"checkRetries,omitempty"` + // Kubeconfig is the kubeconfig or the path to the kubeconfig file. Kubeconfig *types.EnvVar `yaml:"kubeconfig,omitempty" json:"kubeconfig,omitempty"` diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 9d09afa5f..04cc4e1c4 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -2286,6 +2286,7 @@ func (in *KubernetesResourceCheck) DeepCopyInto(out *KubernetesResourceCheck) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + in.CheckRetries.DeepCopyInto(&out.CheckRetries) if in.Kubeconfig != nil { in, out := &in.Kubeconfig, &out.Kubeconfig *out = new(types.EnvVar) @@ -2304,6 +2305,36 @@ func (in *KubernetesResourceCheck) DeepCopy() *KubernetesResourceCheck { return out } +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *KubernetesResourceCheckRetries) DeepCopyInto(out *KubernetesResourceCheckRetries) { + *out = *in + if in.parsedDelay != nil { + in, out := &in.parsedDelay, &out.parsedDelay + *out = new(timex.Duration) + **out = **in + } + if in.parsedTimeout != nil { + in, out := &in.parsedTimeout, &out.parsedTimeout + *out = new(timex.Duration) + **out = **in + } + if in.parsedInterval != nil { + in, out := &in.parsedInterval, &out.parsedInterval + *out = new(timex.Duration) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new KubernetesResourceCheckRetries. +func (in *KubernetesResourceCheckRetries) DeepCopy() *KubernetesResourceCheckRetries { + if in == nil { + return nil + } + out := new(KubernetesResourceCheckRetries) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *KubernetesResourceCheckWaitFor) DeepCopyInto(out *KubernetesResourceCheckWaitFor) { *out = *in diff --git a/checks/kubernetes_resource.go b/checks/kubernetes_resource.go index d8e9e5b03..20ee3c7cf 100644 --- a/checks/kubernetes_resource.go +++ b/checks/kubernetes_resource.go @@ -115,16 +115,47 @@ func (c *KubernetesResourceChecker) Check(ctx *context.Context, check v1.Kuberne return results.Failf("error templating checks: %v", err) } - checkCtx := context.New(ctx.Context, virtualCanary) - res, err := Exec(checkCtx) - if err != nil { - return results.Failf("%v", err) - } else { - for _, r := range res { - if r.Error != "" { - results.Failf("check (name:%s) failed with error: %v", r.GetName(), r.Error) + if wt, _ := check.CheckRetries.GetInitialDelay(); wt > 0 { + time.Sleep(wt) + } + + var backoff retry.Backoff + backoff = retry.BackoffFunc(func() (time.Duration, bool) { + return 0, true // don't retry by default + }) + + if retryInterval, _ := check.CheckRetries.GetInterval(); retryInterval > 0 { + backoff = retry.NewConstant(retryInterval) + } + + if check.CheckRetries.MaxRetries > 0 { + backoff = retry.WithMaxRetries(uint64(check.CheckRetries.MaxRetries), backoff) + } + + if maxRetryTimeout, _ := check.CheckRetries.GetTimeout(); maxRetryTimeout > 0 { + backoff = retry.WithMaxDuration(maxRetryTimeout, backoff) + } + + retryErr := retry.Do(ctx, backoff, func(_ctx gocontext.Context) error { + ctx.Infof("running check: %s", virtualCanary.Name) + + ctx = _ctx.(*context.Context) + checkCtx := context.New(ctx.Context, virtualCanary) + res, err := Exec(checkCtx) + if err != nil { + return err + } else { + for _, r := range res { + if r.Error != "" { + return retry.RetryableError(fmt.Errorf("check (name:%s) failed with error: %v", r.GetName(), r.Error)) + } } } + + return nil + }) + if retryErr != nil { + return results.Failf(retryErr.Error()) } } @@ -142,15 +173,19 @@ func (c *KubernetesResourceChecker) evalWaitFor(ctx *context.Context, check v1.K waitInterval = wt } + kClient := pkg.NewKubeClient(ctx.Kommons().GetRESTConfig) + var attempts int backoff := retry.WithMaxDuration(waitTimeout, retry.NewConstant(waitInterval)) + if check.WaitFor.MaxRetries > 0 { + backoff = retry.WithMaxRetries(uint64(check.WaitFor.MaxRetries), backoff) + } retryErr := retry.Do(ctx, backoff, func(_ctx gocontext.Context) error { ctx = _ctx.(*context.Context) attempts++ ctx.Tracef("waiting for %d resources to be ready. (attempts: %d)", check.TotalResources(), attempts) var templateVar = map[string]any{} - kClient := pkg.NewKubeClient(ctx.Kommons().GetRESTConfig) if response, err := kClient.FetchResources(ctx, append(check.StaticResources, check.Resources...)...); err != nil { return fmt.Errorf("wait for evaluation. fetching resources: %w", err) } else if len(response) != check.TotalResources() { @@ -219,6 +254,18 @@ func (c *KubernetesResourceChecker) validate(ctx *context.Context, check v1.Kube return fmt.Errorf("failed to parse wait for timeout(%s): %w", check.WaitFor.Timeout, err) } + if _, err := check.CheckRetries.GetTimeout(); err != nil { + return fmt.Errorf("failed to parse check retry timeout(%s): %w", check.CheckRetries.Timeout, err) + } + + if _, err := check.CheckRetries.GetInterval(); err != nil { + return fmt.Errorf("failed to parse check retry interval(%s): %w", check.CheckRetries.Interval, err) + } + + if _, err := check.CheckRetries.GetInitialDelay(); err != nil { + return fmt.Errorf("failed to parse check retry initial delay(%s): %w", check.CheckRetries.Delay, err) + } + maxResourcesAllowed := ctx.Properties().Int("checks.kubernetesResource.maxResources", defaultMaxResourcesAllowed) if check.TotalResources() > maxResourcesAllowed { return fmt.Errorf("too many resources (%d). only %d allowed", check.TotalResources(), maxResourcesAllowed) diff --git a/config/deploy/crd.yaml b/config/deploy/crd.yaml index 0f768879e..b5430e8c8 100644 --- a/config/deploy/crd.yaml +++ b/config/deploy/crd.yaml @@ -4983,6 +4983,19 @@ spec: kubernetesResource: items: properties: + checkRetries: + description: Set initial delays and retry intervals for checks. + properties: + delay: + description: Delay is the initial delay + type: string + interval: + type: string + maxRetries: + type: integer + timeout: + type: string + type: object checks: description: Checks to run against the kubernetes resources. items: diff --git a/config/deploy/manifests.yaml b/config/deploy/manifests.yaml index f18b5527b..ee002e67e 100644 --- a/config/deploy/manifests.yaml +++ b/config/deploy/manifests.yaml @@ -4982,6 +4982,19 @@ spec: kubernetesResource: items: properties: + checkRetries: + description: Set initial delays and retry intervals for checks. + properties: + delay: + description: Delay is the initial delay + type: string + interval: + type: string + maxRetries: + type: integer + timeout: + type: string + type: object checks: description: Checks to run against the kubernetes resources. items: diff --git a/config/schemas/canary.schema.json b/config/schemas/canary.schema.json index 2772e5581..188e81dd8 100644 --- a/config/schemas/canary.schema.json +++ b/config/schemas/canary.schema.json @@ -2320,6 +2320,9 @@ }, "type": "array" }, + "checkRetries": { + "$ref": "#/$defs/KubernetesResourceCheckRetries" + }, "kubeconfig": { "$ref": "#/$defs/EnvVar" }, @@ -2334,6 +2337,24 @@ "resources" ] }, + "KubernetesResourceCheckRetries": { + "properties": { + "delay": { + "type": "string" + }, + "timeout": { + "type": "string" + }, + "interval": { + "type": "string" + }, + "maxRetries": { + "type": "integer" + } + }, + "additionalProperties": false, + "type": "object" + }, "KubernetesResourceCheckWaitFor": { "properties": { "expr": { diff --git a/config/schemas/component.schema.json b/config/schemas/component.schema.json index 0b1d18a47..bce74ec45 100644 --- a/config/schemas/component.schema.json +++ b/config/schemas/component.schema.json @@ -2574,6 +2574,9 @@ }, "type": "array" }, + "checkRetries": { + "$ref": "#/$defs/KubernetesResourceCheckRetries" + }, "kubeconfig": { "$ref": "#/$defs/EnvVar" }, @@ -2588,6 +2591,24 @@ "resources" ] }, + "KubernetesResourceCheckRetries": { + "properties": { + "delay": { + "type": "string" + }, + "timeout": { + "type": "string" + }, + "interval": { + "type": "string" + }, + "maxRetries": { + "type": "integer" + } + }, + "additionalProperties": false, + "type": "object" + }, "KubernetesResourceCheckWaitFor": { "properties": { "expr": { diff --git a/config/schemas/topology.schema.json b/config/schemas/topology.schema.json index 9bc8a933c..5db72a2ef 100644 --- a/config/schemas/topology.schema.json +++ b/config/schemas/topology.schema.json @@ -2544,6 +2544,9 @@ }, "type": "array" }, + "checkRetries": { + "$ref": "#/$defs/KubernetesResourceCheckRetries" + }, "kubeconfig": { "$ref": "#/$defs/EnvVar" }, @@ -2558,6 +2561,24 @@ "resources" ] }, + "KubernetesResourceCheckRetries": { + "properties": { + "delay": { + "type": "string" + }, + "timeout": { + "type": "string" + }, + "interval": { + "type": "string" + }, + "maxRetries": { + "type": "integer" + } + }, + "additionalProperties": false, + "type": "object" + }, "KubernetesResourceCheckWaitFor": { "properties": { "expr": { diff --git a/fixtures/k8s/kubernetes_resource_ingress_pass.yaml b/fixtures/k8s/kubernetes_resource_ingress_pass.yaml index 692751f8c..9f0d69704 100644 --- a/fixtures/k8s/kubernetes_resource_ingress_pass.yaml +++ b/fixtures/k8s/kubernetes_resource_ingress_pass.yaml @@ -69,3 +69,7 @@ spec: headers: - name: Host value: "{{(index ((index .staticResources 1).Object.spec.rules) 0).host}}" + checkRetries: + delay: 3s + interval: 2s + maxRetries: 3 diff --git a/fixtures/k8s/kubernetes_resource_service_pass.yaml b/fixtures/k8s/kubernetes_resource_service_pass.yaml index 629e4ff21..6e406dad2 100644 --- a/fixtures/k8s/kubernetes_resource_service_pass.yaml +++ b/fixtures/k8s/kubernetes_resource_service_pass.yaml @@ -41,3 +41,7 @@ spec: - http: - name: Call httpbin service url: "http://httpbin-svc.default.svc" + checkRetries: + delay: 2s + maxRetries: 5 + interval: 3s