Skip to content

Commit

Permalink
improve finalising logic for canary release
Browse files Browse the repository at this point in the history
Signed-off-by: yunbo <[email protected]>
  • Loading branch information
Funinu committed Aug 13, 2024
1 parent 5378dc2 commit 5fe4b6f
Show file tree
Hide file tree
Showing 5 changed files with 267 additions and 37 deletions.
9 changes: 4 additions & 5 deletions api/v1beta1/rollout_types.go
Original file line number Diff line number Diff line change
Expand Up @@ -569,18 +569,17 @@ const (
// For Both BlueGreenStrategy and CanaryStrategy:
// set workload.pause=false, set workload.partition=0
FinalisingStepTypeBatchRelease FinalisingStepType = "PatchBatchRelease"
//TODO - Currently, the next three steps are in the same function, FinalisingTrafficRouting
// we should try to separate the FinalisingStepTypeGateway and FinalisingStepTypeCanaryService
// with graceful time to prevent some potential issues

// Restore the stable Service (i.e. remove corresponding selector)
FinalisingStepTypeStableService FinalisingStepType = "RestoreStableService"
// Execute the FinalisingTrafficRouting function
FinalisingStepTypeTrafficRouting FinalisingStepType = "FinalisingTrafficRouting"
// Restore the GatewayAPI/Ingress/Istio
FinalisingStepTypeGateway FinalisingStepType = "RestoreGateway"
// Delete Canary Service
FinalisingStepTypeDeleteCanaryService FinalisingStepType = "DeleteCanaryService"
// Delete Batch Release
FinalisingStepTypeDeleteBR FinalisingStepType = "DeleteBatchRelease"
// All needed work done
FinalisingStepTypeEnd FinalisingStepType = "END"
)

// +genclient
Expand Down
132 changes: 104 additions & 28 deletions pkg/controller/rollout/rollout_canary.go
Original file line number Diff line number Diff line change
Expand Up @@ -363,43 +363,86 @@ func (m *canaryReleaseManager) doCanaryJump(c *RolloutContext) (jumped bool) {

// cleanup after rollout is completed or finished
func (m *canaryReleaseManager) doCanaryFinalising(c *RolloutContext) (bool, error) {
canaryStatus := c.NewStatus.CanaryStatus
// when CanaryStatus is nil, the canary action hasn't started yet, so no cleanup is needed
if c.NewStatus.CanaryStatus == nil {
if canaryStatus == nil {
return true, nil
}
// 1. rollout progressing complete, remove rollout progressing annotation in workload
// rollout progressing complete, remove rollout progressing annotation in workload
err := m.removeRolloutProgressingAnnotation(c)
if err != nil {
return false, err
}
tr := newTrafficRoutingContext(c)
// 2. remove stable service the pod revision selector, so stable service will be selector all version pods.
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 3. set workload.pause=false; set workload.partition=0
done, err = m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
// 4. modify network api(ingress or gateway api) configuration, and route 100% traffic to stable pods.
done, err = m.trafficRoutingManager.FinalisingTrafficRouting(tr)
c.NewStatus.CanaryStatus.LastUpdateTime = tr.LastUpdateTime
if err != nil || !done {
return done, err
}
// 5. delete batchRelease crd
done, err = m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
return false, nil
// execute steps based on the predefined order for each reason
nextStep := nextTask(c.FinalizeReason, canaryStatus.FinalisingStep)
// if current step is empty, set it with the first step
// if current step is end, we just return
if len(canaryStatus.FinalisingStep) == 0 {
canaryStatus.FinalisingStep = nextStep
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
} else if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
klog.Infof("rollout(%s/%s) finalising process is already completed", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
}
klog.Infof("rollout(%s/%s) doCanaryFinalising success", c.Rollout.Namespace, c.Rollout.Name)
return true, nil
klog.Infof("rollout(%s/%s) Finalising Step is %s", c.Rollout.Namespace, c.Rollout.Name, canaryStatus.FinalisingStep)
// the order of the steps is maintained by nextStep
switch canaryStatus.FinalisingStep {
// call the FinalisingTrafficRouting function to:
// 1.restore stable service selector to select all pods
// 2.restore network api(ingress/ gateway api/ istio) configuration
// 3.delete canary service
case v1beta1.FinalisingStepTypeTrafficRouting:
done, err := m.trafficRoutingManager.FinalisingTrafficRouting(tr)
if err != nil || !done {
canaryStatus.LastUpdateTime = tr.LastUpdateTime
return done, err
}
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
// set workload.pause=false; set workload.partition=0
case v1beta1.FinalisingStepTypeBatchRelease:
done, err := m.finalizingBatchRelease(c)
if err != nil || !done {
return done, err
}
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
// delete batchRelease
case v1beta1.FinalisingStepTypeDeleteBR:
done, err := m.removeBatchRelease(c)
if err != nil {
klog.Errorf("rollout(%s/%s) Finalize batchRelease failed: %s", c.Rollout.Namespace, c.Rollout.Name, err.Error())
return false, err
} else if !done {
return false, nil
}
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
// restore the gateway resources (ingress/gatewayAPI/Istio), that means
// only stable Service will accept the traffic
case v1beta1.FinalisingStepTypeGateway:
retry, err := m.trafficRoutingManager.RestoreGateway(tr)
if err != nil || retry {
return false, err
}
canaryStatus.LastUpdateTime = &metav1.Time{Time: time.Now()}
canaryStatus.FinalisingStep = nextStep
if canaryStatus.FinalisingStep == v1beta1.FinalisingStepTypeEnd {
return true, nil
}
}

return false, nil
}

func (m *canaryReleaseManager) removeRolloutProgressingAnnotation(c *RolloutContext) error {
Expand Down Expand Up @@ -601,3 +644,36 @@ func (m *canaryReleaseManager) syncBatchRelease(br *v1beta1.BatchRelease, canary
}
return nil
}

// nextTask returns the finalising step that follows currentTask in the ordered
// step list selected by reason.
//
// An empty currentTask yields the first step of the list. If currentTask is the
// last step of the list, or does not appear in the list at all,
// v1beta1.FinalisingStepTypeEnd is returned.
func nextTask(reason string, currentTask v1beta1.FinalisingStepType) v1beta1.FinalisingStepType {
	// REVIEW - should we consider more complex scenarios?
	// like, user pauses Rollout and rollbacks the workload at the same time?

	// Default order, used for every reason other than rollback
	// (success/paused/disabled).
	steps := []v1beta1.FinalisingStepType{
		v1beta1.FinalisingStepTypeTrafficRouting, // remove selector of stable Service
		v1beta1.FinalisingStepTypeBatchRelease,   // scale up new, scale down old
		v1beta1.FinalisingStepTypeDeleteBR,
	}
	if reason == v1beta1.FinaliseReasonRollback {
		steps = []v1beta1.FinalisingStepType{
			v1beta1.FinalisingStepTypeGateway,        // route all traffic to stable version
			v1beta1.FinalisingStepTypeBatchRelease,   // scale up old, scale down new
			v1beta1.FinalisingStepTypeDeleteBR,       //
			v1beta1.FinalisingStepTypeTrafficRouting, // do cleaning works(restore stable Service, remove canary Service)
		}
	}

	// Nothing has been executed yet: start from the head of the list.
	if currentTask == "" {
		return steps[0]
	}

	// Only the steps before the last one have a successor, so the final
	// element is deliberately left out of the search.
	for idx, step := range steps[:len(steps)-1] {
		if step == currentTask {
			return steps[idx+1]
		}
	}
	return v1beta1.FinalisingStepTypeEnd
}
4 changes: 4 additions & 0 deletions pkg/controller/rollout/rollout_progressing.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,8 @@ type RolloutContext struct {
RecheckTime *time.Time
// wait stable workload pods ready
WaitReady bool
// finalising reason
FinalizeReason string
}

// parameter1 retryReconcile, parameter2 error
Expand Down Expand Up @@ -116,6 +118,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.WaitReady = true
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonSuccess
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand All @@ -140,6 +143,7 @@ func (r *RolloutReconciler) reconcileRolloutProgressing(rollout *v1beta1.Rollout
case v1alpha1.ProgressingReasonCancelling:
klog.Infof("rollout(%s/%s) is Progressing, and in reason(%s)", rollout.Namespace, rollout.Name, cond.Reason)
var done bool
rolloutContext.FinalizeReason = v1beta1.FinaliseReasonRollback
done, err = r.doFinalising(rolloutContext)
if err != nil {
return nil, err
Expand Down
Loading

0 comments on commit 5fe4b6f

Please sign in to comment.