diff --git a/docs/reference/RollingUpdate.md b/docs/reference/RollingUpdate.md index 6ff30e368..7ed2da6b7 100644 --- a/docs/reference/RollingUpdate.md +++ b/docs/reference/RollingUpdate.md @@ -77,7 +77,9 @@ that is not from the active scheduler version update, proceed with the update 7. The update checks how many rooms it can spawn by computing the below ``` -maxSurge * totalAvailableRooms (pending + unready + ready + occupied) +Maximum Number of Rooms at any point: desiredRoomsComputedByAutoscale + maxSurge +How many rooms can we spawn? desiredRoomsComputedByAutoscale + maxSurge - totalAvailableRooms (ready + pending + unready) +Can we spawn more than the actual maxSurge? If so, cap to the surge ``` 8. Enqueues a priority _add_room_ operation to create the surge amount 9. Check how many old rooms it can delete by computing @@ -118,55 +120,61 @@ throughout the cycle and rolling update will adjust to that as well. | **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | |----------|-----------|--------------|---------------|-------------|------------------|-------------|-----------------| -| **1** | 20 | 5 | 25 (0 new) | 10 | 5 | 7 | 15 | -| **2** | 12 | 5 | 17 (7 new) | 10 | 5 | 4 | 7 | -| **3** | 9 | 5 | 14 (11 new) | 10 | 5 | 3 | 3 (4 actually) | -| **4** | 9 | 5 | 14 (14 new) | 10 | 5 | - | 4 by autoscale | -| **5** | 5 | 5 | 10 (10 new) | 10 | 5 | - | - | +| **1** | 20 | 5 | 25 (0 new) | 10 | 5 | 1 | 15 | +| **2** | 6 | 5 | 11 (1 new) | 10 | 5 | 2 | 1 | +| **3** | 7 | 5 | 12 (3 new) | 10 | 5 | 1 | 2 | +| **4** | 6 | 5 | 11 (4 new) | 10 | 5 | 1 | 1 | +| **5** | 5 | 5 | 10 (5 new) | 10 | 5 | 3 | 0 | +| **5** | 8 | 5 | 13 (8 new) | 10 | 5 | 1 | 3 | +| **6** | 6 | 5 | 11 (9 new) | 10 | 5 | 2 | 1 | +| **7** | 7 | 5 | 12 (11 new) | 10 | 5 | 1 | 1 (actual 2) | +| **8** | 7 | 5 | 12 (12 new) | 10 | 5 | - | - | ### Upscale -| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | -|----------|-----------|--------------|---------------|-------------|------------------|-------------|-----------------| -| **1** | 5 | 20 | 25 (0 new) | 40 | 20 | 7 | 0 | -| **2** | 12 | 20 | 32 (7 new) | 40 | 20 | 8 | 0 | -| **3** | 20 | 20 | 40 (15 new) | 40 | 20 | 10 | 0 | -| **4** | 30 | 20 | 50 (25 new) | 40 | 20 | 13 | 10 | -| **5** | 33 | 20 | 53 (38 new) | 40 | 20 | 14 | 13 | -| **6** | 34 | 20 | 54 (52 new) | 40 | 20 | 14 | 2 (actual 14) | -| **7** | 46 | 20 | 66 (66 new) | 40 | 20 | - | 26 by autoscale | -| **8** | 20 | 20 | 40 (40 new) | 40 | 20 | - | - | +| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | +|----------|-----------|--------------|---------------|-------------|------------------|---------------|-----------------| +| **1** | 5 | 20 | 25 (0 new) | 40 | 20 | 10 (actual 25)| 0 | +| **2** | 15 | 20 | 35 (10 new) | 40 | 20 | 10 (actual 15)| 0 | +| **3** | 25 | 20 | 45 (20 new) | 40 | 20 | 5 | 5 | +| **4** | 25 | 20 | 45 (25 new) | 40 | 20 | 5 | 5 | +| **5** | 25 | 20 | 45 (30 new) | 40 | 20 | 5 | 5 | +| **6** | 25 | 20 | 45 (35 new) | 40 | 20 | 5 | 5 | +| **7** | 25 | 20 | 45 (40 new) | 40 | 20 | 5 | 5 | +| **8** | 25 | 20 | 45 (45 new) | 40 | 20 | 5 | 5 | +| **8** | 25 | 20 | 45 (45 new) | 40 | 20 | - | - | ## Big Amount of Game Rooms ### Upscale -* readyTarget: 0.4 -> 0.7 +* readyTarget: 0.25 -> 0.5 * maxSurge: 25% -| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | -|----------|-----------|--------------|-----------------|-------------|------------------|-------------|------------------| -| **1** | 209 | 458 | 667 (0 new) | 1526 | 1068 | 167 | 0 | -| **2** | 376 | 458 | 834 (167 new) | 1526 | 1068 | 209 | 0 | -| **3** | 585 | 458 | 1043 (376 new) | 1526 | 1068 | 261 | 0 | -| **4** | 846 | 458 | 1304 (637 new) | 1526 | 1068 | 326 | 0 | -| **5** | 1172 | 458 | 1630 (963 new) | 1526 | 1068 | 408 | 104 | -| **6** | 1476 | 458 | 1934 (1371 new) | 1526 | 1068 | 484 | 408 | -| **7** | 1552 | 458 | 2010 (1855 new) | 1526 | 1068 | 503 | 155 | -| **8** | 2055 | 458 | 2513 (2513 new) | 1526 | 1068 | - | 987 by autoscale | -| **9** | 1068 | 458 | 1526 (1526 new) | 1526 | 1068 | - | - | +| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | +|----------|-----------|--------------|-----------------|-------------|------------------|------------------|------------------| +| **1** | 200 | 400 | 600 (0 new) | 800 | 400 | 200 (actual 400) | 0 | +| **2** | 400 | 400 | 800 (200 new) | 800 | 400 | 200 | 0 | +| **3** | 600 | 400 | 1000 (400 new) | 800 | 400 | 0 | 200 | +| **4** | 400 | 400 | 800 (400 new) | 800 | 400 | 200 | 0 | +| **5** | 600 | 400 | 1000 (600 new) | 800 | 400 | 0 | 200 | +| **6** | 400 | 400 | 800 (600 new) | 800 | 400 | 200 | 0 | +| **7** | 600 | 400 | 1000 (800 new) | 800 | 400 | 0 | 200 | +| **8** | 400 | 400 | 800 (800 new) | 800 | 400 | - | - | ### Downscale -* readyTarget: 0.7 -> 0.4 +* readyTarget: 0.5 -> 0.25 * maxSurge: 25% -| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | -|----------|-----------|--------------|-----------------|-------------|------------------|-------------|-------------------| -| **1** | 940 | 1040 | 1980 (0 new) | 1733 | 693 | 495 | 247 | -| **2** | 1188 | 1040 | 2228 (495 new) | 1733 | 693 | 557 | 495 | -| **3** | 1250 | 1040 | 2290 (1052 new) | 1733 | 693 | 573 | 557 | -| **4** | 1266 | 1040 | 2306 (1625 new) | 1733 | 693 | 577 | 573 | -| **5** | 1270 | 1040 | 2310 (2202 new) | 1733 | 693 | 578 | 108 (577) | -| **6** | 1740 | 1040 | 2780 (2780 new) | 1733 | 693 | - | 1047 by autoscale | -| **7** | 693 | 1040 | 1733 (1733 new) | 1733 | 693 | - | - | +| **loop** | **ready** | **occupied** | **available** | **desired** | **desiredReady** | **toSurge** | **toBeDeleted** | +|----------|-----------|--------------|-----------------|-------------|------------------|------------------|-------------------| +| **1** | 900 | 1000 | 1900 (0 new) | 1250 | 250 | 1 | 750 | +| **2** | 251 | 1000 | 1001 (1 new) | 1250 | 250 | 312 (actual 562) | 1 | +| **3** | 562 | 1000 | 1562 (313 new) | 1250 | 250 | 262 | 312 | +| **4** | 512 | 1000 | 1512 (575 new) | 1250 | 250 | 312 | 262 | +| **5** | 562 | 1000 | 1562 (887 new) | 1250 | 250 | 1 | 312 | +| **6** | 251 | 1000 | 1251 (888 new) | 1250 | 250 | 249 | 1 | +| **7** | 499 | 1000 | 1499 (1137 new) | 1250 | 250 | 64 | 249 | +| **8** | 314 | 1000 | 1314 (1314 new) | 1250 | 250 | 249 | 13 (actual 64) | +| **9** | 550 | 1000 | 1550 (1314 new) | 1250 | 250 | - |- | diff --git a/internal/core/operations/healthcontroller/executor.go b/internal/core/operations/healthcontroller/executor.go index d33509a46..213e04ff4 100644 --- a/internal/core/operations/healthcontroller/executor.go +++ b/internal/core/operations/healthcontroller/executor.go @@ -25,6 +25,9 @@ package healthcontroller import ( "context" "fmt" + "math" + "strconv" + "strings" "time" "github.com/topfreegames/maestro/internal/core/operations/rooms/add" @@ -40,6 +43,11 @@ import ( "go.uber.org/zap" ) +const ( + minSchedulerMaxSurge = 1 + schedulerMaxSurgeRelativeSymbol = "%" +) + // Config have the configs to execute healthcontroller. type Config struct { RoomInitializationTimeout time.Duration @@ -426,7 +434,7 @@ func (ex *Executor) performRollingUpdate( roomsWithPreviousSchedulerVersion []string, ) error { logger.Info("performing rolling update", zap.String("scheduler.Version", scheduler.Spec.Version)) - maxSurgeAmount, err := ex.roomManager.SchedulerMaxSurge(ctx, scheduler) + maxSurgeAmount, err := ComputeRollingSurge(scheduler, desiredNumberOfRooms, len(availableRoomsIDs)) if err != nil { logger.Error("failed to perform rolling update while getting max surge amount of rooms", zap.Error(err)) return err @@ -434,9 +442,6 @@ func (ex *Executor) performRollingUpdate( if len(roomsWithPreviousSchedulerVersion) < maxSurgeAmount { maxSurgeAmount = len(roomsWithPreviousSchedulerVersion) } - if maxSurgeAmount <= 0 { - maxSurgeAmount = 1 - } logger.Info( "upscaling new rooms", zap.Int("desired", desiredNumberOfRooms), @@ -521,3 +526,25 @@ func (ex *Executor) markPreviousSchedulerRoomsForDeletion( logger.Sugar().Infof("successfully marked %d rooms for deletion", bufferRoomsToBeRemoved) return roomsWithPreviousSchedulerVersion[:bufferRoomsToBeRemoved], nil } + +func ComputeRollingSurge(scheduler *entities.Scheduler, desiredNumberOfRooms int, totalRoomsAmount int) (maxSurgeNum int, err error) { + maxSurgeNum = minSchedulerMaxSurge + + if scheduler.MaxSurge != "" { + isRelative := strings.HasSuffix(scheduler.MaxSurge, schedulerMaxSurgeRelativeSymbol) + maxSurgeNum, err = strconv.Atoi(strings.TrimSuffix(scheduler.MaxSurge, schedulerMaxSurgeRelativeSymbol)) + if err != nil { + return minSchedulerMaxSurge, fmt.Errorf("failed to parse max surge into a number: %w", err) + } + if isRelative { + absoluteNum := math.Round((float64(desiredNumberOfRooms) / 100) * float64(maxSurgeNum)) + maxSurgeNum = int(math.Max(minSchedulerMaxSurge, absoluteNum)) + } + } + + maxRoomsToSurge := desiredNumberOfRooms + maxSurgeNum - totalRoomsAmount + if maxRoomsToSurge > maxSurgeNum { + return maxSurgeNum, nil + } + return maxRoomsToSurge, nil +} diff --git a/internal/core/operations/healthcontroller/executor_test.go b/internal/core/operations/healthcontroller/executor_test.go index 5f661bfab..77c119fd3 100644 --- a/internal/core/operations/healthcontroller/executor_test.go +++ b/internal/core/operations/healthcontroller/executor_test.go @@ -1370,7 +1370,6 @@ func TestSchedulerHealthController_Execute(t *testing.T) { op := operation.New(newScheduler.Name, definition.Name(), nil) // Perform rolling update - roomManager.EXPECT().SchedulerMaxSurge(gomock.Any(), newScheduler).Return(1, nil) operationManager.EXPECT().CreatePriorityOperation(gomock.Any(), newScheduler.Name, &add.Definition{Amount: 1}).Return(op, nil) operationManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), gomock.Any(), gomock.Any()).Times(2) roomStorage.EXPECT().GetRoomIDsByStatus(gomock.Any(), newScheduler.Name, game_room.GameStatusOccupied).Return(gameRoomIDs, nil) @@ -1467,3 +1466,69 @@ func newValidScheduler(autoscaling *autoscaling.Autoscaling) *entities.Scheduler Autoscaling: autoscaling, } } + +func TestComputeRollingSurgeVariants(t *testing.T) { + testCases := []struct { + name string + maxSurge string + desiredNumberOfRooms int + totalRoomsAmount int + expectedSurgeAmount int + expectError bool + }{ + { + name: "Standard surge calculation", + maxSurge: "10", + desiredNumberOfRooms: 20, + totalRoomsAmount: 15, + expectedSurgeAmount: 10, + expectError: false, + }, + { + name: "High amount of rooms above 1000", + maxSurge: "30%", + desiredNumberOfRooms: 1500, + totalRoomsAmount: 1000, + expectedSurgeAmount: 450, // It can surge up to 950 but we cap to the maxSurge of 450 + expectError: false, + }, + { + name: "Relative maxSurge with rooms above 1000", + maxSurge: "25%", + desiredNumberOfRooms: 1200, + totalRoomsAmount: 1000, + expectedSurgeAmount: 300, // 25% of 1200 is 300 + expectError: false, + }, + { + name: "Invalid maxSurge string format", + maxSurge: "twenty%", // Invalid because it's not a number + desiredNumberOfRooms: 300, + totalRoomsAmount: 250, + expectedSurgeAmount: 1, // Expected to be 1 because the function should return an error + expectError: true, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + scheduler := &entities.Scheduler{ + MaxSurge: tc.maxSurge, + } + + surgeAmount, err := healthcontroller.ComputeRollingSurge(scheduler, tc.desiredNumberOfRooms, tc.totalRoomsAmount) + if tc.expectError { + if err == nil { + t.Errorf("expected an error but got none") + } + } else { + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if surgeAmount != tc.expectedSurgeAmount { + t.Errorf("expected surge amount to be %d, but got %d", tc.expectedSurgeAmount, surgeAmount) + } + } + }) + } +} diff --git a/internal/core/worker/operationexecution/operation_execution_worker_test.go b/internal/core/worker/operationexecution/operation_execution_worker_test.go index faea451da..03b35ccef 100644 --- a/internal/core/worker/operationexecution/operation_execution_worker_test.go +++ b/internal/core/worker/operationexecution/operation_execution_worker_test.go @@ -603,7 +603,7 @@ func TestSchedulerOperationsExecutionLoop(t *testing.T) { pendingOpsChan := make(chan string) operationManager.EXPECT().PendingOperationsChan(gomock.Any(), gomock.Any()).Return(pendingOpsChan) - operationManager.EXPECT().CreateOperation(gomock.Any(), scheduler.Name, &healthcontroller.Definition{}).Return(&operation.Operation{}, nil).MaxTimes(5) + operationManager.EXPECT().CreateOperation(gomock.Any(), scheduler.Name, &healthcontroller.Definition{}).Return(&operation.Operation{}, nil).MaxTimes(6) ctx, cancel := context.WithCancel(context.Background())