Skip to content

Commit

Permalink
feat(healthcontroller): avoid rolling update on minor version upgrades (
Browse files Browse the repository at this point in the history
#635)

When checking for rooms with mismatch in scheduler version, do not
perform rolling update if all mismatches are from minor versions.
  • Loading branch information
hspedro authored Aug 15, 2024
1 parent adadff7 commit eeb05b5
Show file tree
Hide file tree
Showing 3 changed files with 132 additions and 4 deletions.
9 changes: 7 additions & 2 deletions docs/reference/RollingUpdate.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Rolling Update

When the scheduler is updated, minor of major, the switch version operation will
When the scheduler is updated, only minor, the switch version operation will
simply change the new active version in the database. Thus, who is responsible for
actually performing runtime changes to enforce that all existing Game Rooms are from
the active schedulers is the _health_controller_ operation.
Expand Down Expand Up @@ -72,7 +72,12 @@ becomes ready, the new version is marked as safe to be deployed
4. _switch_version_ operation starts by the worker, simply changing the active
version in the database
5. _health_controller_ operation runs and check if there is any existing game room
that is not from the active scheduler version
that is not from the active scheduler version. More details on how we check for it below.
1. Get all rooms from that scheduler
2. For each room check if the version is not equal to the current active scheduler version
3. Check if we have compared with this version before. If not, get the scheduler entity for that old version
4. Compare the old version with the new one, if it's a major one add it to the comparison map
5. At the end of the rooms loop, if we do not find any major change return false when checking for a rolling update. If we found a major change, append the occupied and then ready rooms to the array
6. If it does not have, run normal autoscale. If it has, it is performing a rolling
update, proceed with the update
7. The update checks how many rooms it can spawn by computing the below:
Expand Down
27 changes: 26 additions & 1 deletion internal/core/operations/healthcontroller/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ import (
"strings"
"time"

"github.com/topfreegames/maestro/internal/core/filters"
"github.com/topfreegames/maestro/internal/core/operations/rooms/add"
"github.com/topfreegames/maestro/internal/core/operations/rooms/remove"

Expand Down Expand Up @@ -408,19 +409,43 @@ func (ex *Executor) checkRollingUpdate(
) ([]string, bool) {
logger.Debug("checking if system is in the middle of a rolling update of scheduler")
var roomsPreviousScheduler, occupiedRoomsPreviousScheduler []string
majorVersionComparison := make(map[string]bool)
var hasMajorVersionUpdate bool
// TODO: build this struct during findAvailableAndExpiredRooms() call
for _, roomID := range availableRoomsIDs {
room, err := ex.roomStorage.GetRoom(ctx, scheduler.Name, roomID)
// if err != nil we will miss the room, the system can still recover itself in
// the next health_controller operation
if err == nil && room.Version != scheduler.Spec.Version {
if err != nil || room.Version == scheduler.Spec.Version {
continue
}

// Check against the cache to avoid unnecessary calls to the storage
if _, ok := majorVersionComparison[room.Version]; !ok {
// Defaults to false when creating the map
majorVersionComparison[room.Version] = false
roomScheduler, err := ex.schedulerStorage.GetSchedulerWithFilter(ctx, &filters.SchedulerFilter{
Name: scheduler.Name,
Version: room.Version,
})
if err == nil {
majorVersionComparison[room.Version] = roomScheduler.IsMajorVersion(scheduler)
}
}

if majorVersionComparison[room.Version] {
hasMajorVersionUpdate = true
if room.Status == game_room.GameStatusOccupied {
occupiedRoomsPreviousScheduler = append(occupiedRoomsPreviousScheduler, roomID)
} else {
roomsPreviousScheduler = append(roomsPreviousScheduler, roomID)
}
}
}
if !hasMajorVersionUpdate {
logger.Info("system only has rooms in previous minor scheduler versions, skipping rolling update")
return []string{}, false
}
// Append occupied to the end so when deleting we prioritize non-occupied rooms
roomsPreviousScheduler = append(roomsPreviousScheduler, occupiedRoomsPreviousScheduler...)
logger.Info("rooms that did not match current scheduler versions", zap.Int("rooms", len(roomsPreviousScheduler)))
Expand Down
100 changes: 99 additions & 1 deletion internal/core/operations/healthcontroller/executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"testing"
"time"

"github.com/topfreegames/maestro/internal/core/filters"
"github.com/topfreegames/maestro/internal/core/operations/rooms/add"
"github.com/topfreegames/maestro/internal/core/operations/rooms/remove"

Expand Down Expand Up @@ -1306,7 +1307,98 @@ func TestSchedulerHealthController_Execute(t *testing.T) {
},
},
{
title: "room scheduler version do not match current scheduler, start rolling update and not autoscale",
title: "do not start rolling update if there are only rooms with a minor difference in version",
definition: &healthcontroller.Definition{},
executionPlan: executionPlan{
tookAction: false,
planMocks: func(
roomStorage *mockports.MockRoomStorage,
roomManager *mockports.MockRoomManager,
instanceStorage *mockports.MockGameRoomInstanceStorage,
schedulerStorage *mockports.MockSchedulerStorage,
operationManager *mockports.MockOperationManager,
autoscaler *mockports.MockAutoscaler,
) {
gameRoomIDs := []string{"room-1", "room-2"}
instances := []*game_room.Instance{
{
ID: "room-1",
Status: game_room.InstanceStatus{
Type: game_room.InstanceReady,
},
},
{
ID: "room-2",
Status: game_room.InstanceStatus{
Type: game_room.InstanceReady,
},
},
}
// Simulate change in autoscaling, its considered minor
newScheduler := newValidScheduler(&autoscaling.Autoscaling{
Enabled: true,
Min: 2,
Max: 10,
Cooldown: 60,
Policy: autoscaling.Policy{
Type: autoscaling.RoomOccupancy,
Parameters: autoscaling.PolicyParameters{
RoomOccupancy: &autoscaling.RoomOccupancyParams{
DownThreshold: 0.99,
},
},
},
})
newScheduler.Spec.Version = "v1.1"
gameRoom1 := &game_room.GameRoom{
ID: gameRoomIDs[0],
SchedulerID: genericSchedulerAutoscalingEnabled.Name,
Status: game_room.GameStatusReady,
LastPingAt: time.Now(),
CreatedAt: time.Now(),
Version: genericSchedulerAutoscalingEnabled.Spec.Version,
}
gameRoom2 := &game_room.GameRoom{
ID: gameRoomIDs[1],
SchedulerID: newScheduler.Name,
Status: game_room.GameStatusOccupied,
LastPingAt: time.Now(),
Version: newScheduler.Spec.Version,
}

// load
roomStorage.EXPECT().GetAllRoomIDs(gomock.Any(), gomock.Any()).Return(gameRoomIDs, nil)
instanceStorage.EXPECT().GetAllInstances(gomock.Any(), gomock.Any()).Return(instances, nil)

// findAvailableAndExpiredRooms
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[0]).Return(gameRoom1, nil)
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[1]).Return(gameRoom2, nil)
operationManager.EXPECT().AppendOperationEventToExecutionHistory(gomock.Any(), gomock.Any(), gomock.Any()).Times(1)

// getDesiredNumberOfRooms
schedulerStorage.EXPECT().GetScheduler(gomock.Any(), gomock.Any()).Return(newScheduler, nil)
autoscaler.EXPECT().CalculateDesiredNumberOfRooms(gomock.Any(), newScheduler).Return(2, nil)

// Check for rolling update
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[0]).Return(gameRoom1, nil)
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[1]).Return(gameRoom2, nil)
schedulerStorage.EXPECT().GetSchedulerWithFilter(gomock.Any(), &filters.SchedulerFilter{
Name: genericSchedulerAutoscalingEnabled.Name,
Version: genericSchedulerAutoscalingEnabled.Spec.Version,
}).Return(genericSchedulerAutoscalingEnabled, nil)

_ = operation.New(newScheduler.Name, definition.Name(), nil)

// Do not perform rolling update
operationManager.EXPECT().CreatePriorityOperation(gomock.Any(), newScheduler.Name, &add.Definition{Amount: 0}).Times(0)
roomStorage.EXPECT().GetRoomIDsByStatus(gomock.Any(), newScheduler.Name, game_room.GameStatusOccupied).Times(0)
roomStorage.EXPECT().GetRoomIDsByStatus(gomock.Any(), newScheduler.Name, game_room.GameStatusReady).Times(0)
operationManager.EXPECT().CreateOperation(gomock.Any(), newScheduler.Name, &remove.Definition{RoomsIDs: gameRoomIDs, Reason: remove.RollingUpdateReplace}).Times(0)
},
},
},
{
title: "start rolling update if there are rooms with a major difference in version",
definition: &healthcontroller.Definition{},
executionPlan: executionPlan{
tookAction: true,
Expand Down Expand Up @@ -1335,6 +1427,8 @@ func TestSchedulerHealthController_Execute(t *testing.T) {
}
newScheduler := newValidScheduler(&autoscalingEnabled)
newScheduler.Spec.Version = "v2"
// TerminationGracePeriod field is considered a major change
newScheduler.Spec.TerminationGracePeriod += 1
gameRoom1 := &game_room.GameRoom{
ID: gameRoomIDs[0],
SchedulerID: genericSchedulerAutoscalingEnabled.Name,
Expand Down Expand Up @@ -1366,6 +1460,10 @@ func TestSchedulerHealthController_Execute(t *testing.T) {
// Check for rolling update
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[0]).Return(gameRoom1, nil)
roomStorage.EXPECT().GetRoom(gomock.Any(), newScheduler.Name, gameRoomIDs[1]).Return(gameRoom2, nil)
schedulerStorage.EXPECT().GetSchedulerWithFilter(gomock.Any(), &filters.SchedulerFilter{
Name: genericSchedulerAutoscalingEnabled.Name,
Version: genericSchedulerAutoscalingEnabled.Spec.Version,
}).Return(genericSchedulerAutoscalingEnabled, nil)

op := operation.New(newScheduler.Name, definition.Name(), nil)

Expand Down

0 comments on commit eeb05b5

Please sign in to comment.