Skip to content

Commit

Permalink
Allow to pause failover at runtime (#1783)
Browse files Browse the repository at this point in the history
Co-authored-by: Michael Filonenko <[email protected]>
Co-authored-by: Patience Daur <[email protected]>
  • Loading branch information
3 people authored Mar 23, 2022
1 parent 97de7ef commit 44726e6
Show file tree
Hide file tree
Showing 11 changed files with 230 additions and 41 deletions.
4 changes: 3 additions & 1 deletion CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,9 @@ Added
- Export vshard config in Lua API (#1761).

- New ``failover_promote`` option ``skip_error_on_change`` to skip etcd error
when vclockkeeper was changed between ``set_vclockkeeper`` calls.
when vclockkeeper was changed between ``set_vclockkeeper`` calls (#1399).

- Allow pausing failover at runtime via Lua API and GraphQL (#1763).

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Changed
Expand Down
23 changes: 23 additions & 0 deletions cartridge/failover.lua
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,7 @@ vars:new('options', {
LONGPOLL_TIMEOUT = 30,
NETBOX_CALL_TIMEOUT = 1,
})
-- Runtime pause flag, false by default. While it is true, failover_loop
-- logs a warning and skips applying the received appointments.
vars:new('failover_paused', false)

function _G.__cartridge_failover_get_lsn(timeout)
box.ctl.wait_ro(timeout)
Expand All @@ -89,6 +90,14 @@ function _G.__cartridge_failover_wait_rw(timeout)
return errors.pcall('WaitRwError', box.ctl.wait_rw, timeout)
end

--- Mark failover as paused on the current instance.
-- Only flips the `failover_paused` module variable to true.
-- Invoked by name through `pool.map_call` from the Lua API `pause()`.
-- @function __cartridge_failover_pause
-- @local
_G.__cartridge_failover_pause = function()
    vars.failover_paused = true
end

--- Mark failover as resumed on the current instance.
-- Only flips the `failover_paused` module variable back to false.
-- Invoked by name through `pool.map_call` from the Lua API `resume()`.
-- @function __cartridge_failover_resume
-- @local
_G.__cartridge_failover_resume = function()
    vars.failover_paused = false
end

local reconfigure_all -- function implemented below

--- Cancel all pending reconfigure_all tasks.
Expand Down Expand Up @@ -597,6 +606,11 @@ local function failover_loop(args)

vars.failover_err = nil

if vars.failover_paused == true then
log.warn("Failover is paused, appointments don't apply")
goto continue
end

if accept_appointments(appointments) then
local id = schedule_add()
log.info(
Expand Down Expand Up @@ -811,6 +825,14 @@ local function is_vclockkeeper()
return vars.cache.is_vclockkeeper
end

--- Check whether failover is paused on the current instance.
-- Reads the `failover_paused` module variable.
-- @function is_paused
-- @local
-- @treturn boolean true if paused, false otherwise
local function is_paused()
    local paused = vars.failover_paused
    return paused
end

--- Check if current configuration implies consistent switchover.
-- @function consistency_needed
-- @local
Expand Down Expand Up @@ -935,6 +957,7 @@ return {
is_vclockkeeper = is_vclockkeeper,
is_leader = is_leader,
is_rw = is_rw,
is_paused = is_paused,

force_inconsistency = force_inconsistency,
wait_consistency = wait_consistency,
Expand Down
63 changes: 63 additions & 0 deletions cartridge/lua-api/failover.lua
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,18 @@

local checks = require('checks')
local errors = require('errors')
local fun = require('fun')

local twophase = require('cartridge.twophase')
local topology = require('cartridge.topology')
local confapplier = require('cartridge.confapplier')
local failover = require('cartridge.failover')
local rpc = require('cartridge.rpc')
local pool = require('cartridge.pool')

local FailoverSetParamsError = errors.new_class('FailoverSetParamsError')
local PromoteLeaderError = errors.new_class('PromoteLeaderError')
local FailoverPauseError = errors.new_class('FailoverPauseError')

--- Get failover configuration.
--
Expand Down Expand Up @@ -247,10 +250,70 @@ local function promote(replicaset_leaders, opts)
return true
end

--- Pause failover across the cluster at runtime.
-- Useful in case of "failover storms", when failover triggers
-- too many times per minute.
--
-- Calls `_G.__cartridge_failover_pause` on every server that passes
-- the `topology.not_disabled` filter. If the call fails anywhere,
-- the returned error includes the underlying cause.
--
-- @function pause
--
-- @treturn[1] boolean true On success
-- @treturn[2] nil
-- @treturn[2] table Error description
local function pause()
    local topology_cfg = confapplier.get_readonly('topology')

    if topology_cfg == nil then
        -- NOTE(review): error class kept as-is so existing callers keep
        -- working; FailoverPauseError looks like the intended class.
        return nil, FailoverSetParamsError:new("Current instance isn't bootstrapped yet")
    end

    -- Collect URIs of all servers that are not disabled
    local uri_list = {}
    local refined_uri_list = topology.refine_servers_uri(topology_cfg)
    for _, uuid, _ in fun.filter(topology.not_disabled, topology_cfg.servers) do
        table.insert(uri_list, refined_uri_list[uuid])
    end

    local _, err = pool.map_call('_G.__cartridge_failover_pause', nil, { uri_list = uri_list })

    if err ~= nil then
        -- Don't discard the cause: report it along with the generic hint.
        -- Passed as a format argument so that a '%' inside it is harmless.
        local reason = (type(err) == 'table' and err.err) or err
        return nil, FailoverPauseError:new(
            "Failover pausing failed, probably some of instances are not healthy: %s",
            tostring(reason)
        )
    end
    return true
end

--- Resume failover across the cluster at runtime after ``pause``.
-- Don't forget to resume your failover after pausing it.
--
-- Calls `_G.__cartridge_failover_resume` on every server that passes
-- the `topology.not_disabled` filter. If the call fails anywhere,
-- the returned error includes the underlying cause.
--
-- @function resume
--
-- @treturn[1] boolean true On success
-- @treturn[2] nil
-- @treturn[2] table Error description
local function resume()
    local topology_cfg = confapplier.get_readonly('topology')

    if topology_cfg == nil then
        -- NOTE(review): error class kept as-is so existing callers keep
        -- working; FailoverPauseError looks like the intended class.
        return nil, FailoverSetParamsError:new("Current instance isn't bootstrapped yet")
    end

    -- Collect URIs of all servers that are not disabled
    local uri_list = {}
    local refined_uri_list = topology.refine_servers_uri(topology_cfg)
    for _, uuid, _ in fun.filter(topology.not_disabled, topology_cfg.servers) do
        table.insert(uri_list, refined_uri_list[uuid])
    end

    local _, err = pool.map_call('_G.__cartridge_failover_resume', nil, { uri_list = uri_list })

    if err ~= nil then
        -- Don't discard the cause: report it along with the generic hint.
        -- Passed as a format argument so that a '%' inside it is harmless.
        local reason = (type(err) == 'table' and err.err) or err
        return nil, FailoverPauseError:new(
            "Failover resuming failed, probably some of instances are not healthy: %s",
            tostring(reason)
        )
    end
    return true
end

-- Module exports. `pause` / `resume` switch failover off and on at runtime.
return {
get_params = get_params,
set_params = set_params,
promote = promote,
pause = pause,
resume = resume,
get_failover_enabled = get_failover_enabled, -- deprecated
set_failover_enabled = set_failover_enabled, -- deprecated
}
2 changes: 1 addition & 1 deletion cartridge/pool.lua
Original file line number Diff line number Diff line change
Expand Up @@ -164,7 +164,7 @@ end
-- @tparam {string,...} opts.uri_list
-- array of URIs for performing remote call
-- @tparam ?number opts.timeout
-- passed to `net.box` `conn:call()`
-- passed to `net.box` `conn:call()` (default: 10)
--
-- @treturn {URI=value,...}
-- Call results mapping for every URI.
Expand Down
2 changes: 1 addition & 1 deletion cartridge/webui/api-config.lua
Original file line number Diff line number Diff line change
Expand Up @@ -363,7 +363,7 @@ local function init(graphql, httpd, opts)
graphql.add_mutation({
prefix = 'cluster',
name = 'config',
doc = 'Applies updated config on cluster',
doc = 'Applies updated config on the cluster',
args = {
sections = gql_types.list(gql_type_section_input)
},
Expand Down
2 changes: 1 addition & 1 deletion cartridge/webui/api-ddl.lua
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ local function init(graphql)
graphql.add_mutation({
prefix = 'cluster',
name = 'check_schema',
doc = 'Checks that schema can be applied on cluster',
doc = 'Checks that the schema can be applied on the cluster',
args = {
as_yaml = gql_types.string.nonNull,
},
Expand Down
26 changes: 26 additions & 0 deletions cartridge/webui/api-failover.lua
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,13 @@ local function set_failover_enabled(_, args)
return lua_api_failover.set_failover_enabled(args.enabled)
end

--- GraphQL callback of the `failover_pause` mutation.
-- Takes no arguments and returns whatever `lua_api_failover.pause()` returns.
local function pause()
return lua_api_failover.pause()
end
--- GraphQL callback of the `failover_resume` mutation.
-- Takes no arguments and returns whatever `lua_api_failover.resume()` returns.
local function resume()
return lua_api_failover.resume()
end

local function promote(_, args)
local replicaset_uuid = args['replicaset_uuid']
local instance_uuid = args['instance_uuid']
Expand Down Expand Up @@ -167,6 +174,23 @@ local function init(graphql)
kind = gql_types.boolean.nonNull,
callback = module_name .. '.promote',
})

graphql.add_mutation({
prefix = 'cluster',
name = 'failover_pause',
doc = 'Pause failover',
args = {},
kind = gql_types.boolean.nonNull,
callback = module_name .. '.pause',
})
graphql.add_mutation({
prefix = 'cluster',
name = 'failover_resume',
doc = 'Resume failover after pausing',
args = {},
kind = gql_types.boolean.nonNull,
callback = module_name .. '.resume',
})
end

return {
Expand All @@ -176,4 +200,6 @@ return {
get_failover_params = get_failover_params,
set_failover_params = set_failover_params,
promote = promote,
pause = pause,
resume = resume,
}
2 changes: 1 addition & 1 deletion cartridge/webui/api-topology.lua
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ local function init(graphql)
graphql.add_mutation({
prefix = 'cluster',
name = 'restart_replication',
doc = 'Restart replication on specified by uuid servers',
doc = 'Restart replication on servers specified by uuid',
args = {
uuids = gql_types.list(gql_types.string.nonNull),
},
Expand Down
78 changes: 42 additions & 36 deletions doc/schema.graphql
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# source: http://127.0.0.1:8081/admin/api
# timestamp: Thu Mar 10 2022 11:35:56 GMT+0300 (Moscow Standard Time)
# timestamp: Wed Mar 23 2022 18:11:53 GMT+0300 (Москва, стандартное время)

"""Custom scalar specification."""
directive @specifiedBy(
Expand Down Expand Up @@ -323,9 +323,6 @@ type Mutation {

"""Cluster management"""
type MutationApicluster {
"""Disable listed servers by uuid"""
disable_servers(uuids: [String!]): [Server]

"""Applies DDL schema on cluster"""
schema(as_yaml: String!): DDLSchema!

Expand Down Expand Up @@ -375,14 +372,24 @@ type MutationApicluster {
etcd2_params: FailoverStateProviderCfgInputEtcd2
): FailoverAPI!

"""Remove user"""
remove_user(username: String!): User
"""Pause failover"""
failover_pause: Boolean!
auth_params(
"""
The `Long` scalar type represents non-fractional signed whole numeric
values. Long can represent values from -(2^52) to 2^52 - 1, inclusive.
"""
cookie_max_age: Long

"""Checks that schema can be applied on cluster"""
check_schema(as_yaml: String!): DDLCheckResult!
"""The `Boolean` scalar type represents `true` or `false`."""
enabled: Boolean

"""Restart replication on specified by uuid servers"""
restart_replication(uuids: [String!]): Boolean
"""
The `Long` scalar type represents non-fractional signed whole numeric
values. Long can represent values from -(2^52) to 2^52 - 1, inclusive.
"""
cookie_renew_age: Long
): UserManagementAPI!
edit_vshard_options(
"""
The `Int` scalar type represents non-fractional signed whole numeric values.
Expand Down Expand Up @@ -427,51 +434,50 @@ type MutationApicluster {
"""
sched_move_quota: Long
): VshardGroup!
auth_params(
"""
The `Long` scalar type represents non-fractional signed whole numeric
values. Long can represent values from -(2^52) to 2^52 - 1, inclusive.
"""
cookie_max_age: Long

"""The `Boolean` scalar type represents `true` or `false`."""
enabled: Boolean
"""Restart replication on servers specified by uuid"""
restart_replication(uuids: [String!]): Boolean

"""
The `Long` scalar type represents non-fractional signed whole numeric
values. Long can represent values from -(2^52) to 2^52 - 1, inclusive.
"""
cookie_renew_age: Long
): UserManagementAPI!
"""Remove user"""
remove_user(username: String!): User

"""Create a new user"""
add_user(
password: String!
username: String!
"""Checks that the schema can be applied on the cluster"""
check_schema(as_yaml: String!): DDLCheckResult!

"""Applies updated config on the cluster"""
config(sections: [ConfigSectionInput]): [ConfigSection]!

"""Edit an existing user"""
edit_user(
"""
The `String` scalar type represents textual data, represented as UTF-8
character sequences. The String type is most often used by GraphQL to
represent free-form human-readable text.
"""
fullname: String
password: String
username: String!

"""
The `String` scalar type represents textual data, represented as UTF-8
character sequences. The String type is most often used by GraphQL to
represent free-form human-readable text.
"""
email: String
): User
fullname: String

"""Edit an existing user"""
edit_user(
"""
The `String` scalar type represents textual data, represented as UTF-8
character sequences. The String type is most often used by GraphQL to
represent free-form human-readable text.
"""
password: String
email: String
): User

"""Resume failover after pausing"""
failover_resume: Boolean!

"""Create a new user"""
add_user(
password: String!
username: String!

"""
Expand Down Expand Up @@ -507,8 +513,8 @@ type MutationApicluster {
"""Reapplies config on the specified nodes"""
config_force_reapply(uuids: [String]): Boolean!

"""Applies updated config on cluster"""
config(sections: [ConfigSectionInput]): [ConfigSection]!
"""Disable listed servers by uuid"""
disable_servers(uuids: [String!]): [Server]
}

type Query {
Expand Down
9 changes: 9 additions & 0 deletions rst/topics/failover.rst
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,15 @@ that yet. So, both S1 and S2 consider themselves as leaders.

Moreover, SWIM protocol isn't perfect and still can produce
false-negative gossips (announce the instance is dead when it's not).
It may cause "failover storms", when failover triggers too many times per minute
under a high load. You can pause failover at runtime using the Lua API
(``require('cartridge.lua-api.failover').pause()``) or the GraphQL mutation
(``mutation { cluster { failover_pause } }``). These functions pause
failover on every instance they can reach. To see if failover is paused,
check the logs or call ``require('cartridge.failover').is_paused()``.
Don't forget to resume failover afterwards using the Lua API
(``require('cartridge.lua-api.failover').resume()``) or the GraphQL mutation
(``mutation { cluster { failover_resume } }``).


.. _cartridge-stateful_failover:

Expand Down
Loading

0 comments on commit 44726e6

Please sign in to comment.