diff --git a/addons/redis/redis-cluster-scripts/redis-cluster-common.sh b/addons/redis/redis-cluster-scripts/redis-cluster-common.sh index 74cf56e58..d6d73b3b3 100644 --- a/addons/redis/redis-cluster-scripts/redis-cluster-common.sh +++ b/addons/redis/redis-cluster-scripts/redis-cluster-common.sh @@ -700,4 +700,24 @@ execute_acl_save_with_retry() { return 1 fi return 0 +} + +check_redis_role() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + role_info=$(redis-cli -h $host -p $port info replication) + else + role_info=$(redis-cli -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + if echo "$role_info" | grep -q "^role:master"; then + echo "primary" + elif echo "$role_info" | grep -q "^role:slave"; then + echo "secondary" + else + echo "unknown" + fi } \ No newline at end of file diff --git a/addons/redis/redis-cluster-scripts/redis-cluster-switchover.sh b/addons/redis/redis-cluster-scripts/redis-cluster-switchover.sh new file mode 100644 index 000000000..3a512e86a --- /dev/null +++ b/addons/redis/redis-cluster-scripts/redis-cluster-switchover.sh @@ -0,0 +1,230 @@ +#!/bin/bash + +# This is magic for shellspec ut framework. "test" is a `test [expression]` well known as a shell command. +# Normally test without [expression] returns false. It means that __() { :; } +# function is defined if this script runs directly. +# +# shellspec overrides the test command and returns true *once*. It means that +# __() function defined internally by shellspec is called. +# +# In other words. If not in test mode, __ is just a comment. If test mode, __ +# is a interception point. +# +# you should set ut_mode="true" when you want to run the script in shellspec file. +# +# shellcheck disable=SC2034 +# shellcheck disable=SC2153 +# shellcheck disable=SC1090 +ut_mode="false" +test || __() { + # when running in non-unit test mode, set the options "set -ex". + set -ex; +} + +load_redis_cluster_common_utils() { + # the common.sh and redis-cluster-common.sh scripts are defined in the redis-cluster-scripts-template configmap + # and are mounted to the same path which defined in the cmpd.spec.scripts + kblib_common_library_file="/scripts/common.sh" + redis_cluster_common_library_file="/scripts/redis-cluster-common.sh" + source "${kblib_common_library_file}" + source "${redis_cluster_common_library_file}" +} + +check_environment_exist() { + local required_vars=( + "CURRENT_SHARD_POD_NAME_LIST" + "CURRENT_SHARD_POD_FQDN_LIST" + "KB_SWITCHOVER_ROLE" + ) + + for var in "${required_vars[@]}"; do + if is_empty "${!var}"; then + echo "Error: Required environment variable $var is not set." >&2 + return 1 + fi + done + + if [ "$KB_SWITCHOVER_ROLE" != "primary" ]; then + echo "switchover not triggered for primary, nothing to do" + return 0 + fi +} + +init_redis_cluster_service_port() { + service_port=6379 + if [ -n "$SERVICE_PORT" ]; then + service_port=$SERVICE_PORT + fi +} + +get_current_shard_primary() { + local host=$1 + local port=$2 + local master_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + master_info=$(redis-cli -h $host -p $port info replication) + else + master_info=$(redis-cli -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + set_xtrace_when_ut_mode_false + + local master_host + local master_port + + master_host=$(echo "$master_info" | grep "master_host:" | cut -d':' -f2 | tr -d '[:space:]') + master_port=$(echo "$master_info" | grep "master_port:" | cut -d':' -f2 | tr -d '[:space:]') + + if is_empty "$master_host"|| is_empty "$master_port"; then + return 1 + fi + + echo "$master_host:$master_port" +} + +get_all_shards_master() { + local host=$1 + local port=$2 + local cluster_nodes_info + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + cluster_nodes_info=$(redis-cli -h $host -p $port cluster nodes) + else + cluster_nodes_info=$(redis-cli -h $host -p $port -a "$REDIS_DEFAULT_PASSWORD" cluster nodes) + fi + set_xtrace_when_ut_mode_false + + echo "$cluster_nodes_info" | grep "master" | while read -r line; do + node_addr=$(echo "$line" | cut -d' ' -f2 | cut -d'@' -f1) + echo "$node_addr" + done +} + +do_switchover() { + candidate_pod=$1 + candidate_pod_fqdn=$2 + + # check candidate pod is ready and has the role of secondary + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if ! equals "$role" "secondary"; then + echo "Error: Candidate pod $candidate_pod is not a secondary" >&2 + return 1 + fi + + # get current shard primary + current_shard_primary=$(get_current_shard_primary "$candidate_pod_fqdn" $service_port) + if is_empty "$current_shard_primary"; then + echo "Error: Could not determine current shard primary for $candidate_pod" >&2 + return 1 + fi + + # check cluster health from current shard primary + if ! check_slots_covered "$current_shard_primary" $service_port; then + echo "Error: Cluster health check failed" >&2 + return 1 + fi + + # check if candidate is known by all the shards primary + current_shard_primary_host=$(echo "$current_shard_primary" | cut -d':' -f1) + current_shard_primary_port=$(echo "$current_shard_primary" | cut -d':' -f2) + if is_empty "$current_shard_primary_host" || is_empty "$current_shard_primary_port"; then + echo "Error: Could not determine current shard primary host and port" >&2 + return 1 + fi + primaries=$(get_all_shards_master "$current_shard_primary_host" $current_shard_primary_port) + for primary in $primaries; do + primary_host=$(echo "$primary" | cut -d':' -f1) + primary_port=$(echo "$primary" | cut -d':' -f2) + if ! check_node_in_cluster_with_retry "$primary_host" $primary_port "$candidate_pod"; then + echo "Error: Candidate $candidate_pod is not known by shard $primary" >&2 + return 1 + fi + done + + # do switchover + echo "Starting switchover to $candidate_pod" + unset_xtrace_when_ut_mode_false + if is_empty "$REDIS_DEFAULT_PASSWORD"; then + result=$(redis-cli -h "$candidate_pod_fqdn" -p $service_port cluster failover) + else + result=$(redis-cli -h "$candidate_pod_fqdn" -p $service_port -a "$REDIS_DEFAULT_PASSWORD" cluster failover) + fi + set_xtrace_when_ut_mode_false + if [ "$result" != "OK" ]; then + echo "Error: Cluster Failover command failed with result: $result" >&2 + return 1 + fi + + # check switchover result + max_attempts=60 + attempt=0 + while [ $attempt -lt $max_attempts ]; do + role=$(check_redis_role "$candidate_pod_fqdn" $service_port) + if [ "$role" = "primary" ]; then + echo "Switchover successful: $candidate_pod is now primary" + return 0 + fi + sleep 2 + ((attempt++)) + done + + echo "Error: Switchover verification timeout" >&2 + return 1 +} + +switchover_without_candidate() { + candidate_pod="" + candidate_pod_fqdn="" + # get the one of secondary pod of current shard + # TODO: get the most suitable secondary pod which has the lowest latency + IFS=',' read -ra PODS <<< "$CURRENT_SHARD_POD_NAME_LIST" + for pod_name in "${PODS[@]}"; do + local pod_fqdn + pod_fqdn=$(get_target_pod_fqdn_from_pod_fqdn_vars "$CURRENT_SHARD_POD_FQDN_LIST" "$pod_name") || { + echo "Failed to get FQDN for pod: $pod_name" >&2 + return 1 + } + role=$(check_redis_role "$pod_fqdn" $service_port) + if [ "$role" = "secondary" ]; then + candidate_pod=$pod_name + candidate_pod_fqdn=$pod_fqdn + break + fi + done + + if is_empty "$candidate_pod"; then + echo "Error: No eligible secondary found in pod list: $CURRENT_SHARD_POD_NAME_LIST" >&2 + return 1 + fi + + # do switchover + do_switchover "$candidate_pod" "$candidate_pod_fqdn" || return 1 +} + +switchover_with_candidate() { + # check KB_SWITCHOVER_CANDIDATE_FQDN and KB_SWITCHOVER_CANDIDATE_NAME are not empty + if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN" || is_empty "$KB_SWITCHOVER_CANDIDATE_NAME"; then + echo "KB_SWITCHOVER_CANDIDATE_NAME or KB_SWITCHOVER_CANDIDATE_FQDN is empty" >&2 + return 1 + fi + + # do switchover + do_switchover "$KB_SWITCHOVER_CANDIDATE_NAME" "$KB_SWITCHOVER_CANDIDATE_FQDN" || return 1 +} + +# This is magic for shellspec ut framework. +# Sometime, functions are defined in a single shell script. +# You will want to test it. but you do not want to run the script. +# When included from shellspec, __SOURCED__ variable defined and script +# end here. The script path is assigned to the __SOURCED__ variable. +${__SOURCED__:+false} : || return 0 + +# main +load_redis_cluster_common_utils +check_environment_exist || exit 1 +init_redis_cluster_service_port +if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN"; then + switchover_without_candidate || exit 1 +else + switchover_with_candidate || exit 1 +fi \ No newline at end of file diff --git a/addons/redis/scripts-ut-spec/redis_switchover_spec.sh b/addons/redis/scripts-ut-spec/redis_switchover_spec.sh index 81b1bac87..40ec87498 100644 --- a/addons/redis/scripts-ut-spec/redis_switchover_spec.sh +++ b/addons/redis/scripts-ut-spec/redis_switchover_spec.sh @@ -1,7 +1,6 @@ # shellcheck shell=bash # shellcheck disable=SC2034 -# validate_shell_type_and_version defined in shellspec/spec_helper.sh used to validate the expected shell type and version this script needs to run. if ! validate_shell_type_and_version "bash" 4 &>/dev/null; then echo "redis_switchover_spec.sh skip all cases because dependency bash version 4 or higher is not installed." exit 0 @@ -9,18 +8,14 @@ fi source ./utils.sh -# The unit test needs to rely on the common library functions defined in kblib. -# Therefore, we first dynamically generate the required common library files from the kblib library chart. common_library_file="./common.sh" generate_common_library $common_library_file Describe "Redis Switchover Script Tests" - Include ../scripts/redis-switchover.sh Include $common_library_file init() { - # set ut_mode to true to hack control flow in the script ut_mode="true" } BeforeAll "init" @@ -30,55 +25,343 @@ Describe "Redis Switchover Script Tests" } AfterAll 'cleanup' - Describe "switchover" + Describe "Environment Check Tests" + Context "check_environment_exist()" + It "should fail when no required variables are set" + unset SENTINEL_POD_FQDN_LIST REDIS_POD_FQDN_LIST REDIS_COMPONENT_NAME KB_SWITCHOVER_ROLE + When call check_environment_exist + The status should be failure + The stderr should include "Required environment variable SENTINEL_POD_FQDN_LIST is not set" + The stdout should equal "" + End + + It "should succeed with all required variables" + export SENTINEL_POD_FQDN_LIST="sentinel1,sentinel2" + export REDIS_POD_FQDN_LIST="redis1,redis2" + export REDIS_COMPONENT_NAME="redis" + export KB_SWITCHOVER_ROLE="primary" + When call check_environment_exist + The status should be success + The stdout should equal "" + The stderr should equal "" + End + + It "should exit early when role is not primary" + export SENTINEL_POD_FQDN_LIST="sentinel1,sentinel2" + export REDIS_POD_FQDN_LIST="redis1,redis2" + export REDIS_COMPONENT_NAME="redis" + export KB_SWITCHOVER_ROLE="secondary" + When call check_environment_exist + The status should be success + The stdout should include "switchover not triggered for primary, nothing to do" + The stderr should equal "" + End + End + End + + Describe "Redis Operation Tests" + Context "check_redis_role()" + setup() { + export REDIS_DEFAULT_PASSWORD="password123" + } + Before 'setup' + + cleanup() { + unset REDIS_DEFAULT_PASSWORD + } + After 'cleanup' + + It "should identify primary role" + redis-cli() { + echo "# Replication +role:master +connected_slaves:2" + } + When call check_redis_role "localhost" "6379" + The status should be success + The output should equal "primary" + The stderr should equal "" + End + + It "should identify secondary role" + redis-cli() { + echo "# Replication +role:slave +master_host:redis-master" + } + When call check_redis_role "localhost" "6379" + The status should be success + The output should equal "secondary" + The stderr should equal "" + End + + It "should handle redis-cli failure" + redis-cli() { + return 1 + } + When call check_redis_role "localhost" "6379" + The status should be failure + The stderr should include "Failed to get role info from localhost" + The output should equal "" + End + + It "should handle empty response" + redis-cli() { + echo "" + } + When call check_redis_role "localhost" "6379" + The status should be failure + The output should equal "unknown" + End + End + + Context "check_redis_kernel_status()" + setup() { + export REDIS_POD_FQDN_LIST="redis1,redis2,redis3" + export SERVICE_PORT="6379" + } + Before 'setup' + + cleanup() { + unset REDIS_POD_FQDN_LIST + unset SERVICE_PORT + } + After 'cleanup' + + It "should detect single primary correctly" + check_redis_role() { + case "$1" in + "redis1") echo "primary" ;; + *) echo "secondary" ;; + esac + } + When call check_redis_kernel_status + The status should be success + The output should equal "redis1" + The stderr should equal "" + End + + It "should fail when multiple primaries detected" + check_redis_role() { + echo "primary" + } + When call check_redis_kernel_status + The status should be failure + The stderr should include "Multiple primaries detected" + The stdout should equal "" + End + + It "should fail when no primary found" + check_redis_role() { + echo "secondary" + } + When call check_redis_kernel_status + The status should be failure + The stderr should include "No primary found" + The stdout should equal "" + End + End + + Context "execute_sub_command()" + It "should succeed with OK response" + redis-cli() { + echo "OK" + } + When call execute_sub_command "localhost" "6379" "password" "PING" + The status should be success + The stdout should include "Command executed successfully" + The stderr should equal "" + End + + It "should fail with non-OK response" + redis-cli() { + echo "ERROR" + } + When call execute_sub_command "localhost" "6379" "password" "PING" + The status should be failure + The stderr should include "Command failed" + The stdout should include "ERROR" + End + + It "should fail when redis-cli fails" + redis-cli() { + return 1 + } + When call execute_sub_command "localhost" "6379" "password" "PING" + The status should be failure + The stderr should include "Command failed" + The stdout should include "execute_sub_command output:" + End + End + + Context "execute_sentinel_failover()" + setup() { + export SENTINEL_POD_FQDN_LIST="sentinel1,sentinel2" + export SENTINEL_SERVICE_PORT="26379" + export SENTINEL_PASSWORD="sentinel_pass" + } + Before 'setup' + + cleanup() { + unset SENTINEL_POD_FQDN_LIST + unset SENTINEL_SERVICE_PORT + unset SENTINEL_PASSWORD + } + After 'cleanup' + + It "should succeed with first sentinel" + execute_sub_command() { + echo "OK" + return 0 + } + When call execute_sentinel_failover "redis" + The status should be success + The stdout should include "Sentinel failover started with sentinel1" + The stderr should equal "" + End + + It "should fail when all sentinels fail" + execute_sub_command() { + return 1 + } + call_func_with_retry() { + return 1 + } + When call execute_sentinel_failover "redis" + The status should be failure + The stderr should include "All Sentinel failover attempts failed" + End + End + End + + Describe "Switchover Tests" setup() { - export REDIS_DEFAULT_PASSWORD="redis_default_password" - export SENTINEL_POD_FQDN_LIST="redis-redis-sentinel-0.redis-redis-sentinel-headless.test.svc,\ - redis-redis-sentinel-1.redis-redis-sentinel-headless.test.svc,\ - redis-redis-sentinel-2.redis-redis-sentinel-headless.test.svc" - export KB_SWITCHOVER_CANDIDATE_FQDN="redis-redis-1.redis-redis-headless.default.svc.local" - export REDIS_COMPONENT_NAME="redis-redis" + export REDIS_DEFAULT_PASSWORD="redis_pass" + export SENTINEL_POD_FQDN_LIST="sentinel1,sentinel2" + export REDIS_POD_FQDN_LIST="redis1,redis2,redis3" + export KB_SWITCHOVER_CANDIDATE_FQDN="redis2" + export REDIS_COMPONENT_NAME="redis" + export SERVICE_PORT="6379" + export KB_SWITCHOVER_ROLE="primary" + export SENTINEL_SERVICE_PORT="26379" + export SENTINEL_PASSWORD="sentinel_pass" + MOCK_RESPONSES=() + RESPONSE_INDEX=0 } Before 'setup' - un_setup() { + cleanup() { unset REDIS_DEFAULT_PASSWORD unset SENTINEL_POD_FQDN_LIST + unset REDIS_POD_FQDN_LIST unset KB_SWITCHOVER_CANDIDATE_FQDN unset REDIS_COMPONENT_NAME + unset SERVICE_PORT + unset KB_SWITCHOVER_ROLE + unset SENTINEL_SERVICE_PORT + unset SENTINEL_PASSWORD + unset MOCK_RESPONSES + unset RESPONSE_INDEX } - After 'un_setup' + After 'cleanup' Context "switchover_with_candidate()" - It "redis set recover replica priority should equal pre state and sentinel failover should start" - check_connectivity() { - echo "$KB_SWITCHOVER_CANDIDATE_FQDN is reachable on port 6379." - return 0 - } - redis_config_get() { - echo -e "replica-priority\n100" - return 0 - } - execute_sub_command() { - echo "Command executed successfully." - return 0 + It "should execute successful switchover" + check_redis_role() { + if [ "$1" = "redis2" ]; then + echo "secondary" + else + echo "primary" + fi } + check_redis_kernel_status() { return 0; } + set_redis_priorities() { return 0; } + execute_sentinel_failover() { return 0; } + check_switchover_result() { return 0; } + recover_redis_priorities() { return 0; } + When call switchover_with_candidate The status should be success - The stdout should include "Sentinel failover execute in redis-redis-sentinel-0.redis-redis-sentinel-headless.test.svc, Switchover is processing" - The stdout should include "Command executed successfully" + The stdout should include "All Redis config set replica-priority recovered" + The stderr should equal "" + End + + It "should fail when candidate is primary" + check_redis_role() { + echo "primary" + } + When call switchover_with_candidate + The status should be failure + The stderr should include "not in secondary role" + The stdout should equal "" End End + Context "switchover_without_candidate()" - It "sentinel failover should start" - execute_sub_command() { - echo "Command executed successfully." - return 0 + It "should execute successful switchover" + MOCK_RESPONSES=("redis1" "redis2") + check_redis_kernel_status() { + local response=${MOCK_RESPONSES[$RESPONSE_INDEX]} + RESPONSE_INDEX=$((RESPONSE_INDEX + 1)) + echo "$response" } + execute_sentinel_failover() { return 0;} + check_switchover_result() { return 0; } When call switchover_without_candidate The status should be success - The stdout should include "Sentinel failover start with redis-redis-sentinel-0.redis-redis-sentinel-headless.test.svc, Switchover is processing" - The stdout should include "Command executed successfully" + End + + It "should fail when initial status check fails" + check_redis_kernel_status() { + return 1 + } + When call switchover_without_candidate + The status should be failure + The stdout should equal "" + End + + It "should fail when sentinel failover fails" + check_redis_kernel_status() { + echo "redis1" + } + execute_sentinel_failover() { + return 1 + } + When call switchover_without_candidate + The status should be failure + The stdout should equal "" + End + End + + Context "check_switchover_result()" + It "should succeed when expected master is achieved" + check_redis_kernel_status() { + echo "redis2" + } + When call check_switchover_result "redis2" "" + The status should be success + The stdout should include "Switchover successful: redis2 is now master" + The stderr should equal "" + End + + It "should succeed when switched from initial master" + check_redis_kernel_status() { + echo "redis2" + } + When call check_switchover_result "" "redis1" + The status should be success + The stdout should include "Switchover successful: new master is redis2" + The stderr should equal "" + End + + It "should fail when neither expected nor initial master specified" + check_redis_kernel_status() { + echo "redis2" + } + When call check_switchover_result "" "" + The status should be failure + The stderr should include "Neither expected_master nor initial_master specified" + The stdout should equal "" End End End diff --git a/addons/redis/scripts/redis-switchover.sh b/addons/redis/scripts/redis-switchover.sh index eb5cbc65a..e93d0c3eb 100644 --- a/addons/redis/scripts/redis-switchover.sh +++ b/addons/redis/scripts/redis-switchover.sh @@ -13,12 +13,15 @@ # you should set ut_mode="true" when you want to run the script in shellspec file. # # shellcheck disable=SC2034 +# shellcheck disable=SC2153 ut_mode="false" test || __() { # when running in non-unit test mode, set the options "set -ex". set -ex; } +declare -A ORIGINAL_PRIORITIES + load_common_library() { # the common.sh scripts is mounted to the same path which is defined in the cmpd.spec.scripts common_library_file="/scripts/common.sh" @@ -27,22 +30,116 @@ load_common_library() { } check_environment_exist() { - # shellcheck disable=SC2153 - if is_empty "$SENTINEL_POD_FQDN_LIST"; then - echo "Error: Required environment variable SENTINEL_POD_FQDN_LIST: $SENTINEL_POD_FQDN_LIST is not set." - exit 1 + local required_vars=( + "SENTINEL_POD_FQDN_LIST" + "REDIS_POD_FQDN_LIST" + "REDIS_COMPONENT_NAME" + "KB_SWITCHOVER_ROLE" + ) + + for var in "${required_vars[@]}"; do + if is_empty "${!var}"; then + echo "Error: Required environment variable $var is not set." >&2 + return 1 + fi + done + + if [ "$KB_SWITCHOVER_ROLE" != "primary" ]; then + echo "switchover not triggered for primary, nothing to do" + return 0 + fi +} + +check_redis_role() { + local host=$1 + local port=$2 + unset_xtrace_when_ut_mode_false + local role_info + if [[ -z "$REDIS_DEFAULT_PASSWORD" ]]; then + role_info=$(redis-cli -h "$host" -p "$port" info replication) + else + role_info=$(redis-cli -h "$host" -p "$port" -a "$REDIS_DEFAULT_PASSWORD" info replication) + fi + status=$? + set_xtrace_when_ut_mode_false + + if [[ $status -ne 0 ]]; then + echo "Failed to get role info from $host" >&2 + return 1 fi - # shellcheck disable=SC2153 - if is_empty "$REDIS_POD_FQDN_LIST"; then - echo "Error: Required environment variable REDIS_POD_FQDN_LIST: $REDIS_POD_FQDN_LIST is not set." - exit 1 + if echo "$role_info" | grep -q "^role:master"; then + echo "primary" + elif echo "$role_info" | grep -q "^role:slave"; then + echo "secondary" + else + echo "unknown" + return 1 fi +} + +check_redis_kernel_status() { + local role + local current_master="" + local -a redis_pod_fqdn_list + IFS=',' read -ra redis_pod_fqdn_list <<< "${REDIS_POD_FQDN_LIST}" + for redis_pod_fqdn in "${redis_pod_fqdn_list[@]}"; do + role=$(check_redis_role "$redis_pod_fqdn" "$SERVICE_PORT") || continue + if [[ "$role" == "primary" ]]; then + if [[ -n "$current_master" ]]; then + echo "Error: Multiple primaries detected" >&2 + return 1 + fi + current_master="$redis_pod_fqdn" + fi + done + + if [[ -z "$current_master" ]]; then + echo "Error: No primary found" >&2 + return 1 + fi + + echo "$current_master" + return 0 +} + +check_switchover_result() { + local expected_master="$1" + local initial_master="$2" + local max_wait=300 + local wait_interval=5 + local elapsed=0 + + while [[ $elapsed -lt $max_wait ]]; do + local current_master + if current_master=$(check_redis_kernel_status); then + # if expected_master is specified, check if it is achieved + if ! is_empty "$expected_master"; then + if [[ "$current_master" = "$expected_master"* ]]; then + echo "Switchover successful: $expected_master is now master" + return 0 + fi + # if initial_master is specified, check if it is switched to a different node + elif ! is_empty "$initial_master"; then + if [[ "$current_master" != "$initial_master" ]]; then + echo "Switchover successful: new master is $current_master" + return 0 + fi + else + echo "Error: Neither expected_master nor initial_master specified" >&2 + return 1 + fi + fi + sleep_when_ut_mode_false $wait_interval + elapsed=$((elapsed + wait_interval)) + done - if is_empty "$REDIS_COMPONENT_NAME"; then - echo "Error: Required environment variable REDIS_COMPONENT_NAME: $REDIS_COMPONENT_NAME is not set." - exit 1 + if ! is_empty "$expected_master"; then + echo "Switchover verification failed: expected master $expected_master not achieved" >&2 + else + echo "Switchover verification failed: could not confirm new master" >&2 fi + return 1 } check_connectivity() { @@ -50,16 +147,23 @@ check_connectivity() { local port=$2 local password=$3 echo "Checking connectivity to $host on port $port using redis-cli..." - if redis-cli -h "$host" -p "$port" -a "$password" PING | grep -q "PONG"; then + local result + unset_xtrace_when_ut_mode_false + if ! is_empty "$password"; then + result=$(redis-cli -h "$host" -p "$port" -a "$password" PING) + else + result=$(redis-cli -h "$host" -p "$port" PING) + fi + set_xtrace_when_ut_mode_false + if [[ "$result" == "PONG" ]]; then echo "$host is reachable on port $port." return 0 else - echo "$host is not reachable on port $port." + echo "$host is not reachable on port $port." >&2 return 1 fi } -# Function to execute and log redis-cli command execute_sub_command() { local host=$1 local port=$2 @@ -67,54 +171,47 @@ execute_sub_command() { local command=$4 local output - local status - - # Check if password is provided, build the appropriate redis-cli command + unset_xtrace_when_ut_mode_false if ! is_empty "$password"; then output=$(redis-cli -h "$host" -p "$port" -a "$password" $command) - status=$? else output=$(redis-cli -h "$host" -p "$port" $command) - status=$? fi - echo "$output" - # Check if the command failed or the output is not "OK" - if [ $status -ne 0 ] || ! equals "$output" "OK"; then - echo "Command failed with status $status or output not OK." + local status=$? + set_xtrace_when_ut_mode_false + + echo "execute_sub_command output: $output" + if [[ $status -ne 0 ]] || [[ "$output" != "OK" ]]; then + echo "Command failed with status $status or output not OK." >&2 return 1 - else - echo "Command executed successfully." - return 0 fi + echo "Command executed successfully." + return 0 } -redis_config_get(){ +redis_config_get() { local host=$1 local port=$2 local password=$3 local command=$4 local output - local status - - # Check if password is provided, build the appropriate redis-cli command + unset_xtrace_when_ut_mode_false if ! is_empty "$password"; then output=$(redis-cli -h "$host" -p "$port" -a "$password" $command) - status=$? else output=$(redis-cli -h "$host" -p "$port" $command) - status=$? fi + local status=$? + set_xtrace_when_ut_mode_false - # Check if the command failed - if [ $status -ne 0 ]; then - echo "Command failed with status $status." + if [[ $status -ne 0 ]]; then + echo "Command failed with status $status." >&2 return 1 fi - # Check if the output is empty - if is_empty "$output"; then - echo "Command returned no output." + if [[ -z "$output" ]]; then + echo "Command returned no output." >&2 return 1 fi @@ -122,81 +219,126 @@ redis_config_get(){ return 0 } -switchover_with_candidate() { - redis_get_cmd="CONFIG GET replica-priority" - redis_set_switchover_cmd="CONFIG SET replica-priority 1" - redis_set_lowest_priority_cmd="CONFIG SET replica-priority 100" - IFS=',' read -ra redis_pod_fqdn_list <<< "${REDIS_POD_FQDN_LIST}" - unset_xtrace_when_ut_mode_false - - declare -A original_priorities - for redis_pod_fqdn in "${redis_pod_fqdn_list[@]}"; do - call_func_with_retry 3 5 check_connectivity "$redis_pod_fqdn" $SERVICE_PORT "$REDIS_DEFAULT_PASSWORD" || exit 1 - original_priority=$(redis_config_get "$redis_pod_fqdn" $SERVICE_PORT "$REDIS_DEFAULT_PASSWORD" "$redis_get_cmd" | sed -n '2p') - original_priorities["$redis_pod_fqdn"]=$original_priority - - if [ "$redis_pod_fqdn" = "$KB_SWITCHOVER_CANDIDATE_FQDN" ]; then - call_func_with_retry 3 5 execute_sub_command "$redis_pod_fqdn" $SERVICE_PORT "$REDIS_DEFAULT_PASSWORD" "$redis_set_switchover_cmd" || exit 1 - else - call_func_with_retry 3 5 execute_sub_command "$redis_pod_fqdn" $SERVICE_PORT "$REDIS_DEFAULT_PASSWORD" "$redis_set_lowest_priority_cmd" || exit 1 - fi - done +execute_sentinel_failover() { + local master_name=$1 + local success=false - # TODO: check the role in kernel before switchover - IFS=',' read -ra sentinel_pod_fqdn_list <<< "${SENTINEL_POD_FQDN_LIST}" - if is_empty "$CUSTOM_SENTINEL_MASTER_NAME"; then + if [[ -z "$master_name" ]]; then master_name=$REDIS_COMPONENT_NAME - else - master_name="$CUSTOM_SENTINEL_MASTER_NAME" fi - local success=false + local -a sentinel_pod_fqdn_list + IFS=',' read -ra sentinel_pod_fqdn_list <<< "${SENTINEL_POD_FQDN_LIST}" + unset_xtrace_when_ut_mode_false for sentinel_pod_fqdn in "${sentinel_pod_fqdn_list[@]}"; do - if call_func_with_retry 3 5 execute_sub_command "$sentinel_pod_fqdn" $SENTINEL_SERVICE_PORT "$SENTINEL_PASSWORD" "SENTINEL FAILOVER $master_name"; then - echo "Sentinel failover execute in $sentinel_pod_fqdn, Switchover is processing" + if call_func_with_retry 3 5 execute_sub_command "$sentinel_pod_fqdn" "$SENTINEL_SERVICE_PORT" "$SENTINEL_PASSWORD" "SENTINEL FAILOVER $master_name"; then + echo "Sentinel failover started with $sentinel_pod_fqdn" success=true break fi done + set_xtrace_when_ut_mode_false - if [ "$success" = false ]; then + if [[ "$success" == false ]]; then echo "All Sentinel failover attempts failed." >&2 - exit 1 + return 1 fi - echo "Sentinel failover command execute success." + return 0 +} + +# set target candidate highest priority to make sure it will be promoted to master +set_redis_priorities() { + local candidate_fqdn="$1" + + local -a redis_pod_fqdn_list + IFS=',' read -ra redis_pod_fqdn_list <<< "${REDIS_POD_FQDN_LIST}" + for redis_pod_fqdn in "${redis_pod_fqdn_list[@]}"; do + call_func_with_retry 3 5 check_connectivity "$redis_pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" || return 1 + + # Get original priority + local redis_get_cmd="CONFIG GET replica-priority" + local original_priority + original_priority=$(redis_config_get "$redis_pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "$redis_get_cmd" | sed -n '2p') + status=$? + if [ $status -ne 0 ]; then + echo "Error: Failed to get replica-priority for $redis_pod_fqdn" >&2 + return 1 + fi + + # Save original priority to global variable + ORIGINAL_PRIORITIES[$redis_pod_fqdn]=$original_priority + + local redis_set_cmd + if [[ "$redis_pod_fqdn" = "$candidate_fqdn"* ]]; then + redis_set_cmd="CONFIG SET replica-priority 1" + else + redis_set_cmd="CONFIG SET replica-priority 100" + fi + + call_func_with_retry 3 5 execute_sub_command "$redis_pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "$redis_set_cmd" || return 1 + done + return 0 +} - # TODO: check switchover result +# recover all redis replica-priority +recover_redis_priorities() { + local -a redis_pod_fqdn_list + IFS=',' read -ra redis_pod_fqdn_list <<< "${REDIS_POD_FQDN_LIST}" + + echo "Recovering all Redis replica-priority..." for redis_pod_fqdn in "${redis_pod_fqdn_list[@]}"; do - redis_set_recover_cmd="CONFIG SET replica-priority ${original_priorities[$redis_pod_fqdn]}" - call_func_with_retry 3 5 execute_sub_command "$redis_pod_fqdn" $SERVICE_PORT "$REDIS_DEFAULT_PASSWORD" "$redis_set_recover_cmd" || exit 1 + local redis_set_recover_cmd="CONFIG SET replica-priority ${ORIGINAL_PRIORITIES[$redis_pod_fqdn]}" + call_func_with_retry 3 5 execute_sub_command "$redis_pod_fqdn" "$SERVICE_PORT" "$REDIS_DEFAULT_PASSWORD" "$redis_set_recover_cmd" || return 1 done echo "All Redis config set replica-priority recovered." - set_xtrace_when_ut_mode_false + return 0 } -switchover_without_candidate() { - # TODO: check the role in kernel before switchover - IFS=',' read -ra sentinel_pod_fqdn_list <<< "${SENTINEL_POD_FQDN_LIST}" - if is_empty "$CUSTOM_SENTINEL_MASTER_NAME"; then - master_name=$REDIS_COMPONENT_NAME - else - master_name="$CUSTOM_SENTINEL_MASTER_NAME" +switchover_with_candidate() { + # check the role of candidate before switchover + local candidate_role + candidate_role=$(check_redis_role "$KB_SWITCHOVER_CANDIDATE_FQDN" "$SERVICE_PORT") + if [[ "$candidate_role" != "secondary" ]]; then + echo "Error: Candidate node $KB_SWITCHOVER_CANDIDATE_FQDN is not in secondary role" >&2 + return 1 fi - local success=false + + # check redis kernel role before switchover + local initial_master + initial_master=$(check_redis_kernel_status) || return 1 + + local redis_get_cmd="CONFIG GET replica-priority" + local redis_set_switchover_cmd="CONFIG SET replica-priority 1" + local redis_set_lowest_priority_cmd="CONFIG SET replica-priority 100" + + # set target candidate highest priority to make sure it will be promoted to master unset_xtrace_when_ut_mode_false - for sentinel_pod_fqdn in "${sentinel_pod_fqdn_list[@]}"; do - if call_func_with_retry 3 5 execute_sub_command "$sentinel_pod_fqdn" $SENTINEL_SERVICE_PORT "$SENTINEL_PASSWORD" "SENTINEL FAILOVER $master_name"; then - echo "Sentinel failover start with $sentinel_pod_fqdn, Switchover is processing" - success=true - break - fi - done + set_redis_priorities "$KB_SWITCHOVER_CANDIDATE_FQDN" || return 1 + + # do switchover + execute_sentinel_failover "$CUSTOM_SENTINEL_MASTER_NAME" || return 1 + + # check switchover result + check_switchover_result "$KB_SWITCHOVER_CANDIDATE_FQDN" "" || return 1 + + # recover all redis replica-priority + echo "Recovering all Redis replica-priority..." + recover_redis_priorities || return 1 + set_xtrace_when_ut_mode_false - if [ "$success" = false ]; then - echo "All Sentinel failover attempts failed." - exit 1 - fi - # TODO: check switchover result + echo "All Redis config set replica-priority recovered." +} + +switchover_without_candidate() { + # check redis kernel role before switchover + local initial_master + initial_master=$(check_redis_kernel_status) || return 1 + + # do switchover + execute_sentinel_failover "$CUSTOM_SENTINEL_MASTER_NAME" || return 1 + + # check switchover result using initial_master + check_switchover_result "" "$initial_master" || return 1 } # This is magic for shellspec ut framework. @@ -208,9 +350,9 @@ ${__SOURCED__:+false} : || return 0 # main load_common_library -check_environment_exist +check_environment_exist || exit 1 if is_empty "$KB_SWITCHOVER_CANDIDATE_FQDN"; then - switchover_without_candidate + switchover_without_candidate || exit 1 else - switchover_with_candidate + switchover_with_candidate || exit 1 fi diff --git a/addons/redis/templates/cmpd-redis-cluster-7.yaml b/addons/redis/templates/cmpd-redis-cluster-7.yaml index 06173eecd..5001f70d4 100644 --- a/addons/redis/templates/cmpd-redis-cluster-7.yaml +++ b/addons/redis/templates/cmpd-redis-cluster-7.yaml @@ -376,6 +376,14 @@ spec: - /scripts/redis-cluster-replica-member-leave.sh retryPolicy: maxRetries: 10 + switchover: + exec: + image: {{ include "redis7.image" . }} + container: redis-cluster + command: + - /bin/bash + - -c + - /scripts/redis-cluster-switchover.sh runtime: initContainers: - name: init-dbctl