From a597c393285552cf1a503a91975f1231f067d685 Mon Sep 17 00:00:00 2001
From: yangw
Date: Tue, 21 Nov 2023 04:21:39 +0800
Subject: [PATCH] [Fix] : cluster scaling (#704)

* add scale e2e yamls

Signed-off-by: drivebyer

* temp

Signed-off-by: drivebyer

* fix the cluster scale

Signed-off-by: drivebyer

* adjust e2e

Signed-off-by: drivebyer

---------

Signed-off-by: drivebyer
Signed-off-by: drivebyer
---
 .github/workflows/e2e.yaml                    |  1 +
 controllers/rediscluster_controller.go        | 84 ++++++++++---------
 k8sutils/cluster-scaling.go                   | 22 +++--
 k8sutils/redis.go                             |  6 +-
 tests/_config/kuttl-test.yaml                 |  3 +-
 .../scaling/redis-cluster/00-install.yaml     |  7 ++
 .../scaling/redis-cluster/01-scale-up.yaml    |  7 ++
 .../scaling/redis-cluster/02-scale-down.yaml  |  7 ++
 .../redis-cluster/cluster-scale-up.yaml       | 47 +++++++++++
 .../scaling/redis-cluster/cluster.yaml        | 47 +++++++++++
 .../redis-cluster/ready-cluster-scale-up.yaml |  8 ++
 .../scaling/redis-cluster/ready-cluster.yaml  |  8 ++
 .../redis-cluster/ready-sts-scale-up.yaml     | 23 +++++
 .../scaling/redis-cluster/ready-sts.yaml      | 25 ++++++
 14 files changed, 245 insertions(+), 50 deletions(-)
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/00-install.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/01-scale-up.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/02-scale-down.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/cluster-scale-up.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/cluster.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster-scale-up.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/ready-sts-scale-up.yaml
 create mode 100644 tests/e2e/v1beta2/scaling/redis-cluster/ready-sts.yaml

diff --git a/.github/workflows/e2e.yaml b/.github/workflows/e2e.yaml
index 9c1aaa69c..1e82126eb 100644
--- a/.github/workflows/e2e.yaml
+++ b/.github/workflows/e2e.yaml
@@ -18,6 +18,7 @@ jobs:
           - ./tests/e2e/v1beta2/setup
           - ./tests/e2e/v1beta2/teardown
           - ./tests/e2e/v1beta2/ignore-annots
+          - ./tests/e2e/v1beta2/scaling
 
     steps:
       - name: Checkout code
diff --git a/controllers/rediscluster_controller.go b/controllers/rediscluster_controller.go
index 8a43c5840..9b99bbaba 100644
--- a/controllers/rediscluster_controller.go
+++ b/controllers/rediscluster_controller.go
@@ -75,27 +75,36 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
 
     // Check if the cluster is downscaled
     if leaderReplicas < instance.Status.ReadyLeaderReplicas {
-
-        // Imp if the last index of leader sts is not leader make it then
-        // check whether the redis is leader or not ?
-        // if not true then make it leader pod
-
-        if !(k8sutils.VerifyLeaderPod(ctx, r.K8sClient, r.Log, instance)) {
-            // lastLeaderPod is slaving right now Make it the master Pod
-            // We have to bring a manual failover here to make it a leaderPod
-            // clusterFailover should also include the clusterReplicate since we have to map the followers to new leader
-            k8sutils.ClusterFailover(ctx, r.K8sClient, r.Log, instance)
+        reqLogger.Info("Redis cluster is downscaling...", "Ready.ReadyLeaderReplicas", instance.Status.ReadyLeaderReplicas, "Expected.ReadyLeaderReplicas", leaderReplicas)
+
+        // loop count times to remove the latest leader/follower pod
+        count := instance.Status.ReadyLeaderReplicas - leaderReplicas
+        for i := int32(0); i < count; i++ {
+            reqLogger.Info("Redis cluster is downscaling", "The times of loop", i)
+
+            // Imp if the last index of leader sts is not leader make it then
+            // check whether the redis is leader or not ?
+            // if not true then make it leader pod
+            if !(k8sutils.VerifyLeaderPod(ctx, r.K8sClient, r.Log, instance)) {
+                // lastLeaderPod is slaving right now Make it the master Pod
+                // We have to bring a manual failover here to make it a leaderPod
+                // clusterFailover should also include the clusterReplicate since we have to map the followers to new leader
+                k8sutils.ClusterFailover(ctx, r.K8sClient, r.Log, instance)
+            }
+            // Step 1 Remove the Follower Node
+            k8sutils.RemoveRedisFollowerNodesFromCluster(ctx, r.K8sClient, r.Log, instance)
+            // Step 2 Reshard the Cluster
+            k8sutils.ReshardRedisCluster(r.K8sClient, r.Log, instance, true)
         }
-
-        // Step 1 Rehard the Cluster
-        k8sutils.ReshardRedisCluster(r.K8sClient, r.Log, instance)
-        // Step 2 Remove the Follower Node
-        k8sutils.RemoveRedisFollowerNodesFromCluster(ctx, r.K8sClient, r.Log, instance)
-        // Step 3 Remove the Leader Node
-        k8sutils.RemoveRedisNodeFromCluster(ctx, r.K8sClient, r.Log, instance)
-        // Step 4 Rebalance the cluster
+        reqLogger.Info("Redis cluster is downscaled... Rebalancing the cluster")
+        // Step 3 Rebalance the cluster
         k8sutils.RebalanceRedisCluster(r.K8sClient, r.Log, instance)
-        return ctrl.Result{RequeueAfter: time.Second * 100}, nil
+        reqLogger.Info("Redis cluster is downscaled... Rebalancing the cluster is done")
+        err = k8sutils.UpdateRedisClusterStatus(instance, status.RedisClusterReady, status.ReadyClusterReason, leaderReplicas, leaderReplicas)
+        if err != nil {
+            return ctrl.Result{RequeueAfter: time.Second * 10}, err
+        }
+        return ctrl.Result{RequeueAfter: time.Second * 60}, nil
     }
 
     // Mark the cluster status as initializing if there are no leader or follower nodes
@@ -130,7 +139,7 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
         return ctrl.Result{}, err
     }
 
-    if int32(redisLeaderInfo.Status.ReadyReplicas) == leaderReplicas {
+    if redisLeaderInfo.Status.ReadyReplicas == leaderReplicas {
 
         // Mark the cluster status as initializing if there are no follower nodes
         if instance.Status.ReadyLeaderReplicas == 0 && instance.Status.ReadyFollowerReplicas == 0 {
@@ -166,26 +175,24 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
 
     if leaderReplicas == 0 {
         reqLogger.Info("Redis leaders Cannot be 0", "Ready.Replicas", strconv.Itoa(int(redisLeaderInfo.Status.ReadyReplicas)), "Expected.Replicas", leaderReplicas)
-        return ctrl.Result{RequeueAfter: time.Second * 120}, nil
+        return ctrl.Result{RequeueAfter: time.Second * 60}, nil
     }
 
     if !(redisLeaderInfo.Status.ReadyReplicas == leaderReplicas && redisFollowerInfo.Status.ReadyReplicas == followerReplicas) {
         reqLogger.Info("Redis leader and follower nodes are not ready yet", "Ready.Replicas", strconv.Itoa(int(redisLeaderInfo.Status.ReadyReplicas)), "Expected.Replicas", leaderReplicas)
-        return ctrl.Result{RequeueAfter: time.Second * 120}, nil
+        return ctrl.Result{RequeueAfter: time.Second * 60}, nil
     }
 
     // Mark the cluster status as bootstrapping if all the leader and follower nodes are ready
-    if int32(redisLeaderInfo.Status.ReadyReplicas) == leaderReplicas && int32(redisFollowerInfo.Status.ReadyReplicas) == followerReplicas {
-        if instance.Status.ReadyLeaderReplicas == leaderReplicas && instance.Status.ReadyFollowerReplicas == 0 {
-            err = k8sutils.UpdateRedisClusterStatus(instance, status.RedisClusterBootstrap, status.BootstrapClusterReason, leaderReplicas, followerReplicas)
-            if err != nil {
-                return ctrl.Result{RequeueAfter: time.Second * 10}, err
-            }
+    if !(instance.Status.ReadyLeaderReplicas == leaderReplicas && instance.Status.ReadyFollowerReplicas == followerReplicas) {
+        err = k8sutils.UpdateRedisClusterStatus(instance, status.RedisClusterBootstrap, status.BootstrapClusterReason, leaderReplicas, followerReplicas)
+        if err != nil {
+            return ctrl.Result{RequeueAfter: time.Second * 10}, err
         }
     }
 
     reqLogger.Info("Creating redis cluster by executing cluster creation commands", "Leaders.Ready", strconv.Itoa(int(redisLeaderInfo.Status.ReadyReplicas)), "Followers.Ready", strconv.Itoa(int(redisFollowerInfo.Status.ReadyReplicas)))
-    if k8sutils.CheckRedisNodeCount(ctx, r.K8sClient, r.Log, instance, "") != totalReplicas {
+    if nc := k8sutils.CheckRedisNodeCount(ctx, r.K8sClient, r.Log, instance, ""); nc != totalReplicas {
         leaderCount := k8sutils.CheckRedisNodeCount(ctx, r.K8sClient, r.Log, instance, "leader")
         if leaderCount != leaderReplicas {
             reqLogger.Info("Not all leader are part of the cluster...", "Leaders.Count", leaderCount, "Instance.Size", leaderReplicas)
@@ -208,16 +215,17 @@ func (r *RedisClusterReconciler) Reconcile(ctx context.Context, req ctrl.Request
                 reqLogger.Info("no follower/replicas configured, skipping replication configuration", "Leaders.Count", leaderCount, "Leader.Size", leaderReplicas, "Follower.Replicas", followerReplicas)
             }
         }
-    } else {
reqLogger.Info("Redis leader count is desired") - if int(totalReplicas) > 1 && k8sutils.CheckRedisClusterState(ctx, r.K8sClient, r.Log, instance) >= int(totalReplicas)-1 { - reqLogger.Info("Redis leader is not desired, executing failover operation") - err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, r.Log, instance) - if err != nil { - return ctrl.Result{RequeueAfter: time.Second * 10}, err - } + reqLogger.Info("Redis cluster count is not desired", "Current.Count", nc, "Desired.Count", totalReplicas) + return ctrl.Result{RequeueAfter: time.Second * 60}, nil + } + + reqLogger.Info("Redis cluster count is desired") + if int(totalReplicas) > 1 && k8sutils.CheckRedisClusterState(ctx, r.K8sClient, r.Log, instance) >= int(totalReplicas)-1 { + reqLogger.Info("Redis leader is not desired, executing failover operation") + err = k8sutils.ExecuteFailoverOperation(ctx, r.K8sClient, r.Log, instance) + if err != nil { + return ctrl.Result{RequeueAfter: time.Second * 10}, err } - return ctrl.Result{RequeueAfter: time.Second * 120}, nil } // Check If there is No Empty Master Node diff --git a/k8sutils/cluster-scaling.go b/k8sutils/cluster-scaling.go index b0a384c02..395a5c942 100644 --- a/k8sutils/cluster-scaling.go +++ b/k8sutils/cluster-scaling.go @@ -11,8 +11,10 @@ import ( "k8s.io/client-go/kubernetes" ) -// Reshard the redis Cluster -func ReshardRedisCluster(client kubernetes.Interface, logger logr.Logger, cr *redisv1beta2.RedisCluster) { +// ReshardRedisCluster transfer the slots from the last node to the first node. +// +// NOTE: when all slot been transferred, the node become slave of the first master node. +func ReshardRedisCluster(client kubernetes.Interface, logger logr.Logger, cr *redisv1beta2.RedisCluster, remove bool) { ctx := context.TODO() var cmd []string currentRedisCount := CheckRedisNodeCount(ctx, client, logger, cr, "leader") @@ -72,6 +74,10 @@ func ReshardRedisCluster(client kubernetes.Interface, logger logr.Logger, cr *re return } executeCommand(client, logger, cr, cmd, cr.ObjectMeta.Name+"-leader-0") + + if remove { + RemoveRedisNodeFromCluster(ctx, client, logger, cr, removePOD) + } } func getRedisClusterSlots(ctx context.Context, client kubernetes.Interface, logger logr.Logger, cr *redisv1beta2.RedisCluster, nodeID string) string { @@ -336,18 +342,18 @@ func RemoveRedisFollowerNodesFromCluster(ctx context.Context, client kubernetes. 
 }
 
 // Remove redis cluster node would remove last node to the existing redis cluster using redis-cli
-func RemoveRedisNodeFromCluster(ctx context.Context, client kubernetes.Interface, logger logr.Logger, cr *redisv1beta2.RedisCluster) {
+func RemoveRedisNodeFromCluster(ctx context.Context, client kubernetes.Interface, logger logr.Logger, cr *redisv1beta2.RedisCluster, removePod RedisDetails) {
     var cmd []string
-    currentRedisCount := CheckRedisNodeCount(ctx, client, logger, cr, "leader")
+    //currentRedisCount := CheckRedisNodeCount(ctx, client, logger, cr, "leader")
 
     existingPod := RedisDetails{
         PodName:   cr.ObjectMeta.Name + "-leader-0",
         Namespace: cr.Namespace,
     }
-    removePod := RedisDetails{
-        PodName:   cr.ObjectMeta.Name + "-leader-" + strconv.Itoa(int(currentRedisCount)-1),
-        Namespace: cr.Namespace,
-    }
+    //removePod := RedisDetails{
+    //	PodName: cr.ObjectMeta.Name + "-leader-" + strconv.Itoa(int(currentRedisCount)-1),
+    //	Namespace: cr.Namespace,
+    //}
 
     cmd = []string{"redis-cli", "--cluster", "del-node"}
 
diff --git a/k8sutils/redis.go b/k8sutils/redis.go
index 51cb0a4e3..0ec4433e9 100644
--- a/k8sutils/redis.go
+++ b/k8sutils/redis.go
@@ -406,20 +406,20 @@ func getContainerID(client kubernetes.Interface, logger logr.Logger, cr *redisv1
         return -1, nil
     }
 
-    logger.Info("Pod info retrieved successfully", "Pod Name", podName, "Namespace", cr.Namespace)
+    logger.V(1).Info("Pod info retrieved successfully", "Pod Name", podName, "Namespace", cr.Namespace)
 
     targetContainer := -1
     for containerID, tr := range pod.Spec.Containers {
         logger.V(1).Info("Inspecting container", "Pod Name", podName, "Container ID", containerID, "Container Name", tr.Name)
         if tr.Name == cr.ObjectMeta.Name+"-leader" {
             targetContainer = containerID
-            logger.Info("Leader container found", "Container ID", containerID, "Container Name", tr.Name)
+            logger.V(1).Info("Leader container found", "Container ID", containerID, "Container Name", tr.Name)
             break
         }
     }
 
     if targetContainer == -1 {
-        logger.Info("Leader container not found in pod", "Pod Name", podName)
+        logger.V(1).Info("Leader container not found in pod", "Pod Name", podName)
         return -1, nil
     }
 
diff --git a/tests/_config/kuttl-test.yaml b/tests/_config/kuttl-test.yaml
index 9630d9f51..984bba05e 100644
--- a/tests/_config/kuttl-test.yaml
+++ b/tests/_config/kuttl-test.yaml
@@ -3,10 +3,11 @@ kind: TestSuite
 startKIND: false
 kindConfig: "./kind-config.yaml"
 parallel: 1
-timeout: 300
+timeout: 1200
 testDirs:
   - tests/e2e/v1beta2/setup
   - tests/e2e/v1beta2/teardown
   - tests/e2e/v1beta2/ignore-annots
+  - tests/e2e/v1beta2/scaling
 suppress :
   - events
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/00-install.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/00-install.yaml
new file mode 100644
index 000000000..7529f3a78
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/00-install.yaml
@@ -0,0 +1,7 @@
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+apply :
+  - cluster.yaml
+assert :
+  - ready-cluster.yaml
+  - ready-sts.yaml
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/01-scale-up.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/01-scale-up.yaml
new file mode 100644
index 000000000..7272499a2
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/01-scale-up.yaml
@@ -0,0 +1,7 @@
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+apply :
+  - cluster-scale-up.yaml
+assert :
+  - ready-cluster-scale-up.yaml
+  - ready-sts-scale-up.yaml
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/02-scale-down.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/02-scale-down.yaml
new file mode 100644
index 000000000..60d8821f1
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/02-scale-down.yaml
@@ -0,0 +1,7 @@
+apiVersion: kuttl.dev/v1beta1
+kind: TestStep
+apply :
+  - cluster.yaml
+assert :
+  - ready-cluster.yaml
+  - ready-sts.yaml
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/cluster-scale-up.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/cluster-scale-up.yaml
new file mode 100644
index 000000000..5f64363e8
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/cluster-scale-up.yaml
@@ -0,0 +1,47 @@
+apiVersion: redis.redis.opstreelabs.in/v1beta2
+kind: RedisCluster
+metadata:
+  name: redis-cluster-v1beta2
+spec:
+  clusterSize: 6
+  clusterVersion: v7
+  persistenceEnabled: true
+  podSecurityContext:
+    runAsUser: 1000
+    fsGroup: 1000
+  kubernetesConfig:
+    image: quay.io/opstree/redis:latest
+    imagePullPolicy: Always
+    resources:
+      requests:
+        cpu: 101m
+        memory: 128Mi
+      limits:
+        cpu: 101m
+        memory: 128Mi
+  redisExporter:
+    enabled: true
+    image: quay.io/opstree/redis-exporter:v1.44.0
+    imagePullPolicy: Always
+    resources:
+      requests:
+        cpu: 100m
+        memory: 128Mi
+      limits:
+        cpu: 100m
+        memory: 128Mi
+  storage:
+    volumeClaimTemplate:
+      spec:
+        # storageClassName: standard
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 1Gi
+    nodeConfVolume: true
+    nodeConfVolumeClaimTemplate:
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 1Gi
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/cluster.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/cluster.yaml
new file mode 100644
index 000000000..ecf147104
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/cluster.yaml
@@ -0,0 +1,47 @@
+apiVersion: redis.redis.opstreelabs.in/v1beta2
+kind: RedisCluster
+metadata:
+  name: redis-cluster-v1beta2
+spec:
+  clusterSize: 3
+  clusterVersion: v7
+  persistenceEnabled: true
+  podSecurityContext:
+    runAsUser: 1000
+    fsGroup: 1000
+  kubernetesConfig:
+    image: quay.io/opstree/redis:latest
+    imagePullPolicy: Always
+    resources:
+      requests:
+        cpu: 101m
+        memory: 128Mi
+      limits:
+        cpu: 101m
+        memory: 128Mi
+  redisExporter:
+    enabled: true
+    image: quay.io/opstree/redis-exporter:v1.44.0
+    imagePullPolicy: Always
+    resources:
+      requests:
+        cpu: 100m
+        memory: 128Mi
+      limits:
+        cpu: 100m
+        memory: 128Mi
+  storage:
+    volumeClaimTemplate:
+      spec:
+        # storageClassName: standard
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 1Gi
+    nodeConfVolume: true
+    nodeConfVolumeClaimTemplate:
+      spec:
+        accessModes: ["ReadWriteOnce"]
+        resources:
+          requests:
+            storage: 1Gi
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster-scale-up.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster-scale-up.yaml
new file mode 100644
index 000000000..907e75ccc
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster-scale-up.yaml
@@ -0,0 +1,8 @@
+apiVersion: redis.redis.opstreelabs.in/v1beta2
+kind: RedisCluster
+metadata:
+  name: redis-cluster-v1beta2
+status:
+  state: Ready
+  readyLeaderReplicas: 6
+  readyFollowerReplicas: 6
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster.yaml
new file mode 100644
index 000000000..3a08c80b6
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/ready-cluster.yaml
@@ -0,0 +1,8 @@
+apiVersion: redis.redis.opstreelabs.in/v1beta2
+kind: RedisCluster
+metadata:
+  name: redis-cluster-v1beta2
+status:
+  state: Ready
+  readyLeaderReplicas: 3
+  readyFollowerReplicas: 3
\ No newline at end of file
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts-scale-up.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts-scale-up.yaml
new file mode 100644
index 000000000..61d865e0f
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts-scale-up.yaml
@@ -0,0 +1,23 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: redis-cluster-v1beta2-leader
+  labels:
+    app: redis-cluster-v1beta2-leader
+    redis_setup_type: cluster
+    role: leader
+status:
+  replicas: 6
+  readyReplicas: 6
+---
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: redis-cluster-v1beta2-follower
+  labels:
+    app: redis-cluster-v1beta2-follower
+    redis_setup_type: cluster
+    role: follower
+status:
+  replicas: 6
+  readyReplicas: 6
diff --git a/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts.yaml b/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts.yaml
new file mode 100644
index 000000000..1053eb784
--- /dev/null
+++ b/tests/e2e/v1beta2/scaling/redis-cluster/ready-sts.yaml
@@ -0,0 +1,25 @@
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: redis-cluster-v1beta2-leader
+  labels:
+    app: redis-cluster-v1beta2-leader
+    redis_setup_type: cluster
+    role: leader
+status:
+  replicas: 3
+  readyReplicas: 3
+
+---
+
+apiVersion: apps/v1
+kind: StatefulSet
+metadata:
+  name: redis-cluster-v1beta2-follower
+  labels:
+    app: redis-cluster-v1beta2-follower
+    redis_setup_type: cluster
+    role: follower
+status:
+  replicas: 3
+  readyReplicas: 3
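
Note for reviewers (not part of the patch): a minimal, self-contained Go sketch of the downscale order this change introduces in Reconcile. The stub functions below are hypothetical placeholders for the k8sutils helpers (VerifyLeaderPod, ClusterFailover, RemoveRedisFollowerNodesFromCluster, ReshardRedisCluster, RebalanceRedisCluster); only the control flow mirrors the patched loop: per removed leader, ensure the pod is a master, drop its follower, reshard its slots to leader-0 and delete it, then rebalance once at the end.

package main

import "fmt"

// Hypothetical stand-ins for the k8sutils helpers touched by this patch; they
// only print what the real helpers would do against the Redis cluster.
func verifyLeaderPod() bool { fmt.Println("check: is the highest-index leader pod a master?"); return false }
func clusterFailover()      { fmt.Println("failover: promote the departing pod to master first") }
func removeFollower()       { fmt.Println("step 1: del-node the follower of the departing leader") }
func reshardToLeaderZero(remove bool) {
	fmt.Println("step 2: move all slots to leader-0; delete the emptied node afterwards:", remove)
}
func rebalanceCluster() { fmt.Println("step 3: redis-cli --cluster rebalance") }

// downscale mirrors the loop added to Reconcile: one leader is drained per
// iteration, and the cluster is rebalanced once after the loop.
func downscale(readyLeaders, desiredLeaders int32) {
	for i := int32(0); i < readyLeaders-desiredLeaders; i++ {
		if !verifyLeaderPod() {
			clusterFailover()
		}
		removeFollower()
		reshardToLeaderZero(true)
	}
	rebalanceCluster()
}

func main() {
	// The scale-down e2e case in this patch goes from 6 leaders back to 3.
	downscale(6, 3)
}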