From f147a4098ddb4c54164b4b61f2d1fda89432b2dd Mon Sep 17 00:00:00 2001 From: wangyelei Date: Thu, 6 Feb 2025 11:05:10 +0800 Subject: [PATCH] fix: switchover failed (#546) Co-authored-by: wangyelei (cherry picked from commit 99ff4bfe5f06aee3eb8968dff932f34544b34682) --- docs/user_docs/cli/kbcli_cluster_promote.md | 1 + go.mod | 2 +- go.sum | 4 +- .../template/cluster_operations_template.cue | 2 +- pkg/cluster/cluster.go | 1 - pkg/cmd/cluster/operations.go | 56 +++++++++++++------ pkg/cmd/cluster/operations_test.go | 12 ++-- pkg/cmd/componentdefinition/describe.go | 9 ++- pkg/testing/fake.go | 21 +++---- 9 files changed, 64 insertions(+), 44 deletions(-) diff --git a/docs/user_docs/cli/kbcli_cluster_promote.md b/docs/user_docs/cli/kbcli_cluster_promote.md index a76bb1548..ad6f60d2f 100644 --- a/docs/user_docs/cli/kbcli_cluster_promote.md +++ b/docs/user_docs/cli/kbcli_cluster_promote.md @@ -24,6 +24,7 @@ kbcli cluster promote NAME [--instance ] [flags] --edit Edit the API resource before creating --force skip the pre-checks of the opsRequest to run the opsRequest forcibly -h, --help help for promote + --instance string Specify the instance name that will transfer its role to the candidate pod, If not set, the current primary or leader of the cluster will be used. --name string OpsRequest name. if not specified, it will be randomly generated -o, --output format Prints the output in the specified format. Allowed values: JSON and YAML (default yaml) --ttlSecondsAfterSucceed int Time to live after the OpsRequest succeed diff --git a/go.mod b/go.mod index 5bcbaceb2..97bf3e0c1 100644 --- a/go.mod +++ b/go.mod @@ -8,7 +8,7 @@ require ( github.com/Masterminds/semver/v3 v3.3.0 github.com/NimbleMarkets/ntcharts v0.1.2 github.com/apecloud/dbctl v0.0.0-20240827084000-68a1980b1a46 - github.com/apecloud/kubeblocks v1.0.0-beta.23 + github.com/apecloud/kubeblocks v1.0.0-beta.25 github.com/asaskevich/govalidator v0.0.0-20230301143203-a9d515a09cc2 github.com/briandowns/spinner v1.23.0 github.com/chaos-mesh/chaos-mesh/api v0.0.0-20230912020346-a5d89c1c90ad diff --git a/go.sum b/go.sum index 59ecb474a..4f6eaeb14 100644 --- a/go.sum +++ b/go.sum @@ -677,8 +677,8 @@ github.com/apache/arrow/go/v11 v11.0.0/go.mod h1:Eg5OsL5H+e299f7u5ssuXsuHQVEGC4x github.com/apache/thrift v0.16.0/go.mod h1:PHK3hniurgQaNMZYaCLEqXKsYK8upmhPbmdP2FXSqgU= github.com/apecloud/dbctl v0.0.0-20240827084000-68a1980b1a46 h1:+Jcc7IjDGxPgIfIkGX2Q5Yxj35U65zgcfjh0B9rDhjo= github.com/apecloud/dbctl v0.0.0-20240827084000-68a1980b1a46/go.mod h1:eksJtZ7z1nVcVLqDzAdcN5EfpHwXllIAvHZEks2zWys= -github.com/apecloud/kubeblocks v1.0.0-beta.23 h1:JrQBB9gJ/jtMD9wHv5js26rdfNQgeoU5+GcUEiaAmFU= -github.com/apecloud/kubeblocks v1.0.0-beta.23/go.mod h1:b656nTyvHhwRwOuwNpOPG87Q0Lba3ygGRWoSOacPt5o= +github.com/apecloud/kubeblocks v1.0.0-beta.25 h1:wWnvGSt/0emzvnbKSIVUQ5/nB9IeRrsgtsqFsu2Jfis= +github.com/apecloud/kubeblocks v1.0.0-beta.25/go.mod h1:b656nTyvHhwRwOuwNpOPG87Q0Lba3ygGRWoSOacPt5o= github.com/apparentlymart/go-textseg v1.0.0/go.mod h1:z96Txxhf3xSFMPmb5X/1W05FF/Nj9VFpLOpjS5yuumk= github.com/apparentlymart/go-textseg/v13 v13.0.0/go.mod h1:ZK2fH7c4NqDTLtiYLvIkEghdlcqw7yxLeM89kiTRPUo= github.com/armon/go-socks5 v0.0.0-20160902184237-e75332964ef5 h1:0CwZNZbxp69SHPdPJAN/hZIm0C4OItdklCFmMRWYpio= diff --git a/pkg/action/template/cluster_operations_template.cue b/pkg/action/template/cluster_operations_template.cue index abc685414..fc37fdae9 100644 --- a/pkg/action/template/cluster_operations_template.cue +++ b/pkg/action/template/cluster_operations_template.cue @@ -262,7 +262,7 @@ content: { switchover: [{ componentObjectName: options.componentObjectName instanceName: options.instance - candidateName: options.instance + candidateName: options.candidate }] } if options.type == "RebuildInstance" { diff --git a/pkg/cluster/cluster.go b/pkg/cluster/cluster.go index 29db32204..81a688edc 100644 --- a/pkg/cluster/cluster.go +++ b/pkg/cluster/cluster.go @@ -423,7 +423,6 @@ func (o *ClusterObjects) GetInstanceInfo() []*InstanceInfo { Component: componentName, Status: o.getPodPhase(&pod), Role: getLabelVal(pod.Labels, constant.RoleLabelKey), - AccessMode: getLabelVal(pod.Labels, constant.AccessModeLabelKey), CreatedTime: util.TimeFormat(&pod.CreationTimestamp), } var componentSpec *kbappsv1.ClusterComponentSpec diff --git a/pkg/cmd/cluster/operations.go b/pkg/cmd/cluster/operations.go index c5b9b305d..1a2f47a6f 100755 --- a/pkg/cmd/cluster/operations.go +++ b/pkg/cmd/cluster/operations.go @@ -22,6 +22,7 @@ package cluster import ( "context" "fmt" + "math" "strings" appsv1 "github.com/apecloud/kubeblocks/apis/apps/v1" @@ -111,6 +112,7 @@ type OperationsOptions struct { Component string `json:"component"` ComponentObjectName string `json:"componentObjectName,omitempty"` Instance string `json:"instance"` + Candidate string `json:"candidate"` BackupName string `json:"-"` Inplace bool `json:"-"` InstanceNames []string `json:"-"` @@ -438,54 +440,75 @@ func (o *OperationsOptions) validatePromote(clusterObj *appsv1.Cluster) error { podObj = &corev1.Pod{} ) - if o.Instance == "" { + if o.Candidate == "" { return fmt.Errorf("--candidate is required") } // if the instance is not specified, do not need to check the validity of the instance // checks the validity of the instance whether it belongs to the current component and ensure it is not the primary or leader instance currently. podKey := client.ObjectKey{ Namespace: clusterObj.Namespace, - Name: o.Instance, + Name: o.Candidate, } if err := util.GetResourceObjectFromGVR(types.PodGVR(), podKey, o.Dynamic, podObj); err != nil || podObj == nil { - return fmt.Errorf("instance %s not found, please check the validity of the instance using \"kbcli cluster list-instances\"", o.Instance) + return fmt.Errorf("instance %s not found, please check the validity of the instance using \"kbcli cluster list-instances\"", o.Candidate) } o.ComponentObjectName = constant.GenerateClusterComponentName(clusterObj.Name, podObj.Labels[constant.KBAppComponentLabelKey]) getAndValidatePod := func(targetRoles ...string) error { // if the instance is not specified, do not need to check the validity of the instance - if o.Instance == "" { + if o.Candidate == "" { return nil } v, ok := podObj.Labels[constant.RoleLabelKey] if !ok || v == "" { - return fmt.Errorf("instance %s cannot be promoted because it had a invalid role label", o.Instance) + return fmt.Errorf("instance %s cannot be promoted because it had a invalid role label", o.Candidate) } for _, targetRole := range targetRoles { if v == targetRole { - return fmt.Errorf("instanceName %s cannot be promoted because it is already the targetRole %s instance", o.Instance, targetRole) + return fmt.Errorf("instanceName %s cannot be promoted because it is already the targetRole %s instance", o.Candidate, targetRole) } } if podObj.Labels[constant.AppInstanceLabelKey] != clusterObj.Name { - return fmt.Errorf("instanceName %s does not belong to the current cluster, please check the validity of the instance using \"kbcli cluster list-instances\"", o.Instance) + return fmt.Errorf("instanceName %s does not belong to the current cluster, please check the validity of the instance using \"kbcli cluster list-instances\"", o.Candidate) } return nil } getTargetRole := func(roles []appsv1.ReplicaRole) (string, error) { + pods, err := o.Client.CoreV1().Pods(clusterObj.Namespace).List(context.Background(), metav1.ListOptions{ + LabelSelector: fmt.Sprintf("%s=%s", constant.KBAppComponentLabelKey, podObj.Labels[constant.KBAppComponentLabelKey]), + }) + if err != nil { + return "", err + } + if o.Instance != "" { + for _, pod := range pods.Items { + if pod.Name == o.Instance { + return pod.Labels[constant.RoleLabelKey], nil + } + } + return "", fmt.Errorf("instance %s not found, please check the validity of the instance using \"kbcli cluster list-instances\"", o.Instance) + } targetRole := "" if len(roles) == 0 { return targetRole, fmt.Errorf("component has no roles definition, does not support switchover") } - for _, role := range roles { - if role.Serviceable && role.Writable { - if targetRole != "" { - return targetRole, fmt.Errorf("componentDefinition has more than role is serviceable and writable, does not support switchover") - } - targetRole = role.Name + // HACK: assume the role with highest priority to be writable + highestPriority := math.MinInt + var role *appsv1.ReplicaRole + for _, r := range compDefObj.Spec.Roles { + if r.UpdatePriority > highestPriority { + highestPriority = r.UpdatePriority + role = &r + } + } + for _, pod := range pods.Items { + if pod.Labels[constant.RoleLabelKey] == role.Name { + o.Instance = pod.Name + break } } - return targetRole, nil + return role.Name, nil } // check componentDefinition exist @@ -497,7 +520,7 @@ func (o *OperationsOptions) validatePromote(clusterObj *appsv1.Cluster) error { return err } if compDefObj.Spec.LifecycleActions == nil || compDefObj.Spec.LifecycleActions.Switchover == nil { - return fmt.Errorf(`this instance "%s does not support switchover, you can define the switchover action in the componentDef "%s"`, o.Instance, compDefKey.Name) + return fmt.Errorf(`this instance "%s does not support switchover, you can define the switchover action in the componentDef "%s"`, o.Candidate, compDefKey.Name) } targetRole, err := getTargetRole(compDefObj.Spec.Roles) if err != nil { @@ -1008,7 +1031,8 @@ func NewPromoteCmd(f cmdutil.Factory, streams genericiooptions.IOStreams) *cobra cmdutil.CheckErr(o.Run()) }, } - cmd.Flags().StringVar(&o.Instance, "candidate", "", "Specify the instance name as the new primary or leader of the cluster, you can get the instance name by running \"kbcli cluster list-instances\"") + cmd.Flags().StringVar(&o.Candidate, "candidate", "", "Specify the instance name as the new primary or leader of the cluster, you can get the instance name by running \"kbcli cluster list-instances\"") + cmd.Flags().StringVar(&o.Instance, "instance", "", "Specify the instance name that will transfer its role to the candidate pod, If not set, the current primary or leader of the cluster will be used.") cmd.Flags().BoolVar(&o.AutoApprove, "auto-approve", false, "Skip interactive approval before promote the instance") _ = cmd.MarkFlagRequired("candidate") o.addCommonFlags(cmd, f) diff --git a/pkg/cmd/cluster/operations_test.go b/pkg/cmd/cluster/operations_test.go index bb4f351c5..e8de94f32 100644 --- a/pkg/cmd/cluster/operations_test.go +++ b/pkg/cmd/cluster/operations_test.go @@ -359,17 +359,17 @@ var _ = Describe("operations", func() { By("validate failed because o.Instance is illegal ") o.Name = clusterName1 o.Component = testing.ComponentName - o.Instance = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 5) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 5) Expect(o.Validate()).ShouldNot(Succeed()) Expect(testing.ContainExpectStrings(o.Validate().Error(), "not found")).Should(BeTrue()) By("validate failed because o.Instance is already leader and cannot be promoted") - o.Instance = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 0) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 0) Expect(o.Validate()).ShouldNot(Succeed()) Expect(testing.ContainExpectStrings(o.Validate().Error(), "cannot be promoted because it is already the targetRole")).Should(BeTrue()) By("validate failed because o.Instance does not belong to the current component") - o.Instance = fmt.Sprintf("%s-%s-%d", clusterName, testing.ComponentName, 1) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterName, testing.ComponentName, 1) o.Name = clusterName1 Expect(o.Validate()).ShouldNot(Succeed()) Expect(testing.ContainExpectStrings(o.Validate().Error(), "does not belong to the current cluster")).Should(BeTrue()) @@ -394,17 +394,17 @@ var _ = Describe("operations", func() { By("validate failed because o.Instance is illegal ") o.Name = clusterNameWithCompDef - o.Instance = fmt.Sprintf("%s-%s-%d", clusterNameWithCompDef, testing.ComponentName, 5) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterNameWithCompDef, testing.ComponentName, 5) Expect(o.Validate()).ShouldNot(Succeed()) Expect(testing.ContainExpectStrings(o.Validate().Error(), "not found")).Should(BeTrue()) By("validate failed because o.Instance is already leader and cannot be promoted") - o.Instance = fmt.Sprintf("%s-%s-%d", clusterNameWithCompDef, testing.ComponentName, 0) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterNameWithCompDef, testing.ComponentName, 0) Expect(o.Validate()).ShouldNot(Succeed()) Expect(testing.ContainExpectStrings(o.Validate().Error(), "cannot be promoted because it is already the targetRole")).Should(BeTrue()) By("validate failed because o.Instance does not belong to the current cluster") - o.Instance = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 1) + o.Candidate = fmt.Sprintf("%s-%s-%d", clusterName1, testing.ComponentName, 1) o.Component = testing.ComponentName o.Name = clusterName Expect(o.Validate()).ShouldNot(Succeed()) diff --git a/pkg/cmd/componentdefinition/describe.go b/pkg/cmd/componentdefinition/describe.go index f2e46f87c..1cb4df6ee 100644 --- a/pkg/cmd/componentdefinition/describe.go +++ b/pkg/cmd/componentdefinition/describe.go @@ -198,9 +198,9 @@ func showRoles(roles []kbappsv1.ReplicaRole, out io.Writer) { } fmt.Fprintf(out, "Roles:\n") tbl := printer.NewTablePrinter(out) - tbl.SetHeader("\tNAME", "SERVICE-ABLE", "WRITE-ABLE", "VOTE-ABLE") + tbl.SetHeader("\tNAME", "WITH-QUORUM", "UPDATE-PRIORITY") for _, role := range roles { - tbl.AddRow("\t"+role.Name, role.Serviceable, role.Writable, role.Votable) + tbl.AddRow("\t"+role.Name, role.ParticipatesInQuorum, role.UpdatePriority) } tbl.Print() fmt.Fprint(out, "\n") @@ -228,10 +228,9 @@ func showSystemAccounts(systemAccounts []kbappsv1.SystemAccount, out io.Writer) } fmt.Fprintf(out, "System Accounts:\n") tbl := printer.NewTablePrinter(out) - tbl.SetHeader("\tNAME", "INIT-ACCOUNT", "HAS-SECRET-REF") + tbl.SetHeader("\tNAME", "INIT-ACCOUNT") for _, sa := range systemAccounts { - hasSecretRef := sa.SecretRef != nil - tbl.AddRow("\t"+sa.Name, sa.InitAccount, hasSecretRef) + tbl.AddRow("\t"+sa.Name, sa.InitAccount) } tbl.Print() fmt.Fprint(out, "\n") diff --git a/pkg/testing/fake.go b/pkg/testing/fake.go index f28770ada..679459849 100644 --- a/pkg/testing/fake.go +++ b/pkg/testing/fake.go @@ -314,22 +314,19 @@ func FakeCompDef() *kbappsv1.ComponentDefinition { }, Roles: []kbappsv1.ReplicaRole{ { - Name: "leader", - Serviceable: true, - Writable: true, - Votable: true, + Name: "leader", + ParticipatesInQuorum: true, + UpdatePriority: 2, }, { - Name: "follower", - Serviceable: true, - Writable: false, - Votable: true, + Name: "follower", + ParticipatesInQuorum: true, + UpdatePriority: 1, }, { - Name: "learner", - Serviceable: false, - Writable: false, - Votable: false, + Name: "learner", + ParticipatesInQuorum: true, + UpdatePriority: 0, }, }, SystemAccounts: []kbappsv1.SystemAccount{