Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Delay repair of a rebooting unreachable machine #732

Merged
merged 2 commits into from
May 23, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

### Added

- Add sabakan-triggered automatic repair functionality in [#725](https://github.com/cybozu-go/cke/pull/725)
- Add sabakan-triggered automatic repair functionality in [#725](https://github.com/cybozu-go/cke/pull/725) and [#732](https://github.com/cybozu-go/cke/pull/732)

### Fixed

Expand Down
2 changes: 2 additions & 0 deletions constraints.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type Constraints struct {
MaximumWorkers int `json:"maximum-workers"`
RebootMaximumUnreachable int `json:"maximum-unreachable-nodes-for-reboot"`
MaximumRepairs int `json:"maximum-repair-queue-entries"`
RepairRebootingSeconds int `json:"wait-seconds-to-repair-rebooting"`
}

// Check checks the cluster satisfies the constraints
Expand Down Expand Up @@ -43,5 +44,6 @@ func DefaultConstraints() *Constraints {
MaximumWorkers: 0,
RebootMaximumUnreachable: 0,
MaximumRepairs: 0,
RepairRebootingSeconds: 0,
}
}
1 change: 1 addition & 0 deletions docs/ckecli.md
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ Set a constraint on the cluster configuration.
- `maximum-workers`
- `maximum-unreachable-nodes-for-reboot`
- `maximum-repair-queue-entries`
- `wait-seconds-to-repair-rebooting`

### `ckecli constraints show`

Expand Down
1 change: 1 addition & 0 deletions docs/constraints.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,4 @@ Cluster should satisfy these constraints.
| `maximum-workers` | int | 0 | The maximum number of worker nodes. 0 means unlimited. |
| `maximum-unreachable-nodes-for-reboot` | int | 0 | The maximum number of unreachable nodes allowed for operating reboot. |
| `maximum-repair-queue-entries` | int | 0 | The maximum number of repair queue entries |
| `wait-seconds-to-repair-rebooting` | int | 0 | The wait time in seconds before repairing a rebooting machine. |
6 changes: 6 additions & 0 deletions docs/sabakan-triggered-repair.md
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,12 @@ The maximum number of recent repair queue entries and new failure reports is [co

As stated above, CKE considers all persisting queue entries as "recent" for simplicity.

### Limiter for planned reboot

A machine may become "UNREACHABLE" very quickly even if it is [being rebooted in a planned manner](reboot.md).
CKE should wait for a while before starting repair operations for a rebooting machine.

A user can [configure the wait time](ckecli.md#ckecli-constraints-set-name-value) as a [constraint `wait-seconds-to-repair-rebooting`](constraints.md).

[sabakan]: https://github.com/cybozu-go/sabakan
[schema]: https://github.com/cybozu-go/sabakan/blob/main/gql/graph/schema.graphqls
5 changes: 5 additions & 0 deletions pkg/ckecli/cmd/constraints_set.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ NAME is one of:
maximum-workers
maximum-unreachable-nodes-for-reboot
maximum-repair-queue-entries
wait-seconds-to-repair-rebooting

VALUE is an integer.`,

Expand Down Expand Up @@ -58,6 +59,10 @@ VALUE is an integer.`,
cstrSet = func(cstr *cke.Constraints) {
cstr.MaximumRepairs = val
}
case "wait-seconds-to-repair-rebooting":
cstrSet = func(cstr *cke.Constraints) {
cstr.RepairRebootingSeconds = val
}
default:
return errors.New("no such constraint: " + args[0])
}
Expand Down
8 changes: 7 additions & 1 deletion sabakan/integrate.go
Original file line number Diff line number Diff line change
Expand Up @@ -174,12 +174,18 @@ func (ig integrator) runRepairer(ctx context.Context, clusterStatus *cke.Cluster
return nil
}

rebootEntries, err := st.GetRebootsEntries(ctx)
if err != nil {
return err
}
rebootEntries = cke.DedupRebootQueueEntries(rebootEntries)

constraints, err := st.GetConstraints(ctx)
if err != nil {
return err
}

entries := Repairer(machines, clusterStatus.RepairQueue.Entries, constraints)
entries := Repairer(machines, clusterStatus.RepairQueue.Entries, rebootEntries, constraints, time.Now().UTC())

for _, entry := range entries {
err := st.RegisterRepairsEntry(ctx, entry)
Expand Down
24 changes: 21 additions & 3 deletions sabakan/repairer.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,18 +2,27 @@ package sabakan

import (
"strings"
"time"

"github.com/cybozu-go/cke"
"github.com/cybozu-go/log"
)

func Repairer(machines []Machine, entries []*cke.RepairQueueEntry, constraints *cke.Constraints) []*cke.RepairQueueEntry {
func Repairer(machines []Machine, repairEntries []*cke.RepairQueueEntry, rebootEntries []*cke.RebootQueueEntry, constraints *cke.Constraints, now time.Time) []*cke.RepairQueueEntry {
recent := make(map[string]bool)
for _, entry := range entries {
for _, entry := range repairEntries {
// entry.Operation is ignored when checking duplication
recent[entry.Address] = true
}

rebootLimit := now.Add(time.Duration(-constraints.RepairRebootingSeconds) * time.Second)
rebootingSince := make(map[string]time.Time)
for _, entry := range rebootEntries {
if entry.Status == cke.RebootStatusRebooting {
rebootingSince[entry.Node] = entry.LastTransitionTime // entry.Node denotes IP address
}
}

newMachines := make([]Machine, 0, len(machines))
for _, machine := range machines {
if len(machine.Spec.IPv4) == 0 {
Expand All @@ -31,10 +40,19 @@ func Repairer(machines []Machine, entries []*cke.RepairQueueEntry, constraints *
continue
}

since, ok := rebootingSince[machine.Spec.IPv4[0]]
if ok && since.After(rebootLimit) && machine.Status.State == StateUnreachable {
log.Info("ignore rebooting unreachable machine", map[string]interface{}{
"serial": machine.Spec.Serial,
"address": machine.Spec.IPv4[0],
})
continue
}

newMachines = append(newMachines, machine)
}

if len(entries)+len(newMachines) > constraints.MaximumRepairs {
if len(repairEntries)+len(newMachines) > constraints.MaximumRepairs {
log.Warn("ignore too many repair requests", nil)
return nil
}
Expand Down
56 changes: 52 additions & 4 deletions sabakan/repairer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package sabakan

import (
"testing"
"time"

"github.com/cybozu-go/cke"
"github.com/google/go-cmp/cmp"
Expand All @@ -10,75 +11,122 @@ import (

func TestRepairer(t *testing.T) {
constraints := &cke.Constraints{
MaximumRepairs: 3,
MaximumRepairs: 3,
RepairRebootingSeconds: 300,
}

machines := []Machine{
{Spec: MachineSpec{Serial: "0000"}, Status: MachineStatus{State: StateUnhealthy}},
{Spec: MachineSpec{Serial: "1111", IPv4: []string{"1.1.1.1"}, BMC: BMC{Type: "type1"}}, Status: MachineStatus{State: StateUnhealthy}},
{Spec: MachineSpec{Serial: "1111", IPv4: []string{"1.1.1.1"}, BMC: BMC{Type: "type1"}}, Status: MachineStatus{State: StateUnreachable}},
{Spec: MachineSpec{Serial: "2222", IPv4: []string{"2.2.2.2"}, BMC: BMC{Type: "type2"}}, Status: MachineStatus{State: StateUnhealthy}},
{Spec: MachineSpec{Serial: "3333", IPv4: []string{"3.3.3.3"}, BMC: BMC{Type: "type3"}}, Status: MachineStatus{State: StateUnreachable}},
{Spec: MachineSpec{Serial: "4444", IPv4: []string{"4.4.4.4"}, BMC: BMC{Type: "type4"}}, Status: MachineStatus{State: StateUnreachable}},
}

entries := []*cke.RepairQueueEntry{
nil,
cke.NewRepairQueueEntry("unhealthy", "type1", "1.1.1.1"),
cke.NewRepairQueueEntry("unreachable", "type1", "1.1.1.1"),
cke.NewRepairQueueEntry("unhealthy", "type2", "2.2.2.2"),
cke.NewRepairQueueEntry("unreachable", "type3", "3.3.3.3"),
cke.NewRepairQueueEntry("unreachable", "type4", "4.4.4.4"),
}

now := time.Now().UTC()
recent := now.Add(-30 * time.Second)
stale := now.Add(-3000 * time.Second)
rebootEntries := []*cke.RebootQueueEntry{
nil,
{Node: "1.1.1.1", Status: cke.RebootStatusRebooting, LastTransitionTime: recent},
{Node: "2.2.2.2", Status: cke.RebootStatusRebooting, LastTransitionTime: recent},
{Node: "3.3.3.3", Status: cke.RebootStatusRebooting, LastTransitionTime: stale},
{Node: "4.4.4.4", Status: cke.RebootStatusDraining, LastTransitionTime: recent},
}

tests := []struct {
name string
failedMachines []Machine
queuedEntries []*cke.RepairQueueEntry
rebootEntries []*cke.RebootQueueEntry
expectedEntries []*cke.RepairQueueEntry
}{
{
name: "NoFailedMachine",
failedMachines: []Machine{},
queuedEntries: []*cke.RepairQueueEntry{entries[2]},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{},
},
{
name: "OneFailedMachine",
failedMachines: []Machine{machines[1]},
queuedEntries: []*cke.RepairQueueEntry{entries[2]},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{entries[1]},
},
{
name: "IgnoreNoIPAddress",
failedMachines: []Machine{machines[0], machines[1]},
queuedEntries: []*cke.RepairQueueEntry{entries[2]},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{entries[1]},
},
{
name: "IgnoreRecentlyRepaired",
failedMachines: []Machine{machines[1], machines[2], machines[3]},
queuedEntries: []*cke.RepairQueueEntry{entries[2]},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{entries[1], entries[3]},
},
{
name: "IgnoreRecentlyRepairedWithDifferentOperation",
failedMachines: []Machine{machines[1], machines[2], machines[3]},
queuedEntries: []*cke.RepairQueueEntry{cke.NewRepairQueueEntry("unreachable", "type2", "2.2.2.2")},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{entries[1], entries[3]},
},
{
name: "IgnoreTooManyFailedMachines",
failedMachines: []Machine{machines[1], machines[2], machines[3]},
queuedEntries: []*cke.RepairQueueEntry{entries[2], entries[4]},
rebootEntries: nil,
expectedEntries: []*cke.RepairQueueEntry{},
},
{
name: "IgnoreRebootingUnreachableMachine",
failedMachines: []Machine{machines[1]},
queuedEntries: []*cke.RepairQueueEntry{},
rebootEntries: []*cke.RebootQueueEntry{rebootEntries[1]},
expectedEntries: []*cke.RepairQueueEntry{},
},
{
name: "RebootingButUnhealthy",
failedMachines: []Machine{machines[2]},
queuedEntries: []*cke.RepairQueueEntry{},
rebootEntries: []*cke.RebootQueueEntry{rebootEntries[2]},
expectedEntries: []*cke.RepairQueueEntry{entries[2]},
},
{
name: "RebootingButStale",
failedMachines: []Machine{machines[3]},
queuedEntries: []*cke.RepairQueueEntry{},
rebootEntries: []*cke.RebootQueueEntry{rebootEntries[3]},
expectedEntries: []*cke.RepairQueueEntry{entries[3]},
},
{
name: "QueuedButNotRebooting",
failedMachines: []Machine{machines[4]},
queuedEntries: []*cke.RepairQueueEntry{},
rebootEntries: []*cke.RebootQueueEntry{rebootEntries[4]},
expectedEntries: []*cke.RepairQueueEntry{entries[4]},
},
}

for _, tt := range tests {
tt := tt
t.Run(tt.name, func(t *testing.T) {
t.Parallel()

entries := Repairer(tt.failedMachines, tt.queuedEntries, constraints)
entries := Repairer(tt.failedMachines, tt.queuedEntries, tt.rebootEntries, constraints, now)
if !cmp.Equal(entries, tt.expectedEntries, cmpopts.EquateEmpty()) {
t.Errorf("!cmp.Equal(entries, tt.newEntries), actual: %v, expected: %v", entries, tt.expectedEntries)
}
Expand Down
2 changes: 1 addition & 1 deletion server/strategy.go
Original file line number Diff line number Diff line change
Expand Up @@ -704,8 +704,8 @@ func repairOps(c *cke.Cluster, cs *cke.ClusterStatus, constraints *cke.Constrain

// Sort/filter entries to limit the number of concurrent repairs.
// - Entries being deleted are dequeued unconditionally.
// - Entries just repaired are moved to succeeded status unconditionally.
// - Succeeded/failed entries are left unchanged.
// - Entries just repaired are moved to succeeded status.
// - Entries already being processed have higher priority than newly queued entries.
// - Entries waiting for unexpired drain-retry-timeout are filtered out.
// - Other types of timeout-wait are considered as "being processed" and
Expand Down