Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement repair queue #692

Merged
merged 5 commits into from
Jan 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/main.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
runs-on: ubuntu-22.04
strategy:
matrix:
suite: [functions, robustness, operators, reboot]
suite: [functions, robustness, operators, reboot, repair]
env:
SUITE: ${{ matrix.suite }}
CLUSTER: "cke-cluster.yml"
Expand Down
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,10 @@ This project employs a versioning scheme described in [RELEASE.md](RELEASE.md#ve

## [Unreleased]

### Added

- Implement repair queue in [#692](https://github.com/cybozu-go/cke/pull/692)

## [1.27.3]

### Changed
Expand Down
36 changes: 36 additions & 0 deletions cluster.go
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,41 @@ type Reboot struct {
const DefaultRebootEvictionTimeoutSeconds = 600
const DefaultMaxConcurrentReboots = 1

// Repair is the set of cluster-level configurations for the repair queue.
//
// Optional settings are pointers so that an omitted field (nil) can be
// told apart from an explicit value.
type Repair struct {
	// RepairProcedures is the list of repair procedures.
	RepairProcedures []RepairProcedure `json:"repair_procedures"`
	// MaxConcurrentRepairs is the maximum number of machines to be
	// repaired concurrently. Documented default: 1.
	MaxConcurrentRepairs *int `json:"max_concurrent_repairs,omitempty"`
	// ProtectedNamespaces is a label selector to protect namespaces.
	ProtectedNamespaces *metav1.LabelSelector `json:"protected_namespaces,omitempty"`
	// EvictRetries is the number of eviction retries, not including the
	// initial attempt. Documented default: 0.
	EvictRetries *int `json:"evict_retries,omitempty"`
	// EvictInterval is the time between eviction retries in seconds.
	// Documented default: 0.
	EvictInterval *int `json:"evict_interval,omitempty"`
	// EvictionTimeoutSeconds is the deadline for eviction in seconds.
	// It must be positive. Documented default: 600 (10 minutes).
	EvictionTimeoutSeconds *int `json:"eviction_timeout_seconds,omitempty"`
}

// RepairProcedure binds a set of machine types to the repair operations
// that can be applied to machines of those types.
type RepairProcedure struct {
	// MachineTypes lists the type names of the target machines to be
	// repaired by this procedure.
	MachineTypes []string `json:"machine_types"`
	// RepairOperations is the list of repair operations.
	RepairOperations []RepairOperation `json:"repair_operations"`
}

// RepairOperation is a named repair operation: a sequence of repair steps
// plus a command to check the health of the repaired machine.
type RepairOperation struct {
	// Operation is the name of the repair operation.
	Operation string `json:"operation"`
	// RepairSteps is the sequence of repair steps.
	RepairSteps []RepairStep `json:"repair_steps"`
	// HealthCheckCommand is a command (and its arguments) to check the
	// repaired machine's health.
	HealthCheckCommand []string `json:"health_check_command"`
	// CommandTimeoutSeconds is the deadline for the health check in
	// seconds. Zero means infinity. Documented default: 30.
	CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
}

// RepairStep is a single step of a repair operation.
//
// Optional numeric settings are pointers so that an omitted field (nil)
// can be told apart from an explicit zero.
type RepairStep struct {
	// RepairCommand is a command and its arguments to repair the target
	// machine.
	RepairCommand []string `json:"repair_command"`
	// CommandTimeoutSeconds is the deadline for repairing in seconds.
	// Zero means infinity. Documented default: 30.
	CommandTimeoutSeconds *int `json:"command_timeout_seconds,omitempty"`
	// CommandRetries is the number of repair retries, not including the
	// initial attempt. Documented default: 0.
	CommandRetries *int `json:"command_retries,omitempty"`
	// CommandInterval is the interval between repair retries in seconds.
	// Documented default: 0.
	CommandInterval *int `json:"command_interval,omitempty"`
	// NeedDrain requests a drain of Pods on the target machine prior to
	// the execution of the repair command. Documented default: false.
	NeedDrain bool `json:"need_drain,omitempty"`
	// WatchSeconds is the follow-up duration in seconds to watch whether
	// the machine becomes healthy after the execution of the repair
	// command. Documented default: 0.
	WatchSeconds *int `json:"watch_seconds,omitempty"`
}

// Documented default values for optional repair settings.
const (
	// DefaultMaxConcurrentRepairs is the default maximum number of
	// machines to be repaired concurrently.
	DefaultMaxConcurrentRepairs = 1
	// DefaultRepairEvictionTimeoutSeconds is the default deadline for
	// eviction in seconds (10 minutes).
	DefaultRepairEvictionTimeoutSeconds = 600
	// DefaultRepairHealthCheckCommandTimeoutSeconds is the default
	// deadline for the health check command in seconds.
	DefaultRepairHealthCheckCommandTimeoutSeconds = 30
	// DefaultRepairCommandTimeoutSeconds is the default deadline for a
	// repair command in seconds.
	DefaultRepairCommandTimeoutSeconds = 30
)

// Options is a set of optional parameters for k8s components.
type Options struct {
Etcd EtcdParams `json:"etcd"`
Expand All @@ -307,6 +342,7 @@ type Cluster struct {
DNSServers []string `json:"dns_servers"`
DNSService string `json:"dns_service"`
Reboot Reboot `json:"reboot"`
Repair Repair `json:"repair"`
Options Options `json:"options"`
}

Expand Down
85 changes: 85 additions & 0 deletions cluster_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package cke

import (
"os"
"slices"
"testing"

"github.com/google/go-cmp/cmp"
Expand Down Expand Up @@ -131,6 +132,90 @@ func testClusterYAML(t *testing.T) {
if c.Reboot.ProtectedNamespaces.MatchLabels["app"] != "sample" {
t.Error(`c.Reboot.ProtectedNamespaces.MatchLabels["app"] != "sample"`)
}
if len(c.Repair.RepairProcedures) != 1 {
t.Fatal(`len(c.Repair.RepairProcedures) != 1`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].MachineTypes, []string{"Cray-1", "Cray-2"}) {
t.Error(`c.Repair.RepairProcedures[0].MachineTypes != {"Cray-1", "Cray-2"}`)
}
if len(c.Repair.RepairProcedures[0].RepairOperations) != 1 {
t.Fatal(`len(c.Repair.RepairProcedures[0].RepairOperations) != 1`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].Operation != "unreachable" {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].OperationName != "unreachable"`)
}
if len(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps) != 2 {
t.Fatal(`len(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps) != 2`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand, []string{"reset", "remotely"}) {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].RepairCommand != {"reset", "remotely"}`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds != 10 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandTimeoutSeconds != 10`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries != 1 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandRetries != 1`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval != 5 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].CommandInterval != 5`)
}
if !c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].NeedDrain {
t.Fatal(`!c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].NeedDrain`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds != 60 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].RepairSteps[0].WatchSeconds != 60`)
}
if !slices.Equal(c.Repair.RepairProcedures[0].RepairOperations[0].HealthCheckCommand, []string{"knock"}) {
t.Error(`c.Repair.RepairProcedures[0].RepairOperations[0].HealthCheckCommand != {"knock"}`)
}
if c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds == nil {
t.Fatal(`c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds == nil`)
}
if *c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds != 30 {
t.Error(`*c.Repair.RepairProcedures[0].RepairOperations[0].CommandTimeoutSeconds != 30`)
}
if c.Repair.MaxConcurrentRepairs == nil {
t.Fatal(`c.Repair.MaxConcurrentRepairs == nil`)
}
if *c.Repair.MaxConcurrentRepairs != 2 {
t.Error(`*c.Repair.MaxConcurrentRepairs != 2`)
}
if c.Repair.ProtectedNamespaces == nil {
t.Fatal(`c.Repair.ProtectedNamespaces == nil`)
}
if c.Repair.ProtectedNamespaces.MatchLabels["app"] != "protected" {
t.Error(`c.Repair.ProtectedNamespaces.MatchLabels["app"] != "protected"`)
}
if c.Repair.EvictRetries == nil {
t.Fatal(`c.Repair.EvictRetries == nil`)
}
if *c.Repair.EvictRetries != 3 {
t.Error(`*c.Repair.EvictRetries != 3`)
}
if c.Repair.EvictInterval == nil {
t.Fatal(`c.Repair.EvictInterval == nil`)
}
if *c.Repair.EvictInterval != 5 {
t.Error(`*c.Repair.EvictInterval != 5`)
}
if c.Repair.EvictionTimeoutSeconds == nil {
t.Fatal(`c.Repair.EvictionTimeoutSeconds == nil`)
}
if *c.Repair.EvictionTimeoutSeconds != 120 {
t.Error(`*c.Repair.EvictionTimeoutSeconds != 120`)
}
if c.Options.Etcd.VolumeName != "myetcd" {
t.Error(`c.Options.Etcd.VolumeName != "myetcd"`)
}
Expand Down
56 changes: 56 additions & 0 deletions docs/ckecli.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,15 @@ $ ckecli [--config FILE] <subcommand> args...
- [`ckecli reboot-queue cancel INDEX`](#ckecli-reboot-queue-cancel-index)
- [`ckecli reboot-queue cancel-all`](#ckecli-reboot-queue-cancel-all)
- [`ckecli reboot-queue reset-backoff`](#ckecli-reboot-queue-reset-backoff)
- [`ckecli repair-queue`](#ckecli-repair-queue)
- [`ckecli repair-queue enable|disable`](#ckecli-repair-queue-enabledisable)
- [`ckecli repair-queue is-enabled`](#ckecli-repair-queue-is-enabled)
- [`ckecli repair-queue add OPERATION MACHINE_TYPE ADDRESS`](#ckecli-repair-queue-add-operation-machine_type-address)
- [`ckecli repair-queue list`](#ckecli-repair-queue-list)
- [`ckecli repair-queue delete INDEX`](#ckecli-repair-queue-delete-index)
- [`ckecli repair-queue delete-finished`](#ckecli-repair-queue-delete-finished)
- [`ckecli repair-queue delete-unfinished`](#ckecli-repair-queue-delete-unfinished)
- [`ckecli repair-queue reset-backoff`](#ckecli-repair-queue-reset-backoff)
- [`ckecli sabakan`](#ckecli-sabakan)
- [`ckecli sabakan enable|disable`](#ckecli-sabakan-enabledisable)
- [`ckecli sabakan is-enabled`](#ckecli-sabakan-is-enabled)
Expand Down Expand Up @@ -311,6 +320,53 @@ Cancel all the reboot queue entries.
Reset `drain_backoff_count` and `drain_backoff_expire` of the entries in reboot queue.
Resetting these values makes CKE try to reboot nodes again immediately.

## `ckecli repair-queue`

Control a queue of repair requests.

### `ckecli repair-queue enable|disable`

Enable/Disable processing repair queue entries.

### `ckecli repair-queue is-enabled`

Show whether the repair queue is enabled or disabled.
This displays `true` or `false`.

### `ckecli repair-queue add OPERATION MACHINE_TYPE ADDRESS`

Append a repair request to the repair queue.
The repair target is a machine with an IP address `ADDRESS` and a machine type `MACHINE_TYPE`.
The machine should be processed with an operation `OPERATION`.

### `ckecli repair-queue list`

List the entries in the repair queue.

### `ckecli repair-queue delete INDEX`

Delete the specified repair queue entry.
The effect depends on the state of the specified entry: if the entry has finished, this clears up the old entry; otherwise, this cancels the ongoing entry.

Unlike the reboot queue, repair queue entries remain in the queue even after they finish.

### `ckecli repair-queue delete-finished`

Delete all finished repair queue entries.
Entries in `succeeded` or `failed` status are deleted.
This displays the index numbers of deleted entries, one per line.

### `ckecli repair-queue delete-unfinished`

Delete all unfinished repair queue entries.
Entries not in `succeeded` or `failed` status are deleted.
This displays the index numbers of deleted entries, one per line.

### `ckecli repair-queue reset-backoff`

Reset `drain_backoff_count` and `drain_backoff_expire` of the entries in the repair queue.
Resetting these values makes CKE try to drain machines again immediately.

## `ckecli sabakan`

Control [sabakan integration feature](sabakan-integration.md).
Expand Down
46 changes: 45 additions & 1 deletion docs/cluster.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ a YAML or JSON object with these fields:
- [Node](#node)
- [Taint](#taint)
- [Reboot](#reboot)
- [Repair](#repair)
- [RepairProcedure](#repairprocedure)
- [Options](#options)
- [ServiceParams](#serviceparams)
- [Mount](#mount)
Expand All @@ -27,6 +29,7 @@ a YAML or JSON object with these fields:
| `dns_servers` | false | array | List of upstream DNS server IP addresses. |
| `dns_service` | false | string | Upstream DNS service name with namespace as `namespace/service`. |
| `reboot` | false | `Reboot` | See [Reboot](#reboot). |
| `repair` | false | `Repair` | See [Repair](#repair). |
| `options` | false | `Options` | See [Options](#options). |

* `control_plane_tolerations` is used in [sabakan integration](sabakan-integration.md#strategy).
Expand Down Expand Up @@ -68,7 +71,7 @@ Reboot
------

| Name | Required | Type | Description |
|----------------------------| -------- | -------------------------------- |-------------------------------------------------------------------------|
| -------------------------- | -------- | -------------------------------- | ----------------------------------------------------------------------- |
| `reboot_command` | true | array | A command to reboot. List of strings. |
| `boot_check_command` | true | array | A command to check nodes booted. List of strings. |
| `eviction_timeout_seconds` | false | *int | Deadline for eviction. Must be positive. Default: 600 (10 minutes). |
Expand Down Expand Up @@ -98,6 +101,47 @@ The Pods in the non-protected namespaces are also tried to be deleted gracefully

If `protected_namespaces` is not given, all namespaces are protected.

Repair
------

| Name | Required | Type | Description |
| -------------------------- | -------- | -------------------------------- | --------------------------------------------------------------------- |
| `repair_procedures` | true | `[]RepairProcedure` | List of [repair procedures](#repairprocedure). |
| `max_concurrent_repairs` | false | \*int | Maximum number of machines to be repaired concurrently. Default: 1 |
| `protected_namespaces` | false | [`LabelSelector`][LabelSelector] | A label selector to protect namespaces. |
| `evict_retries` | false | \*int | Number of eviction retries, not including initial attempt. Default: 0 |
| `evict_interval` | false | \*int | Interval of time between eviction retries in seconds. Default: 0 |
| `eviction_timeout_seconds` | false | \*int | Deadline for eviction. Must be positive. Default: 600 (10 minutes) |

The repair configurations control the [repair functionality](repair.md).

### RepairProcedure

| Name | Required | Type | Description |
| ------------------- | -------- | ------------------- | ------------------------------------------------------------------------------------ |
| `machine_types` | true | array | Type names of the target machines to be repaired by this procedure. List of strings. |
| `repair_operations` | true | `[]RepairOperation` | List of [repair operations](#repairoperation). |

#### RepairOperation

| Name | Required | Type | Description |
| ------------------------- | -------- | -------------- | --------------------------------------------------------------- |
| `operation` | true | string | Name of repair operation. |
| `repair_steps` | true | `[]RepairStep` | Sequence of [repair steps](#repairstep). |
| `health_check_command` | true | array | A command to check repaired machine's health. List of strings. |
| `command_timeout_seconds` | false | \*int | Deadline for the health check command in seconds. Zero means infinity. Default: 30 |

##### RepairStep

| Name | Required | Type | Description |
| ------------------------- | -------- | ----- | -------------------------------------------------------------------------------------------------------------------------------- |
| `repair_command` | true | array | A command and its arguments to repair the target machine. List of strings. |
| `command_timeout_seconds` | false | \*int | Deadline for repairing. Zero means infinity. Default: 30 |
| `command_retries` | false | \*int | Number of repair retries, not including initial attempt. Default: 0 |
| `command_interval` | false | \*int | Interval of time between repair retries in seconds. Default: 0 |
| `need_drain` | false | bool | If true, perform drain of Pods on the target machine prior to the execution of the repair command. Default: false |
| `watch_seconds` | false | \*int | Follow-up duration in seconds to watch whether the machine becomes healthy after the execution of the repair command. Default: 0 |

Options
-------

Expand Down
Loading