diff --git a/README.md b/README.md index 4271a4c..8015166 100644 --- a/README.md +++ b/README.md @@ -48,7 +48,7 @@ To enable IPv6 support, set the `ipv6` flag (or set `IPV6` environment variable) ### Kubernetes Service Account -KubeIP requires a Kubernetes service account with the following permissions: +KubeIP requires a Kubernetes service account with at least the following permissions: ```yaml apiVersion: v1 @@ -129,6 +129,44 @@ spec: value: "true" ``` +### Node Taints + +KubeIP can be configured to attempt removal of a Taint Key from its node once the static IP has been successfully assigned, preventing workloads from being scheduled on the node until it has successfully received a static IP address. This can be useful, for example, in cases where the workload must call resources with IP-whitelisting, to prevent race conditions between KubeIP and the workload on newly provisioned nodes. + +To enable this feature, set the `taint-key` configuration parameter (See [How to run KubeIP](#how-to-run-kubeip)) to the taint key that should be removed. Then add a toleration to the KubeIP DaemonSet, so that it itself can be scheduled on the tainted nodes. For example, given that new nodes are created with a taint key of `kubeip.com/not-ready`: + +```diff +kind: DaemonSet +spec: + template: + spec: + serviceAccountName: kubeip-service-account ++ tolerations: ++ - key: kubeip.com/not-ready ++ operator: Exists ++ effect: NoSchedule + containers: + - name: kubeip + image: doitintl/kubeip-agent + env: ++ - name: TAINT_KEY ++ value: kubeip.com/not-ready +``` + +The parameter has no default value, and if not set, KubeIP will not attempt to remove any taints. If the provided Taint Key is not present on the node, KubeIP will simply log this fact and continue normally without attempting to remove it. If the Taint Key is present, but removing it fails for some reason, KubeIP will release the IP address back into the pool before restarting and trying again. + +Using this feature requires KubeIP to have permission to patch nodes. To use this feature, the `ClusterRole` resource rules need to be updated. **Note that if this configuration option is not set, KubeIP will not attempt to patch any nodes, and the change to the rules is not necessary.** + +Please keep in mind that this will give KubeIP permission to make updates to any node in your cluster, so please make sure that this aligns with your security requirements before enabling this feature! + +```diff +rules: + - apiGroups: [ "" ] + resources: [ "nodes" ] +- verbs: [ "get" ] ++ verbs: [ "get", "patch" ] +``` + ### AWS Make sure that KubeIP DaemonSet is deployed on nodes that have a public IP (node running in public subnet) and uses a Kubernetes service @@ -231,6 +269,7 @@ OPTIONS: --project value name of the GCP project or the AWS account ID (not needed if running in node) [$PROJECT] --region value name of the GCP region or the AWS region (not needed if running in node) [$REGION] --release-on-exit release the static public IP address on exit (default: true) [$RELEASE_ON_EXIT] + --taint-key value specify a taint key to remove from the node once the static public IP address is assigned [$TAINT_KEY] --retry-attempts value number of attempts to assign the static public IP address (default: 10) [$RETRY_ATTEMPTS] --retry-interval value when the agent fails to assign the static public IP address, it will retry after this interval (default: 5m0s) [$RETRY_INTERVAL] --lease-duration value duration of the kubernetes lease (default: 5) [$LEASE_DURATION] diff --git a/chart/templates/clusterrole.yaml b/chart/templates/clusterrole.yaml index a24da84..4f2123c 100644 --- a/chart/templates/clusterrole.yaml +++ b/chart/templates/clusterrole.yaml @@ -8,7 +8,11 @@ metadata: rules: - apiGroups: [ "" ] resources: [ "nodes" ] + {{- if .Values.rbac.allowNodesPatchPermission }} + verbs: [ "get", "patch" ] + {{- else }} verbs: [ "get" ] + {{- end }} - apiGroups: [ "coordination.k8s.io" ] resources: [ "leases" ] verbs: [ "create", "delete", "get" ] diff --git a/chart/templates/daemonset.yaml b/chart/templates/daemonset.yaml index 8d11e50..5ec4d3d 100644 --- a/chart/templates/daemonset.yaml +++ b/chart/templates/daemonset.yaml @@ -42,6 +42,8 @@ spec: fieldPath: spec.nodeName - name: FILTER value: {{ .Values.daemonSet.env.FILTER | quote }} + - name: TAINT_KEY + value: {{ .Values.daemonSet.env.TAINT_KEY | quote }} - name: LOG_LEVEL value: {{ .Values.daemonSet.env.LOG_LEVEL | quote }} - name: LOG_JSON diff --git a/chart/values.yaml b/chart/values.yaml index 08dca3c..9cdd362 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -25,6 +25,7 @@ serviceAccount: # Role-Based Access Control (RBAC) configuration. rbac: create: true + allowNodesPatchPermission: false # DaemonSet configuration. daemonSet: @@ -35,6 +36,7 @@ daemonSet: kubeip: use env: FILTER: labels.kubeip=reserved;labels.environment=demo + TAINT_KEY: "" LOG_LEVEL: debug LOG_JSON: true resources: diff --git a/cmd/main.go b/cmd/main.go index 3256a14..5405d44 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -174,6 +174,26 @@ func run(c context.Context, log *logrus.Entry, cfg *config.Config) error { return errors.Wrap(err, "assigning static public IP address") } + if cfg.TaintKey != "" { + logger := log.WithField("taint-key", cfg.TaintKey) + tainter := nd.NewTainter(clientset) + + didRemoveTaint, err := tainter.RemoveTaintKey(ctx, n, cfg.TaintKey) + if err != nil { + logger.Error("removing taint key failed, releasing static public IP address") + if releaseErr := releaseIP(assigner, n); releaseErr != nil { //nolint:contextcheck + log.WithError(releaseErr).Error("releasing static public IP address after taint key removal failed") + } + return errors.Wrap(err, "removing node taint key") + } + + if didRemoveTaint { + logger.Info("taint key removed successfully") + } else { + logger.Warning("taint key not present on node, skipped removal") + } + } + // pause the agent to prevent it from exiting immediately after assigning the static public IP address // wait for the context to be done: SIGTERM, SIGINT <-ctx.Done() @@ -303,6 +323,12 @@ func main() { Category: "Configuration", Value: true, }, + &cli.StringFlag{ + Name: "taint-key", + Usage: "specify a taint key to remove from the node once the static public IP address is assigned", + EnvVars: []string{"TAINT_KEY"}, + Category: "Configuration", + }, &cli.StringFlag{ Name: "log-level", Usage: "set log level (debug, info(*), warning, error, fatal, panic)", diff --git a/internal/config/config.go b/internal/config/config.go index 7ecf067..c4fb238 100644 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -33,6 +33,8 @@ type Config struct { LeaseDuration int `json:"lease-duration"` // LeaseNamespace is the namespace of the kubernetes lease LeaseNamespace string `json:"lease-namespace"` + // TaintKey is the taint key to remove from the node once the IP address is assigned + TaintKey string `json:"taint-key"` } func NewConfig(c *cli.Context) *Config { @@ -50,5 +52,6 @@ func NewConfig(c *cli.Context) *Config { cfg.ReleaseOnExit = c.Bool("release-on-exit") cfg.LeaseDuration = c.Int("lease-duration") cfg.LeaseNamespace = c.String("lease-namespace") + cfg.TaintKey = c.String("taint-key") return &cfg } diff --git a/internal/node/tainter.go b/internal/node/tainter.go new file mode 100644 index 0000000..f201a68 --- /dev/null +++ b/internal/node/tainter.go @@ -0,0 +1,73 @@ +package node + +import ( + "context" + "encoding/json" + "fmt" + + "github.com/doitintl/kubeip/internal/types" + "github.com/pkg/errors" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + typesv1 "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes" +) + +type Tainter interface { + RemoveTaintKey(ctx context.Context, node *types.Node, taintKey string) (bool, error) +} + +type tainter struct { + client kubernetes.Interface +} + +func deleteTaintsByKey(taints []v1.Taint, taintKey string) ([]v1.Taint, bool) { + newTaints := []v1.Taint{} + didDelete := false + + for i := range taints { + if taintKey == taints[i].Key { + didDelete = true + continue + } + newTaints = append(newTaints, taints[i]) + } + + return newTaints, didDelete +} + +func NewTainter(client kubernetes.Interface) Tainter { + return &tainter{ + client: client, + } +} + +func (t *tainter) RemoveTaintKey(ctx context.Context, node *types.Node, taintKey string) (bool, error) { + // get node object from API server + n, err := t.client.CoreV1().Nodes().Get(ctx, node.Name, metav1.GetOptions{}) + if err != nil { + return false, errors.Wrap(err, "failed to get kubernetes node") + } + + // Remove taint from the node representation + newTaints, didDelete := deleteTaintsByKey(n.Spec.Taints, taintKey) + if !didDelete { + return false, nil + } + + // Marshal the remaining taints of the node into json format for patching. + // The remaining taints may be empty, and that will result in an empty json array "[]" + newTaintsMarshaled, err := json.Marshal(newTaints) + if err != nil { + return false, errors.Wrap(err, "failed to marshal new taints") + } + + // Patch the node with only the remaining taints + patch := fmt.Sprintf(`{"spec":{"taints":%v}}`, string(newTaintsMarshaled)) + _, err = t.client.CoreV1().Nodes().Patch(ctx, node.Name, typesv1.MergePatchType, []byte(patch), metav1.PatchOptions{}) + if err != nil { + return false, errors.Wrap(err, "failed to patch node taints") + } + + return true, nil +} diff --git a/internal/node/tainter_test.go b/internal/node/tainter_test.go new file mode 100644 index 0000000..cca3656 --- /dev/null +++ b/internal/node/tainter_test.go @@ -0,0 +1,273 @@ +package node + +import ( + "context" + "reflect" + "testing" + + "github.com/doitintl/kubeip/internal/types" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes/fake" +) + +func Test_deleteTaintsByKey(t *testing.T) { + tests := []struct { + name string + taints []v1.Taint + taintKey string + want []v1.Taint + wantDidDelete bool + }{ + { + name: "taints contains taintKey", + taints: []v1.Taint{ + { + Key: "taint1", + Value: "one", + }, + { + Key: "taint2", + Value: "two", + }, + }, + taintKey: "taint2", + want: []v1.Taint{ + { + Key: "taint1", + Value: "one", + }, + }, + wantDidDelete: true, + }, + { + name: "taint does not contain taintKey", + taints: []v1.Taint{ + { + Key: "taint1", + Value: "one", + }, + }, + taintKey: "taint2", + want: []v1.Taint{ + { + Key: "taint1", + Value: "one", + }, + }, + wantDidDelete: false, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, gotDidDelete := deleteTaintsByKey(tt.taints, tt.taintKey) + + if !reflect.DeepEqual(got, tt.want) { + t.Errorf("deleteTaintsByKey() got = %v, want %v", got, tt.want) + } + + if gotDidDelete != tt.wantDidDelete { + t.Errorf("deleteTaintsByKey() gotDidDelete = %v, want %v", gotDidDelete, tt.wantDidDelete) + } + }) + } +} + +func Test_tainter_RemoveTaintKey(t *testing.T) { + type fields struct { + client *fake.Clientset + } + type args struct { + node *types.Node + taintKey string + } + + tests := []struct { + name string + fields *fields + args args + want bool + wantErr bool + validateNode func(t *testing.T, node *v1.Node) + }{ + { + name: "remove taint key", + fields: &fields{ + client: fake.NewSimpleClientset(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: "taint1", + Value: "true", + Effect: "NoSchedule", + }, + { + Key: "taint2", + Value: "two", + Effect: "NoSchedule", + }, + }, + }, + }), + }, + args: args{ + node: &types.Node{Name: "node1"}, + taintKey: "taint1", + }, + want: true, + wantErr: false, + validateNode: func(t *testing.T, node *v1.Node) { + if node.ObjectMeta.Name != "node1" { + t.Errorf("RemoveTaintKey() node.ObjectMeta.Name = %v, want node1", node.ObjectMeta.Name) + } + + if len(node.Spec.Taints) != 1 { + t.Errorf("RemoveTaintKey() node.Spec.Taints = %v, want 1", node.Spec.Taints) + } + + if node.Spec.Taints[0].Key != "taint2" { + t.Errorf("RemoveTaintKey() node.Spec.Taints[0].Key = %v, want taint2", node.Spec.Taints[0].Key) + } + }, + }, + { + name: "only one taint key on node", + fields: &fields{ + client: fake.NewSimpleClientset(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: "taint1", + Value: "true", + Effect: "NoSchedule", + }, + }, + }, + }), + }, + args: args{ + node: &types.Node{Name: "node1"}, + taintKey: "taint1", + }, + want: true, + wantErr: false, + validateNode: func(t *testing.T, node *v1.Node) { + if node.ObjectMeta.Name != "node1" { + t.Errorf("RemoveTaintKey() node.ObjectMeta.Name = %v, want node1", node.ObjectMeta.Name) + } + + if len(node.Spec.Taints) != 0 { + t.Errorf("RemoveTaintKey() node.Spec.Taints = %v, want 0", node.Spec.Taints) + } + }, + }, + { + name: "taint key not present on node", + fields: &fields{ + client: fake.NewSimpleClientset(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Spec: v1.NodeSpec{ + Taints: []v1.Taint{ + { + Key: "taint1", + Value: "true", + Effect: "NoSchedule", + }, + }, + }, + }), + }, + args: args{ + node: &types.Node{Name: "node1"}, + taintKey: "taint2", + }, + want: false, + wantErr: false, + validateNode: func(t *testing.T, node *v1.Node) { + if node.ObjectMeta.Name != "node1" { + t.Errorf("RemoveTaintKey() node.ObjectMeta.Name = %v, want node1", node.ObjectMeta.Name) + } + + if len(node.Spec.Taints) != 1 { + t.Errorf("RemoveTaintKey() node.Spec.Taints = %v, want 1", node.Spec.Taints) + } + + if node.Spec.Taints[0].Key != "taint1" { + t.Errorf("RemoveTaintKey() node.Spec.Taints[0].Key = %v, want taint1", node.Spec.Taints[0].Key) + } + }, + }, + { + name: "no taints on node", + fields: &fields{ + client: fake.NewSimpleClientset(&v1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "node1", + }, + Spec: v1.NodeSpec{}, + }), + }, + args: args{ + node: &types.Node{Name: "node1"}, + taintKey: "taint1", + }, + want: false, + wantErr: false, + validateNode: func(t *testing.T, node *v1.Node) { + if node.ObjectMeta.Name != "node1" { + t.Errorf("RemoveTaintKey() node.ObjectMeta.Name = %v, want node1", node.ObjectMeta.Name) + } + + if len(node.Spec.Taints) != 0 { + t.Errorf("RemoveTaintKey() node.Spec.Taints = %v, want 0", node.Spec.Taints) + } + }, + }, + { + name: "node not found", + fields: &fields{ + client: fake.NewSimpleClientset(), + }, + args: args{ + node: &types.Node{Name: "node1"}, + taintKey: "taint1", + }, + want: false, + wantErr: true, + validateNode: func(t *testing.T, node *v1.Node) { + // no node to validate + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + tainter := NewTainter(tt.fields.client) + got, err := tainter.RemoveTaintKey(ctx, tt.args.node, tt.args.taintKey) + + if (err != nil) != tt.wantErr { + t.Errorf("RemoveTaintKey() error = %v, wantErr %v", err, tt.wantErr) + } + if got != tt.want { + t.Errorf("RemoveTaintKey() got = %v, want %v", got, tt.want) + } + + if !tt.wantErr { + node, _ := tt.fields.client.CoreV1().Nodes().Get(ctx, tt.args.node.Name, metav1.GetOptions{}) + tt.validateNode(t, node) + } + }) + } +}