From 362ff2cd379acd3fa678c3b929dd524a92e5c519 Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Fri, 20 Sep 2024 15:24:32 +0300 Subject: [PATCH 1/9] Support different rolling upgrade strategies. --- .github/workflows/k3d-tests.yaml | 127 +++++++ Makefile | 111 +++++- api/v1/coherenceresource_types.go | 58 ++- api/v1/coherenceresourcespec_types.go | 42 +++ api/v1/zz_generated.deepcopy.go | 10 + config/manager/manager.yaml | 10 + controllers/coherence_controller.go | 17 +- controllers/coherencejob_controller.go | 16 +- controllers/job/job_controller.go | 5 +- controllers/reconciler/base_controller.go | 15 +- controllers/reconciler/simple_reconciler.go | 15 +- controllers/secret/secret_controller.go | 7 +- .../servicemonitor_controller.go | 12 +- .../statefulset/statefulset_controller.go | 35 +- controllers/statefulset/upgrade_strategy.go | 322 +++++++++++++++++ .../statefulset/upgrade_strategy_test.go | 91 +++++ controllers/webhook/webhook_controller.go | 4 +- docs/about/04_coherence_spec.adoc | 2 + docs/applications/030_deploy_application.adoc | 4 +- docs/applications/032_rolling_upgrade.adoc | 92 +++++ .../templates/deployment.yaml | 15 + helm-charts/coherence-operator/values.yaml | 5 + pkg/clients/clients.go | 11 +- pkg/nodes/nodes.go | 71 ++++ pkg/operator/operator.go | 5 + pkg/probe/probe.go | 45 ++- pkg/rest/rest.go | 100 +++--- pkg/runner/cmd_operator.go | 24 +- test/e2e/helper/e2e-helpers.go | 339 ++++-------------- test/e2e/helper/proj_helpers.go | 31 +- test/e2e/helper/test_context.go | 307 ++++++++++++++++ .../large-cluster/large_cluster_suite_test.go | 150 ++++++++ .../e2e/large-cluster/rolling_upgrade_test.go | 284 +++++++++++++++ 33 files changed, 1972 insertions(+), 410 deletions(-) create mode 100644 .github/workflows/k3d-tests.yaml create mode 100644 controllers/statefulset/upgrade_strategy.go create mode 100644 controllers/statefulset/upgrade_strategy_test.go create mode 100644 docs/applications/032_rolling_upgrade.adoc create mode 100644 pkg/nodes/nodes.go create mode 100644 test/e2e/helper/test_context.go create mode 100644 test/e2e/large-cluster/large_cluster_suite_test.go create mode 100644 test/e2e/large-cluster/rolling_upgrade_test.go diff --git a/.github/workflows/k3d-tests.yaml b/.github/workflows/k3d-tests.yaml new file mode 100644 index 000000000..123de91d0 --- /dev/null +++ b/.github/workflows/k3d-tests.yaml @@ -0,0 +1,127 @@ +# Copyright 2019, 2024, Oracle Corporation and/or its affiliates. All rights reserved. +# Licensed under the Universal Permissive License v 1.0 as shown at +# http://oss.oracle.com/licenses/upl. + +# --------------------------------------------------------------------------- +# Coherence Operator GitHub Actions K3d build. +# --------------------------------------------------------------------------- +name: K3d Tests + +on: + workflow_dispatch: + push: + branches-ignore: + - gh-pages + - 1.0.0 + - 2.x + - 3.x + pull_request: + types: + - opened + - synchronize + - committed + branches-ignore: + - gh-pages + - 1.0.0 + - 2.x + - 3.x + +env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + +jobs: + build: + runs-on: ubuntu-latest + +# Checkout the source, we need a depth of zero to fetch all the history otherwise +# the copyright check cannot work out the date of the files from Git. + steps: + - uses: actions/checkout@v4 + with: + fetch-depth: 0 + +# This step will free up disc space on the runner by removing +# lots of things that we do not need. 
+ - name: disc + shell: bash + run: | + echo "Listing 100 largest packages" + dpkg-query -Wf '${Installed-Size}\t${Package}\n' | sort -n | tail -n 100 + df -h + echo "Removing large packages" + sudo apt-get remove -y '^dotnet-.*' || true + sudo apt-get remove -y '^llvm-.*' || true + sudo apt-get remove -y 'monodoc-http' || true + sudo apt-get remove -y 'php.*' || true + sudo apt-get remove -y azure-cli google-cloud-sdk hhvm google-chrome-stable firefox powershell mono-devel || true + sudo apt-get autoremove -y || true + sudo apt-get clean + df -h + echo "Removing large directories" + rm -rf /usr/share/dotnet/ + sudo rm -rf /usr/local/lib/android + df -h + + - name: Set up JDK + uses: oracle-actions/setup-java@v1 + with: + website: oracle.com + release: 21 + + - name: Set up Go + uses: actions/setup-go@v5 + with: + go-version: 1.22.x + + - name: Cache Go Modules + uses: actions/cache@v4 + with: + path: ~/go/pkg/mod + key: ${{ runner.os }}-go-mods-${{ hashFiles('**/go.sum') }} + restore-keys: | + ${{ runner.os }}-go-mods- + + - name: Cache Maven packages + uses: actions/cache@v4 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} + restore-keys: ${{ runner.os }}-m2 + + - name: Cache Tools + uses: actions/cache@v4 + with: + path: build/tools + key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} + restore-keys: ${{ runner.os }}-build-tools + + - name: Edit DNS Resolve + shell: bash + run: | + sudo chown -R runner:runner /run/systemd/resolve/stub-resolv.conf + sudo echo nameserver 8.8.8.8 > /run/systemd/resolve/stub-resolv.conf + + - name: Start K3d Cluster + shell: bash + run: | + echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u $ --password-stdin + make k3d + kubectl version + kubectl get nodes + docker pull gcr.io/distroless/java + docker pull gcr.io/distroless/java11-debian11 + docker pull gcr.io/distroless/java17-debian11 + + - name: K3d Tests + shell: bash + timeout-minutes: 60 + run: | + make build-operator + make kind-load + make e2e-prometheus-test + + - uses: actions/upload-artifact@v4 + if: failure() + with: + name: test-output + path: build/_output/test-logs diff --git a/Makefile b/Makefile index b6dcd7c17..591cea62d 100644 --- a/Makefile +++ b/Makefile @@ -99,7 +99,9 @@ MAVEN_BUILD_OPTS :=$(USE_MAVEN_SETTINGS) -Drevision=$(MVN_VERSION) -Dcoherence.v # ---------------------------------------------------------------------------------------------------------------------- # Operator image names # ---------------------------------------------------------------------------------------------------------------------- -OPERATOR_IMAGE_REGISTRY ?= ghcr.io/oracle +BASE_IMAGE_REGISTRY ?= ghcr.io +BASE_IMAGE_REPO ?= oracle +OPERATOR_IMAGE_REGISTRY ?= $(BASE_IMAGE_REGISTRY)/$(BASE_IMAGE_REPO) RELEASE_IMAGE_PREFIX ?= $(OPERATOR_IMAGE_REGISTRY)/ OPERATOR_IMAGE_NAME := coherence-operator OPERATOR_BASE_IMAGE ?= scratch @@ -892,6 +894,28 @@ test-mvn: $(BUILD_OUTPUT)/certs $(BUILD_TARGETS)/java ## Run the Java artefact .PHONY: test-all test-all: test-mvn test-operator ## Run all unit tests +ENVTEST_K8S_VERSION = 1.31.0 +ENVTEST_VERSION ?= release-0.19 + +.PHONY: envtest +envtest: $(TOOLS_BIN)/setup-envtest ## Download setup-envtest locally if necessary. 
+ +envtest-delete: + $(TOOLS_BIN)/setup-envtest --bin-dir $(TOOLS_BIN) cleanup latest-on-disk + rm -rf $(TOOLS_BIN)/k8s || true + +$(TOOLS_BIN)/setup-envtest: + test -s $(TOOLS_BIN)/setup-envtest || GOBIN=$(TOOLS_BIN) go install sigs.k8s.io/controller-runtime/tools/setup-envtest@$(ENVTEST_VERSION) + ls -al $(TOOLS_BIN) + +k8stools: $(TOOLS_BIN)/k8s + +$(TOOLS_BIN)/k8s: $(TOOLS_BIN)/setup-envtest + mkdir -p $(TOOLS_BIN)/k8s || true + $(TOOLS_BIN)/setup-envtest --bin-dir $(TOOLS_BIN) use $(ENVTEST_K8S_VERSION) + + + # ---------------------------------------------------------------------------------------------------------------------- # Executes the Go end-to-end tests that require a k8s cluster using # a LOCAL operator instance (i.e. the operator is not deployed to k8s). @@ -968,6 +992,39 @@ run-e2e-test: gotestsum ## Run the Operator 'remote' end-to-end functional test $(GOTESTSUM) --format standard-verbose --junitfile $(TEST_LOGS_DIR)/operator-e2e-test.xml \ -- $(GO_TEST_FLAGS_E2E) ./test/e2e/remote/... +# ---------------------------------------------------------------------------------------------------------------------- +# Executes the Go end-to-end tests that require a K3d cluster using +# a LOCAL operator instance (i.e. the operator is not deployed to k8s). +# These tests will use whichever K3d cluster the local environment +# is pointing to. +# ---------------------------------------------------------------------------------------------------------------------- +.PHONY: e2e-k3d-test +e2e-k3d-test: export CGO_ENABLED = 0 +e2e-k3d-test: export OPERATOR_NAMESPACE := $(OPERATOR_NAMESPACE) +e2e-k3d-test: export CLUSTER_NAMESPACE := $(CLUSTER_NAMESPACE) +e2e-k3d-test: export OPERATOR_NAMESPACE_CLIENT := $(OPERATOR_NAMESPACE_CLIENT) +e2e-k3d-test: export BUILD_OUTPUT := $(BUILD_OUTPUT) +e2e-k3d-test: export TEST_APPLICATION_IMAGE := $(TEST_APPLICATION_IMAGE) +e2e-k3d-test: export TEST_APPLICATION_IMAGE_CLIENT := $(TEST_APPLICATION_IMAGE_CLIENT) +e2e-k3d-test: export TEST_APPLICATION_IMAGE_HELIDON := $(TEST_APPLICATION_IMAGE_HELIDON) +e2e-k3d-test: export TEST_APPLICATION_IMAGE_SPRING := $(TEST_APPLICATION_IMAGE_SPRING) +e2e-k3d-test: export TEST_APPLICATION_IMAGE_SPRING_FAT := $(TEST_APPLICATION_IMAGE_SPRING_FAT) +e2e-k3d-test: export TEST_APPLICATION_IMAGE_SPRING_CNBP := $(TEST_APPLICATION_IMAGE_SPRING_CNBP) +e2e-k3d-test: export TEST_COHERENCE_IMAGE := $(TEST_COHERENCE_IMAGE) +e2e-k3d-test: export IMAGE_PULL_SECRETS := $(IMAGE_PULL_SECRETS) +e2e-k3d-test: export COH_SKIP_SITE := true +e2e-k3d-test: export TEST_IMAGE_PULL_POLICY := $(IMAGE_PULL_POLICY) +e2e-k3d-test: export TEST_STORAGE_CLASS := $(TEST_STORAGE_CLASS) +e2e-k3d-test: export GO_TEST_FLAGS_E2E := $(strip $(GO_TEST_FLAGS_E2E)) +e2e-k3d-test: export TEST_ASSET_KUBECTL := $(TEST_ASSET_KUBECTL) +e2e-k3d-test: export LOCAL_STORAGE_RESTART := $(LOCAL_STORAGE_RESTART) +e2e-k3d-test: export VERSION := $(VERSION) +e2e-k3d-test: export MVN_VERSION := $(MVN_VERSION) +e2e-k3d-test: export OPERATOR_IMAGE := $(OPERATOR_IMAGE) +e2e-k3d-test: export COHERENCE_IMAGE := $(COHERENCE_IMAGE) +e2e-k3d-test: reset-namespace create-ssl-secrets install-crds gotestsum undeploy ## Run the Operator end-to-end 'local' functional tests using a local Operator instance + $(GOTESTSUM) --format standard-verbose --junitfile $(TEST_LOGS_DIR)/operator-e2e-k3d-test.xml \ + -- $(GO_TEST_FLAGS_E2E) ./test/e2e/large-cluster/... 
# ---------------------------------------------------------------------------------------------------------------------- # Run the end-to-end Coherence client tests. @@ -1334,8 +1391,7 @@ endif .PHONY: just-deploy just-deploy: ## Deploy the Coherence Operator without rebuilding anything - $(call prepare_deploy,$(OPERATOR_IMAGE),$(OPERATOR_NAMESPACE)) - $(KUSTOMIZE) build $(BUILD_DEPLOY)/default | kubectl apply -f - + $(call do_deploy,$(OPERATOR_IMAGE),$(OPERATOR_NAMESPACE)) .PHONY: prepare-deploy prepare-deploy: $(BUILD_TARGETS)/manifests $(BUILD_TARGETS)/build-operator $(TOOLS_BIN)/kustomize @@ -1399,6 +1455,12 @@ define prepare_deploy cd $(BUILD_DEPLOY)/default && $(KUSTOMIZE) edit set namespace $(2) endef +define do_deploy + $(call prepare_deploy,$(1),$(2)) + $(KUSTOMIZE) build $(BUILD_DEPLOY)/default | kubectl apply -f - +endef + + # ---------------------------------------------------------------------------------------------------------------------- # Un-deploy controller from the configured Kubernetes cluster in ~/.kube/config # ---------------------------------------------------------------------------------------------------------------------- @@ -1642,6 +1704,49 @@ kind-load-operator: ## Load the Operator images into the KinD cluster kind-load-compatibility: ## Load the compatibility test images into the KinD cluster kind load docker-image --name $(KIND_CLUSTER) $(TEST_COMPATIBILITY_IMAGE) || true +# ====================================================================================================================== +# Targets related to running k3d clusters +# ====================================================================================================================== +##@ K3d + +K3D_CLUSTER ?= operator +K3D_REGISTRY ?= myregistry +K3D_REGISTRY_PORT ?= 12345 +K3D_INTERNAL_REGISTRY := k3d-$(K3D_REGISTRY).localhost:$(K3D_REGISTRY_PORT) + +.PHONY: k3d +k3d: $(TOOLS_BIN)/k3d k3d-create k3d-load-operator create-namespace ## Run a default k3d cluster + +.PHONY: k3d-create +k3d-create: $(TOOLS_BIN)/k3d ## Create the k3d cluster + $(TOOLS_BIN)/k3d registry create myregistry.localhost --port 12345 + $(TOOLS_BIN)/k3d cluster create $(K3D_CLUSTER) --agents 5 \ + --registry-use $(K3D_INTERNAL_REGISTRY) --no-lb \ + --runtime-ulimit "nofile=64000:64000" --runtime-ulimit "nproc=64000:64000" \ + --api-port 127.0.0.1:6550 + +.PHONY: k3d-stop +k3d-stop: $(TOOLS_BIN)/k3d ## Stop a default k3d cluster + $(TOOLS_BIN)/k3d cluster delete $(K3D_CLUSTER) + $(TOOLS_BIN)/k3d registry delete myregistry.localhost + +.PHONY: k3d-load-operator +k3d-load-operator: $(TOOLS_BIN)/k3d ## Load the Operator images into the k3d cluster + k3d image import $(OPERATOR_IMAGE) -c $(K3D_CLUSTER) + +.PHONY: k3d-load-coherence +k3d-load-coherence: $(TOOLS_BIN)/k3d ## Load the Coherence images into the k3d cluster + k3d image import $(COHERENCE_IMAGE) -c $(K3D_CLUSTER) + +.PHONY: k3d-load-all +k3d-load-all: $(TOOLS_BIN)/k3d k3d-load-operator k3d-load-coherence ## Load all the test images into the k3d cluster + + +k3d-get: $(TOOLS_BIN)/k3d ## Install k3d + +$(TOOLS_BIN)/k3d: + export K3D_INSTALL_DIR=$(TOOLS_BIN) && export USE_SUDO=false && curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash + # ====================================================================================================================== # Targets related to running Minikube # ====================================================================================================================== diff --git 
a/api/v1/coherenceresource_types.go b/api/v1/coherenceresource_types.go
index e4b024e3d..bca656445 100644
--- a/api/v1/coherenceresource_types.go
+++ b/api/v1/coherenceresource_types.go
@@ -510,8 +510,44 @@ type CoherenceStatefulSetResourceSpec struct {
 	// The Coherence operator does not apply any default resources.
 	// +optional
 	InitResources *corev1.ResourceRequirements `json:"initResources,omitempty"`
+	// The rolling upgrade strategy to use.
+	// If present, the value must be one of "Pod", "Node", "NodeLabel" or "Manual".
+	// If not set, the default is "Pod".
+	// Pod will perform a rolling upgrade one Pod at a time.
+	// Node will update all Pods on a Node at the same time.
+	// NodeLabel will update all Pods on Nodes with the same value for the label
+	// specified in RollingUpdateLabel at the same time.
+	// Manual will not automatically apply any updates; Pods must be manually
+	// deleted for updates to be applied to the restarted Pod.
+	// +optional
+	RollingUpdateStrategy *RollingUpdateStrategyType `json:"rollingUpdateStrategy,omitempty"`
+	// The name of the Node label to use to group Pods during a rolling upgrade.
+	// This field only applies if RollingUpdateStrategy is set to NodeLabel.
+	// If RollingUpdateStrategy is set to NodeLabel and this field is omitted then the
+	// rolling upgrade will be by Node. It is the user's responsibility to ensure that
+	// Nodes actually have the label used for this field. The label should be
+	// one of the node labels used to set the Coherence site or rack value.
+	// +optional
+	RollingUpdateLabel *string `json:"rollingUpdateLabel,omitempty"`
 }
 
+// RollingUpdateStrategyType is a string enumeration type that enumerates
+// all possible rolling update strategies.
+// +enum
+type RollingUpdateStrategyType string
+
+const (
+	// UpgradeByPod indicates that updates will be applied to all Pods in the StatefulSet
+	// with respect to the StatefulSet ordering constraints one Pod at a time.
+	// This is the default behaviour for a StatefulSet rolling upgrade.
+	UpgradeByPod RollingUpdateStrategyType = "Pod"
+	// UpgradeByNode indicates that updates will be applied to all Pods on a Node at the same time.
+	UpgradeByNode RollingUpdateStrategyType = "Node"
+	// UpgradeByNodeLabel indicates that updates will be applied to all Pods on Nodes with the same label value at the same time.
+	UpgradeByNodeLabel RollingUpdateStrategyType = "NodeLabel"
+	// UpgradeManual is equivalent to using "OnDelete" as a StatefulSet upgrade strategy.
+	// Updates are applied to Pods by the StatefulSet controller after they are manually killed.
+	UpgradeManual RollingUpdateStrategyType = "Manual"
+)
+
 // CreateStatefulSetResource creates the deployment's StatefulSet resource.
func (in *CoherenceStatefulSetResourceSpec) CreateStatefulSetResource(deployment *Coherence) Resource { sts := in.CreateStatefulSet(deployment) @@ -537,13 +573,33 @@ func (in *CoherenceStatefulSetResourceSpec) CreateStatefulSet(deployment *Cohere replicas := in.GetReplicas() podTemplate := in.CreatePodTemplateSpec(deployment) + // Work out the StatefulSet rolling upgrade strategy based on the + // value of the Coherence spec RollingUpdateStrategy field + var strategy appsv1.StatefulSetUpdateStrategyType + if in.RollingUpdateStrategy == nil { + // Nothing set, so default to a normal StatefulSet rolling upgrade one Pod at a time + strategy = appsv1.RollingUpdateStatefulSetStrategyType + } else { + // A strategy has been set in the Coherence spec + rollStrategy := *in.RollingUpdateStrategy + if rollStrategy == UpgradeByPod { + // UpgradeByPod is the same as the default StatefulSet strategy + strategy = appsv1.RollingUpdateStatefulSetStrategyType + } else { + // One of our custom strategies has been chosen, so we set the + // StatefulSet strategy to OnDelete as the Operator will control + // the rolling update + strategy = appsv1.OnDeleteStatefulSetStrategyType + } + } + // Add the component label sts.Labels[LabelComponent] = LabelComponentCoherenceStatefulSet sts.Spec = appsv1.StatefulSetSpec{ Replicas: &replicas, PodManagementPolicy: appsv1.ParallelPodManagement, UpdateStrategy: appsv1.StatefulSetUpdateStrategy{ - Type: appsv1.RollingUpdateStatefulSetStrategyType, + Type: strategy, }, RevisionHistoryLimit: ptr.To(int32(5)), ServiceName: deployment.GetHeadlessServiceName(), diff --git a/api/v1/coherenceresourcespec_types.go b/api/v1/coherenceresourcespec_types.go index 75e4f0c30..502f2a80c 100644 --- a/api/v1/coherenceresourcespec_types.go +++ b/api/v1/coherenceresourcespec_types.go @@ -1069,6 +1069,48 @@ func (in *CoherenceResourceSpec) CreateOperatorInitContainer(deployment Coherenc return c } +// EnsureTopologySpreadConstraints creates the Pod TopologySpreadConstraint array, either from that configured +// for the cluster or the default constraints. +func (in *CoherenceResourceSpec) EnsureTopologySpreadConstraints(deployment CoherenceResource) []corev1.TopologySpreadConstraint { + if in == nil || in.TopologySpreadConstraints == nil || len(in.TopologySpreadConstraints) == 0 { + return in.CreateDefaultTopologySpreadConstraints(deployment) + } + return in.TopologySpreadConstraints +} + +// CreateDefaultTopologySpreadConstraints creates the default Pod TopologySpreadConstraint array to use in a deployment's StatefulSet. +func (in *CoherenceResourceSpec) CreateDefaultTopologySpreadConstraints(deployment CoherenceResource) []corev1.TopologySpreadConstraint { + selector := metav1.LabelSelector{ + MatchExpressions: []metav1.LabelSelectorRequirement{ + { + Key: LabelCoherenceCluster, + Operator: metav1.LabelSelectorOpIn, + Values: []string{deployment.GetCoherenceClusterName()}, + }, + { + Key: LabelCoherenceDeployment, + Operator: metav1.LabelSelectorOpIn, + Values: []string{deployment.GetName()}, + }, + }, + } + + return []corev1.TopologySpreadConstraint{ + { + MaxSkew: 1, + TopologyKey: AffinityTopologyKey, + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &selector, + }, + { + MaxSkew: 1, + TopologyKey: operator.LabelHostName, + WhenUnsatisfiable: corev1.ScheduleAnyway, + LabelSelector: &selector, + }, + } +} + // EnsurePodAffinity creates the Pod Affinity either from that configured for the cluster or the default affinity. 
func (in *CoherenceResourceSpec) EnsurePodAffinity(deployment CoherenceResource) *corev1.Affinity { if in != nil && in.Affinity != nil { diff --git a/api/v1/zz_generated.deepcopy.go b/api/v1/zz_generated.deepcopy.go index 5a74af029..ddf8ab6ac 100644 --- a/api/v1/zz_generated.deepcopy.go +++ b/api/v1/zz_generated.deepcopy.go @@ -907,6 +907,16 @@ func (in *CoherenceStatefulSetResourceSpec) DeepCopyInto(out *CoherenceStatefulS *out = new(corev1.ResourceRequirements) (*in).DeepCopyInto(*out) } + if in.RollingUpdateStrategy != nil { + in, out := &in.RollingUpdateStrategy, &out.RollingUpdateStrategy + *out = new(RollingUpdateStrategyType) + **out = **in + } + if in.RollingUpdateLabel != nil { + in, out := &in.RollingUpdateLabel, &out.RollingUpdateLabel + *out = new(string) + **out = **in + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CoherenceStatefulSetResourceSpec. diff --git a/config/manager/manager.yaml b/config/manager/manager.yaml index 702c69b52..c0828e43e 100644 --- a/config/manager/manager.yaml +++ b/config/manager/manager.yaml @@ -97,6 +97,16 @@ spec: defaultMode: 420 secretName: coherence-webhook-server-cert optional: true + topologySpreadConstraints: + - maxSkew: 1 + topologyKey: topology.kubernetes.io/zone + whenUnsatisfiable: ScheduleAnyway + labelSelector: + matchLabels: + control-plane: coherence + app.kubernetes.io/name: coherence-operator + app.kubernetes.io/instance: coherence-operator-manager + app.kubernetes.io/version: "3.4.0" affinity: podAntiAffinity: preferredDuringSchedulingIgnoredDuringExecution: diff --git a/controllers/coherence_controller.go b/controllers/coherence_controller.go index e4b1351b4..5275b0c6b 100644 --- a/controllers/coherence_controller.go +++ b/controllers/coherence_controller.go @@ -16,6 +16,7 @@ import ( "github.com/oracle/coherence-operator/controllers/secret" "github.com/oracle/coherence-operator/controllers/servicemonitor" "github.com/oracle/coherence-operator/controllers/statefulset" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/operator" "github.com/oracle/coherence-operator/pkg/probe" "github.com/oracle/coherence-operator/pkg/rest" @@ -54,6 +55,7 @@ const ( type CoherenceReconciler struct { client.Client reconciler.CommonReconciler + ClientSet clients.ClientSet Log logr.Logger Scheme *runtime.Scheme reconcilers []reconciler.SecondaryResourceReconciler @@ -267,7 +269,6 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque // process the secondary resources in the order they should be created var failures []Failure for _, rec := range in.reconcilers { - log.Info("Reconciling Coherence resource secondary resources", "controller", rec.GetControllerName()) r, err := rec.ReconcileAllResourceOfKind(ctx, request, deployment, storage) if err != nil { failures = append(failures, Failure{Name: rec.GetControllerName(), Error: err}) @@ -302,22 +303,22 @@ func (in *CoherenceReconciler) Reconcile(ctx context.Context, request ctrl.Reque return result, nil } -func (in *CoherenceReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (in *CoherenceReconciler) SetupWithManager(mgr ctrl.Manager, cs clients.ClientSet) error { SetupMonitoringResources(mgr) // Create the sub-resource reconcilers IN THE ORDER THAT RESOURCES MUST BE CREATED. // This is important to ensure, for example, that a ConfigMap is created before the // StatefulSet that uses it. 
reconcilers := []reconciler.SecondaryResourceReconciler{ - reconciler.NewConfigMapReconciler(mgr), - secret.NewSecretReconciler(mgr), - reconciler.NewServiceReconciler(mgr), - servicemonitor.NewServiceMonitorReconciler(mgr), - statefulset.NewStatefulSetReconciler(mgr), + reconciler.NewConfigMapReconciler(mgr, cs), + secret.NewSecretReconciler(mgr, cs), + reconciler.NewServiceReconciler(mgr, cs), + servicemonitor.NewServiceMonitorReconciler(mgr, cs), + statefulset.NewStatefulSetReconciler(mgr, cs), } in.reconcilers = reconcilers - in.SetCommonReconciler(controllerName, mgr) + in.SetCommonReconciler(controllerName, mgr, cs) in.SetPatchType(types.MergePatchType) template := &coh.Coherence{} diff --git a/controllers/coherencejob_controller.go b/controllers/coherencejob_controller.go index af4f1978b..0b8b35f56 100644 --- a/controllers/coherencejob_controller.go +++ b/controllers/coherencejob_controller.go @@ -14,6 +14,7 @@ import ( "github.com/oracle/coherence-operator/controllers/reconciler" "github.com/oracle/coherence-operator/controllers/secret" "github.com/oracle/coherence-operator/controllers/servicemonitor" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/operator" "github.com/oracle/coherence-operator/pkg/rest" "github.com/oracle/coherence-operator/pkg/utils" @@ -36,6 +37,7 @@ import ( type CoherenceJobReconciler struct { client.Client reconciler.CommonReconciler + ClientSet clients.ClientSet Log logr.Logger Scheme *runtime.Scheme reconcilers []reconciler.SecondaryResourceReconciler @@ -233,22 +235,22 @@ func (in *CoherenceJobReconciler) ReconcileDeployment(ctx context.Context, reque return result, nil } -func (in *CoherenceJobReconciler) SetupWithManager(mgr ctrl.Manager) error { +func (in *CoherenceJobReconciler) SetupWithManager(mgr ctrl.Manager, cs clients.ClientSet) error { SetupMonitoringResources(mgr) // Create the sub-resource reconcilers IN THE ORDER THAT RESOURCES MUST BE CREATED. // This is important to ensure, for example, that a ConfigMap is created before the // StatefulSet that uses it. reconcilers := []reconciler.SecondaryResourceReconciler{ - reconciler.NewConfigMapReconciler(mgr), - secret.NewSecretReconciler(mgr), - reconciler.NewServiceReconciler(mgr), - servicemonitor.NewServiceMonitorReconciler(mgr), - job.NewJobReconciler(mgr), + reconciler.NewConfigMapReconciler(mgr, cs), + secret.NewSecretReconciler(mgr, cs), + reconciler.NewServiceReconciler(mgr, cs), + servicemonitor.NewServiceMonitorReconciler(mgr, cs), + job.NewJobReconciler(mgr, cs), } in.reconcilers = reconcilers - in.SetCommonReconciler(jobControllerName, mgr) + in.SetCommonReconciler(jobControllerName, mgr, cs) in.SetPatchType(types.MergePatchType) template := &coh.CoherenceJob{} diff --git a/controllers/job/job_controller.go b/controllers/job/job_controller.go index 142ab6805..95ad6937e 100644 --- a/controllers/job/job_controller.go +++ b/controllers/job/job_controller.go @@ -12,6 +12,7 @@ import ( "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/controllers/reconciler" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/probe" "github.com/oracle/coherence-operator/pkg/utils" "github.com/pkg/errors" @@ -39,7 +40,7 @@ const ( var _ reconcile.Reconciler = &ReconcileJob{} // NewJobReconciler returns a new Job reconciler. 
-func NewJobReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconciler { +func NewJobReconciler(mgr manager.Manager, cs clients.ClientSet) reconciler.SecondaryResourceReconciler { r := &ReconcileJob{ ReconcileSecondaryResource: reconciler.ReconcileSecondaryResource{ @@ -48,7 +49,7 @@ func NewJobReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconcile }, } - r.SetCommonReconciler(controllerName, mgr) + r.SetCommonReconciler(controllerName, mgr, cs) return r } diff --git a/controllers/reconciler/base_controller.go b/controllers/reconciler/base_controller.go index 1d9663de2..c643af143 100644 --- a/controllers/reconciler/base_controller.go +++ b/controllers/reconciler/base_controller.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -12,6 +12,7 @@ import ( "fmt" "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/utils" "github.com/pkg/errors" appsv1 "k8s.io/api/apps/v1" @@ -62,6 +63,7 @@ type BaseReconciler interface { GetControllerName() string GetManager() manager.Manager GetClient() client.Client + GetClientSet() clients.ClientSet GetEventRecorder() record.EventRecorder GetLog() logr.Logger GetReconciler() reconcile.Reconciler @@ -72,6 +74,7 @@ type BaseReconciler interface { type CommonReconciler struct { name string mgr manager.Manager + clientSet clients.ClientSet locks map[types.NamespacedName]bool mutex *sync.Mutex logger logr.Logger @@ -81,6 +84,7 @@ type CommonReconciler struct { func (in *CommonReconciler) GetControllerName() string { return in.name } func (in *CommonReconciler) GetManager() manager.Manager { return in.mgr } func (in *CommonReconciler) GetClient() client.Client { return in.mgr.GetClient() } +func (in *CommonReconciler) GetClientSet() clients.ClientSet { return in.clientSet } func (in *CommonReconciler) GetMutex() *sync.Mutex { return in.mutex } func (in *CommonReconciler) GetPatchType() types.PatchType { return in.patchType } func (in *CommonReconciler) SetPatchType(pt types.PatchType) { in.patchType = pt } @@ -91,9 +95,10 @@ func (in *CommonReconciler) GetLog() logr.Logger { return in.logger } -func (in *CommonReconciler) SetCommonReconciler(name string, mgr manager.Manager) { +func (in *CommonReconciler) SetCommonReconciler(name string, mgr manager.Manager, cs clients.ClientSet) { in.name = name in.mgr = mgr + in.clientSet = cs in.mutex = commonMutex in.logger = logf.Log.WithName(name) in.patchType = types.StrategicMergePatchType @@ -745,7 +750,6 @@ func (in *ReconcileSecondaryResource) CanWatch() bool { return !in.S // ReconcileAllResourceOfKind reconciles the state of all the desired resources of the specified Kind for the reconciler func (in *ReconcileSecondaryResource) ReconcileAllResourceOfKind(ctx context.Context, request reconcile.Request, deployment coh.CoherenceResource, storage utils.Storage) (reconcile.Result, error) { logger := in.GetLog().WithValues("Namespace", request.Namespace, "Name", request.Name, "Kind", in.Kind.Name()) - logger.Info(fmt.Sprintf("Reconciling all %v", in.Kind)) var err error resources := storage.GetLatest().GetResourcesOfKind(in.Kind) @@ -762,7 +766,6 @@ func (in *ReconcileSecondaryResource) ReconcileAllResourceOfKind(ctx context.Con } } } - logger.Info(fmt.Sprintf("Finished reconciling 
all %v", in.Kind)) return reconcile.Result{}, nil } @@ -776,7 +779,7 @@ func (in *ReconcileSecondaryResource) HashLabelsMatch(o metav1.Object, storage u // ReconcileSingleResource reconciles a specific resource. func (in *ReconcileSecondaryResource) ReconcileSingleResource(ctx context.Context, namespace, name string, owner coh.CoherenceResource, storage utils.Storage, logger logr.Logger) error { logger = logger.WithValues("Resource", name) - logger.Info(fmt.Sprintf("Reconciling single %v", in.Kind)) + logger.Info(fmt.Sprintf("Reconciling %v", in.Kind)) // Fetch the resource's current state resource, exists, err := in.FindResource(ctx, namespace, name) @@ -801,7 +804,7 @@ func (in *ReconcileSecondaryResource) ReconcileSingleResource(ctx context.Contex if owner != nil && in.Kind.Name() == coh.ResourceTypeSecret.Name() && name == owner.GetName() { // this a reconcile event for the storage secret, we can ignore it - logger.Info(fmt.Sprintf("Finished reconciling single %v", in.Kind)) + logger.Info(fmt.Sprintf("Finished reconciling %v", in.Kind)) return nil } diff --git a/controllers/reconciler/simple_reconciler.go b/controllers/reconciler/simple_reconciler.go index 340ed13d6..0b631249b 100644 --- a/controllers/reconciler/simple_reconciler.go +++ b/controllers/reconciler/simple_reconciler.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -9,6 +9,7 @@ package reconciler import ( "context" coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/clients" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/manager" @@ -19,16 +20,16 @@ import ( // If the reconcile.Reconciler API was to change then we'd get a compile error here. var _ reconcile.Reconciler = &SimpleReconciler{} -func NewConfigMapReconciler(mgr manager.Manager) SecondaryResourceReconciler { - return NewSimpleReconciler(mgr, "controllers.ConfigMap", coh.ResourceTypeConfigMap, &corev1.ConfigMap{}) +func NewConfigMapReconciler(mgr manager.Manager, cs clients.ClientSet) SecondaryResourceReconciler { + return NewSimpleReconciler(mgr, cs, "controllers.ConfigMap", coh.ResourceTypeConfigMap, &corev1.ConfigMap{}) } -func NewServiceReconciler(mgr manager.Manager) SecondaryResourceReconciler { - return NewSimpleReconciler(mgr, "controllers.Service", coh.ResourceTypeService, &corev1.Service{}) +func NewServiceReconciler(mgr manager.Manager, cs clients.ClientSet) SecondaryResourceReconciler { + return NewSimpleReconciler(mgr, cs, "controllers.Service", coh.ResourceTypeService, &corev1.Service{}) } // NewSimpleReconciler returns a new SimpleReconciler. 
-func NewSimpleReconciler(mgr manager.Manager, name string, kind coh.ResourceType, template client.Object) SecondaryResourceReconciler { +func NewSimpleReconciler(mgr manager.Manager, cs clients.ClientSet, name string, kind coh.ResourceType, template client.Object) SecondaryResourceReconciler { r := &SimpleReconciler{ ReconcileSecondaryResource: ReconcileSecondaryResource{ Kind: kind, @@ -36,7 +37,7 @@ func NewSimpleReconciler(mgr manager.Manager, name string, kind coh.ResourceType }, } - r.SetCommonReconciler(name, mgr) + r.SetCommonReconciler(name, mgr, cs) return r } diff --git a/controllers/secret/secret_controller.go b/controllers/secret/secret_controller.go index 00b2e4bbb..aa371eea2 100644 --- a/controllers/secret/secret_controller.go +++ b/controllers/secret/secret_controller.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -10,6 +10,7 @@ import ( "context" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/controllers/reconciler" + "github.com/oracle/coherence-operator/pkg/clients" corev1 "k8s.io/api/core/v1" "sigs.k8s.io/controller-runtime/pkg/manager" "sigs.k8s.io/controller-runtime/pkg/reconcile" @@ -25,7 +26,7 @@ const ( var _ reconcile.Reconciler = &ReconcileSecret{} // NewSecretReconciler returns a new Secret reconciler. -func NewSecretReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconciler { +func NewSecretReconciler(mgr manager.Manager, cs clients.ClientSet) reconciler.SecondaryResourceReconciler { r := &ReconcileSecret{ SimpleReconciler: reconciler.SimpleReconciler{ ReconcileSecondaryResource: reconciler.ReconcileSecondaryResource{ @@ -36,7 +37,7 @@ func NewSecretReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconc }, } - r.SetCommonReconciler(controllerName, mgr) + r.SetCommonReconciler(controllerName, mgr, cs) return r } diff --git a/controllers/servicemonitor/servicemonitor_controller.go b/controllers/servicemonitor/servicemonitor_controller.go index 278dabd29..1ace3d955 100644 --- a/controllers/servicemonitor/servicemonitor_controller.go +++ b/controllers/servicemonitor/servicemonitor_controller.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -12,6 +12,7 @@ import ( "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/controllers/reconciler" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/utils" "github.com/pkg/errors" monitoring "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" @@ -33,7 +34,7 @@ const ( var _ reconcile.Reconciler = &ReconcileServiceMonitor{} // NewServiceMonitorReconciler returns a new ServiceMonitor reconciler. 
-func NewServiceMonitorReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconciler { +func NewServiceMonitorReconciler(mgr manager.Manager, cs clients.ClientSet) reconciler.SecondaryResourceReconciler { r := &ReconcileServiceMonitor{ ReconcileSecondaryResource: reconciler.ReconcileSecondaryResource{ Kind: coh.ResourceTypeServiceMonitor, @@ -43,7 +44,7 @@ func NewServiceMonitorReconciler(mgr manager.Manager) reconciler.SecondaryResour monClient: client.NewForConfigOrDie(mgr.GetConfig()), } - r.SetCommonReconciler(controllerName, mgr) + r.SetCommonReconciler(controllerName, mgr, cs) return r } @@ -81,7 +82,6 @@ func (in *ReconcileServiceMonitor) Reconcile(ctx context.Context, request reconc // ReconcileAllResourceOfKind reconciles the state of the desired ServiceMonitors for the reconciler func (in *ReconcileServiceMonitor) ReconcileAllResourceOfKind(ctx context.Context, request reconcile.Request, d coh.CoherenceResource, storage utils.Storage) (reconcile.Result, error) { logger := in.GetLog().WithValues("Namespace", request.Namespace, "Name", request.Name, "Kind", in.Kind.Name()) - logger.Info(fmt.Sprintf("Reconciling all %v", in.Kind)) var err error @@ -104,7 +104,7 @@ func (in *ReconcileServiceMonitor) ReconcileAllResourceOfKind(ctx context.Contex func (in *ReconcileServiceMonitor) ReconcileSingleResource(ctx context.Context, namespace, name string, owner coh.CoherenceResource, storage utils.Storage, logger logr.Logger) error { logger = logger.WithValues("Resource", name) - logger.Info(fmt.Sprintf("Reconciling single %v", in.Kind)) + logger.Info(fmt.Sprintf("Reconciling %v", in.Kind)) // See whether it is even possible to handle Prometheus ServiceMonitor resources if !in.hasServiceMonitor() { @@ -162,7 +162,7 @@ func (in *ReconcileServiceMonitor) ReconcileSingleResource(ctx context.Context, err = in.UpdateServiceMonitor(ctx, namespace, name, sm, storage, logger) } - logger.Info(fmt.Sprintf("Finished reconciling single %v", in.Kind)) + logger.Info(fmt.Sprintf("Finished reconciling %v", in.Kind)) return err } diff --git a/controllers/statefulset/statefulset_controller.go b/controllers/statefulset/statefulset_controller.go index 8a107b67e..d2015415d 100644 --- a/controllers/statefulset/statefulset_controller.go +++ b/controllers/statefulset/statefulset_controller.go @@ -12,6 +12,7 @@ import ( "github.com/go-logr/logr" coh "github.com/oracle/coherence-operator/api/v1" "github.com/oracle/coherence-operator/controllers/reconciler" + "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/probe" "github.com/oracle/coherence-operator/pkg/utils" "github.com/pkg/errors" @@ -49,7 +50,7 @@ var _ reconcile.Reconciler = &ReconcileStatefulSet{} var log = logf.Log.WithName(controllerName) // NewStatefulSetReconciler returns a new StatefulSet reconciler. 
-func NewStatefulSetReconciler(mgr manager.Manager) reconciler.SecondaryResourceReconciler { +func NewStatefulSetReconciler(mgr manager.Manager, cs clients.ClientSet) reconciler.SecondaryResourceReconciler { // Parse the StatusHA retry time from the retry := time.Minute s := os.Getenv(statusHaRetryEnv) @@ -70,7 +71,7 @@ func NewStatefulSetReconciler(mgr manager.Manager) reconciler.SecondaryResourceR statusHARetry: retry, } - r.SetCommonReconciler(controllerName, mgr) + r.SetCommonReconciler(controllerName, mgr, cs) return r } @@ -84,9 +85,6 @@ func (in *ReconcileStatefulSet) GetReconciler() reconcile.Reconciler { return in // Reconcile reads that state of the Services for a deployment and makes changes based on the // state read and the desired state based on the parent Coherence resource. func (in *ReconcileStatefulSet) Reconcile(ctx context.Context, request reconcile.Request) (reconcile.Result, error) { - logger := in.GetLog().WithValues("Namespace", request.Namespace, "Name", request.Name, "Kind", "StatefulSet") - logger.Info("Starting reconcile") - // Attempt to lock the requested resource. If the resource is locked then another // request for the same resource is already in progress so requeue this one. if ok := in.Lock(request); !ok { @@ -101,7 +99,6 @@ func (in *ReconcileStatefulSet) Reconcile(ctx context.Context, request reconcile } result, err := in.ReconcileAllResourceOfKind(ctx, request, nil, storage) - logger.Info("Completed reconcile") return result, err } @@ -212,7 +209,7 @@ func (in *ReconcileStatefulSet) execActions(ctx context.Context, sts *appsv1.Sta for _, action := range spec.Actions { if action.Probe != nil { - if ok := coherenceProbe.ExecuteProbe(ctx, deployment, sts, action.Probe); !ok { + if ok := coherenceProbe.ExecuteProbe(ctx, sts, action.Probe); !ok { log.Info("Action probe execution failed.", "probe", action.Probe) } } @@ -349,7 +346,29 @@ func (in *ReconcileStatefulSet) updateStatefulSet(ctx context.Context, deploymen func (in *ReconcileStatefulSet) patchStatefulSet(ctx context.Context, deployment coh.CoherenceResource, current, desired *appsv1.StatefulSet, storage utils.Storage, logger logr.Logger) (reconcile.Result, error) { hashMatches := in.HashLabelsMatch(current, storage) if hashMatches { - return reconcile.Result{}, nil + // Nothing to patch, see if we need to do a rolling upgrade of Pods + // If the Operator is controlling the upgrade + p := probe.CoherenceProbe{Client: in.GetClient(), Config: in.GetManager().GetConfig()} + strategy := GetUpgradeStrategy(deployment, p) + if strategy.IsOperatorManaged() { + // The Operator is managing the rolling upgrade + if current.Spec.Replicas == nil { + if current.Status.ReadyReplicas != 1 || current.Status.CurrentReplicas != 1 { + return reconcile.Result{}, nil + } + } else { + replicas := *current.Spec.Replicas + if (current.Status.CurrentReplicas+current.Status.UpdatedReplicas) != replicas || current.Status.ReadyReplicas != replicas { + return reconcile.Result{}, nil + } + } + + if current.Status.CurrentRevision == current.Status.UpdateRevision { + return reconcile.Result{}, nil + } + + return strategy.RollingUpgrade(ctx, current, in.GetClientSet().KubeClient) + } } resource, _ := storage.GetPrevious().GetResource(coh.ResourceTypeStatefulSet, current.GetName()) diff --git a/controllers/statefulset/upgrade_strategy.go b/controllers/statefulset/upgrade_strategy.go new file mode 100644 index 000000000..3ce45501b --- /dev/null +++ b/controllers/statefulset/upgrade_strategy.go @@ -0,0 +1,322 @@ +/* + * Copyright (c) 
2020, 2024, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package statefulset + +import ( + "context" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/nodes" + "github.com/oracle/coherence-operator/pkg/operator" + "github.com/oracle/coherence-operator/pkg/probe" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "time" +) + +type UpgradeStrategy interface { + IsOperatorManaged() bool + RollingUpgrade(context.Context, *appsv1.StatefulSet, kubernetes.Interface) (reconcile.Result, error) +} + +func GetUpgradeStrategy(c coh.CoherenceResource, p probe.CoherenceProbe) UpgradeStrategy { + spec, _ := c.GetStatefulSetSpec() + if spec.RollingUpdateStrategy != nil { + name := *spec.RollingUpdateStrategy + if name == coh.UpgradeManual { + return ManualUpgradeStrategy{} + } + if name == coh.UpgradeByNode { + sp := spec.GetScalingProbe() + return ByNodeUpgradeStrategy{ + cp: p, + scalingProbe: sp, + } + } + if name == coh.UpgradeByNodeLabel { + sp := spec.GetScalingProbe() + if spec.RollingUpdateLabel == nil { + return ByNodeUpgradeStrategy{ + cp: p, + scalingProbe: sp, + } + } else { + return ByNodeLabelUpgradeStrategy{ + label: *spec.RollingUpdateLabel, + cp: p, + scalingProbe: sp, + } + } + } + } + // default is by Pod + return ByPodUpgradeStrategy{} +} + +// ----- ByPodUpgradeStrategy ---------------------------------------------------------------------- + +var _ UpgradeStrategy = ByPodUpgradeStrategy{} + +type ByPodUpgradeStrategy struct { +} + +func (in ByPodUpgradeStrategy) RollingUpgrade(context.Context, *appsv1.StatefulSet, kubernetes.Interface) (reconcile.Result, error) { + return reconcile.Result{}, nil +} + +func (in ByPodUpgradeStrategy) IsOperatorManaged() bool { + return false +} + +// ----- ManualUpgradeStrategy --------------------------------------------------------------------- + +var _ UpgradeStrategy = ManualUpgradeStrategy{} + +type ManualUpgradeStrategy struct { +} + +func (in ManualUpgradeStrategy) RollingUpgrade(context.Context, *appsv1.StatefulSet, kubernetes.Interface) (reconcile.Result, error) { + return reconcile.Result{}, nil +} + +func (in ManualUpgradeStrategy) IsOperatorManaged() bool { + return false +} + +// ----- ByNodeUpgradeStrategy --------------------------------------------------------------------- + +var _ UpgradeStrategy = ByNodeUpgradeStrategy{} + +type ByNodeUpgradeStrategy struct { + cp probe.CoherenceProbe + scalingProbe *coh.Probe +} + +func (in ByNodeUpgradeStrategy) RollingUpgrade(ctx context.Context, sts *appsv1.StatefulSet, c kubernetes.Interface) (reconcile.Result, error) { + return rollingUpgrade(in.cp, in.scalingProbe, &PodNodeName{}, "NodeName", ctx, sts, c) +} + +func (in ByNodeUpgradeStrategy) IsOperatorManaged() bool { + return true +} + +// ----- ByNodeLabelUpgradeStrategy ----------------------------------------------------------------- + +var _ UpgradeStrategy = ByNodeLabelUpgradeStrategy{} + +type ByNodeLabelUpgradeStrategy struct { + cp probe.CoherenceProbe + scalingProbe *coh.Probe + label string +} + +func (in ByNodeLabelUpgradeStrategy) RollingUpgrade(ctx context.Context, sts *appsv1.StatefulSet, c kubernetes.Interface) (reconcile.Result, error) { + return rollingUpgrade(in.cp, in.scalingProbe, &PodNodeLabel{Label: in.label}, in.label, ctx, sts, c) +} + 
+func (in ByNodeLabelUpgradeStrategy) IsOperatorManaged() bool { + return true +} + +// ----- PodNodeIdSupplier ------------------------------------------------------------------------- + +type PodNodeIdSupplier interface { + GetNodeId(context.Context, kubernetes.Interface, corev1.Pod) (string, error) +} + +var _ PodNodeIdSupplier = &PodNodeName{} + +type PodNodeName struct { +} + +func (p *PodNodeName) GetNodeId(_ context.Context, _ kubernetes.Interface, pod corev1.Pod) (string, error) { + return pod.Spec.NodeName, nil +} + +var _ PodNodeIdSupplier = &PodNodeLabel{} + +type PodNodeLabel struct { + Label string + cache map[string]string +} + +func (p *PodNodeLabel) GetNodeId(ctx context.Context, c kubernetes.Interface, pod corev1.Pod) (string, error) { + var err error + var value string + var found bool + + if p.cache == nil { + p.cache = make(map[string]string) + } else { + value, found = p.cache[pod.Spec.NodeName] + if found { + return value, err + } + } + + value, err = nodes.GetExactLabelForNode(ctx, c, pod.Spec.NodeName, p.Label, log) + if err != nil { + return "", err + } + p.cache[pod.Spec.NodeName] = value + return value, err +} + +// ----- helper methods ---------------------------------------------------------------------------- + +func rollingUpgrade(cp probe.CoherenceProbe, scalingProbe *coh.Probe, fn PodNodeIdSupplier, idName string, ctx context.Context, sts *appsv1.StatefulSet, c kubernetes.Interface) (reconcile.Result, error) { + var err error + var replicas int32 + + if sts.Spec.Replicas == nil { + replicas = 1 + if sts.Status.ReadyReplicas == 0 { + return reconcile.Result{}, nil + } + } else { + replicas = *sts.Spec.Replicas + if sts.Status.ReadyReplicas != *sts.Spec.Replicas { + return reconcile.Result{}, nil + } + } + + if sts.Status.CurrentRevision == sts.Status.UpdateRevision { + return reconcile.Result{}, nil + } + + if !operator.IsNodeLookupEnabled() { + log.Info("Cannot rolling upgrade of StatefulSet by Node, the Coherence Operator has Node lookup disabled", "Namespace", sts.Namespace, "Name", sts.Name) + return reconcile.Result{}, nil + } + + log.Info("Perform rolling upgrade of StatefulSet by Node", "Namespace", sts.Namespace, "Name", sts.Name) + + pods, err := cp.GetPodsForStatefulSet(ctx, sts) + if err != nil { + log.Error(err, "Error getting list of Pods for StatefulSet", "Namespace", sts.Namespace, "Name", sts.Name) + return reconcile.Result{}, err + } + + if len(pods.Items) == 0 { + log.Info("Zero Pods found for StatefulSet", "Namespace", sts.Namespace, "Name", sts.Name) + return reconcile.Result{}, err + } + + if len(pods.Items) != int(replicas) { + log.Info("Count of Pods found for StatefulSet does not match replicas", "Namespace", sts.Namespace, "Name", sts.Name, "Replicas", replicas, "Found", len(pods.Items)) + return reconcile.Result{}, err + } + + revision := sts.Status.UpdateRevision + + podsToUpdate := corev1.PodList{} + if len(pods.Items) > 1 { + // we have multiple Pods + podsById, allPodsById, err := groupPods(ctx, c, pods, revision, fn) + if err != nil { + return reconcile.Result{}, err + } + + if len(podsById) == 0 { + // nothing to do, all pods are at the required revision + return reconcile.Result{}, nil + } + + if len(allPodsById) == 1 { + // There is only one Node, we cannot be NodeSafe so do not do anything + id, err := fn.GetNodeId(ctx, c, pods.Items[0]) + if err != nil { + return reconcile.Result{}, err + } + log.Info("All Pods have a single Node identifier and cannot be Safe, no Pods will be upgraded", "Namespace", sts.Namespace, + 
"Name", sts.Name, "Replicas", len(pods.Items), "NodeId", idName, "IdValue", id) + return reconcile.Result{}, nil + } + + // create an array of Node identifiers + var identifiers []string + for k := range podsById { + identifiers = append(identifiers, k) + } + + // Create the list of Pods to be deleted (upgraded) by picking the first identifier + node := identifiers[0] + p := podsById[node] + podsToUpdate.Items = p + } else { + // There is only on Pod and replicas == 1 + pod := pods.Items[0] + podRevision := pod.Labels["controller-revision-hash"] + if revision != podRevision { + // The single Pod is not the required revision so upgrade it + podsToUpdate.Items = append(podsToUpdate.Items, pod) + } + } + + if len(podsToUpdate.Items) > 0 { + // We have Pods to be upgraded + nodeId, _ := fn.GetNodeId(ctx, c, pods.Items[0]) + // Check Pods are "safe" + if cp.ExecuteProbeForSubSetOfPods(ctx, sts, scalingProbe, pods, podsToUpdate) { + // delete the Pods + log.Info("Upgrading all Pods for Node identifier", "Namespace", sts.Namespace, "Name", sts.Name, "NodeId", idName, "IdValue", nodeId, "Count", len(podsToUpdate.Items)) + err = deletePods(ctx, podsToUpdate, c) + } else { + log.Info("Pods failed Status HA check, upgrade is deferred for one minute", "Namespace", sts.Namespace, "Name", sts.Name, "NodeId", idName, "IdValue", nodeId) + return reconcile.Result{Requeue: true, RequeueAfter: time.Minute}, nil + } + } + + // Even if we still have nodes to upgrade, we send s non-requeue result. + // When the deleted Pods are rescheduled and become ready the StatefulSet status + // will be updated, and we will end up back in this method + return reconcile.Result{}, err +} + +// deletePods will delete the pods in a pod list +func deletePods(ctx context.Context, pods corev1.PodList, c kubernetes.Interface) error { + for _, pod := range pods.Items { + log.Info("Attempting to delete Pod to trigger upgrade", "Namespace", pod.Namespace, "Name", pod.Name) + if err := c.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{}); err != nil { + log.Error(err, "Error deleting Pod", "Namespace", pod.Namespace, "Name", pod.Name) + return err + } + log.Info("Deleted Pod", "Namespace", pod.Namespace, "Name", pod.Name) + } + return nil +} + +// groupPods returns two maps of Pods by an identifier. The first is Pods with a specific controller revision, the second is all Pods +func groupPods(ctx context.Context, c kubernetes.Interface, pods corev1.PodList, revision string, fn PodNodeIdSupplier) (map[string][]corev1.Pod, map[string][]corev1.Pod, error) { + allPodsById := make(map[string][]corev1.Pod) + podsById := make(map[string][]corev1.Pod) + for _, pod := range pods.Items { + id, err := fn.GetNodeId(ctx, c, pod) + if err != nil { + return nil, nil, err + } + allPods, found := allPodsById[id] + if !found { + allPods = make([]corev1.Pod, 0) + } + allPods = append(allPods, pod) + allPodsById[id] = allPods + + podRevision := pod.Labels["controller-revision-hash"] + if revision != podRevision { + ary, found := podsById[id] + if !found { + ary = make([]corev1.Pod, 0) + } + ary = append(ary, pod) + podsById[id] = ary + } + } + return podsById, allPodsById, nil +} diff --git a/controllers/statefulset/upgrade_strategy_test.go b/controllers/statefulset/upgrade_strategy_test.go new file mode 100644 index 000000000..d91d5790a --- /dev/null +++ b/controllers/statefulset/upgrade_strategy_test.go @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. 
+ * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package statefulset_test + +import ( + . "github.com/onsi/gomega" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/controllers/statefulset" + "github.com/oracle/coherence-operator/pkg/probe" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + "testing" +) + +func TestUseUpgradeStrategyByPodIfNotSet(t *testing.T) { + g := NewGomegaWithT(t) + + c := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment", + }, + Spec: coh.CoherenceStatefulSetResourceSpec{}, + } + + p := probe.CoherenceProbe{} + s := statefulset.GetUpgradeStrategy(c, p) + + g.Expect(s).To(BeAssignableToTypeOf(statefulset.ByPodUpgradeStrategy{})) + g.Expect(s.IsOperatorManaged()).To(BeFalse()) +} + +func TestUseUpgradeStrategyByPod(t *testing.T) { + g := NewGomegaWithT(t) + + c := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment", + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + RollingUpdateStrategy: ptr.To(coh.UpgradeByPod), + }, + } + + p := probe.CoherenceProbe{} + s := statefulset.GetUpgradeStrategy(c, p) + + g.Expect(s).To(BeAssignableToTypeOf(statefulset.ByPodUpgradeStrategy{})) + g.Expect(s.IsOperatorManaged()).To(BeFalse()) +} + +func TestUseUpgradeStrategyByNode(t *testing.T) { + g := NewGomegaWithT(t) + + c := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment", + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + RollingUpdateStrategy: ptr.To(coh.UpgradeByNode), + }, + } + + p := probe.CoherenceProbe{} + s := statefulset.GetUpgradeStrategy(c, p) + + g.Expect(s).To(BeAssignableToTypeOf(statefulset.ByNodeUpgradeStrategy{})) + g.Expect(s.IsOperatorManaged()).To(BeTrue()) +} + +func TestUseUpgradeStrategyManual(t *testing.T) { + g := NewGomegaWithT(t) + + c := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-deployment", + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + RollingUpdateStrategy: ptr.To(coh.UpgradeManual), + }, + } + + p := probe.CoherenceProbe{} + s := statefulset.GetUpgradeStrategy(c, p) + + g.Expect(s).To(BeAssignableToTypeOf(statefulset.ManualUpgradeStrategy{})) + g.Expect(s.IsOperatorManaged()).To(BeFalse()) +} diff --git a/controllers/webhook/webhook_controller.go b/controllers/webhook/webhook_controller.go index 318cf97ca..59af9738f 100644 --- a/controllers/webhook/webhook_controller.go +++ b/controllers/webhook/webhook_controller.go @@ -43,8 +43,8 @@ type CertReconciler struct { hookInstaller *HookInstaller } -func (r *CertReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager) error { - r.SetCommonReconciler(controllerName, mgr) +func (r *CertReconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, cs clients.ClientSet) error { + r.SetCommonReconciler(controllerName, mgr, cs) r.rotateBefore = operator.GetCACertRotateBefore() // determine how webhook certs will be managed diff --git a/docs/about/04_coherence_spec.adoc b/docs/about/04_coherence_spec.adoc index e4e51a375..ef5c84942 100644 --- a/docs/about/04_coherence_spec.adoc +++ b/docs/about/04_coherence_spec.adoc @@ -926,6 +926,8 @@ m| global | Global contains attributes that will be applied to all resources man m| initResources | InitResources is the optional resource requests and limits for the init-container that the Operator adds to the Pod. 
+
 ref: https://kubernetes.io/docs/concepts/configuration/manage-resources-containers/ +
  +
 The Coherence operator does not apply any default resources.
 m| *https://{k8s-doc-link}/#resourcerequirements-v1-core[corev1.ResourceRequirements] | false
+m| rollingUpdateStrategy | The rolling upgrade strategy to use. If present, the value must be one of "Pod", "Node", "NodeLabel" or "Manual". If not set, the default is "Pod". Pod will perform a rolling upgrade one Pod at a time. Node will update all Pods on a Node at the same time. NodeLabel will update all Pods on Nodes with the same value for the label specified in RollingUpdateLabel at the same time. Manual will not automatically apply any updates; Pods must be manually deleted for updates to be applied to the restarted Pod. m| *RollingUpdateStrategyType | false
+m| rollingUpdateLabel | The name of the Node label to use to group Pods during a rolling upgrade. This field only applies if RollingUpdateStrategy is set to NodeLabel. If RollingUpdateStrategy is set to NodeLabel and this field is omitted then the rolling upgrade will be by Node. It is the user's responsibility to ensure that Nodes actually have the label used for this field. The label should be one of the node labels used to set the Coherence site or rack value. m| *string | false
 |===
 <>
diff --git a/docs/applications/030_deploy_application.adoc b/docs/applications/030_deploy_application.adoc
index 6c632520c..28741fab4 100644
--- a/docs/applications/030_deploy_application.adoc
+++ b/docs/applications/030_deploy_application.adoc
@@ -1,6 +1,6 @@
 ///////////////////////////////////////////////////////////////////////////////
 
-    Copyright (c) 2020, Oracle and/or its affiliates.
+    Copyright (c) 2020, 2024, Oracle and/or its affiliates.
     Licensed under the Universal Permissive License v 1.0 as shown at
     http://oss.oracle.com/licenses/upl.
 
@@ -8,7 +8,7 @@
 
 = Deploy Coherence Applications
 
-== Deploy Coherence Application Images
+== Deploy Coherence Applications
 
 Once a custom application image has been built (as described in <>)
 a `Coherence` resource can be configured to use that image.
diff --git a/docs/applications/032_rolling_upgrade.adoc b/docs/applications/032_rolling_upgrade.adoc
new file mode 100644
index 000000000..10b0bedec
--- /dev/null
+++ b/docs/applications/032_rolling_upgrade.adoc
@@ -0,0 +1,92 @@
+///////////////////////////////////////////////////////////////////////////////
+
+    Copyright (c) 2024, Oracle and/or its affiliates.
+    Licensed under the Universal Permissive License v 1.0 as shown at
+    http://oss.oracle.com/licenses/upl.
+
+///////////////////////////////////////////////////////////////////////////////
+
+= Rolling Upgrades of Coherence Applications
+
+== Rolling Upgrades of Coherence Applications
+
+The Coherence Operator supports safe rolling upgrades of Coherence clusters.
+
+== Default Behaviour
+
+The Coherence Operator uses a StatefulSet to manage the application Pods.
+The StatefulSet is configured to perform its default rolling upgrade behaviour.
+This means that when a Coherence resource is updated, the StatefulSet will control the rolling upgrade.
+First a Pod is killed and rescheduled with the updated specification.
+When this Pod is "ready" the next Pod is killed and rescheduled, and so on until all the Pods are updated.
+Because the default readiness probe configured by the Operator waits for Coherence members to be "safe"
+(i.e. no endangered partitions) and for partition redistribution to complete, it is safe to kill the next
+Pod once the new Pod is ready.
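+
+For example, with the default behaviour nothing extra needs to be configured; simply changing the
+`Coherence` resource, typically by updating the `image` field to a new application version, and
+re-applying the yaml will trigger a rolling upgrade. The image name below is purely illustrative:
+
+[source,yaml]
+.cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: test
+spec:
+  image: my-app:1.0.1
+----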
+
+== Custom Rolling Upgrades
+
+The Coherence resource yaml has a field named `rollingUpdateStrategy` which can be used to override the default
+rolling upgrade strategy. The field can be set to one of the following values:
+
+
+|===
+|RollingUpdateStrategy |Description
+
+|`Pod`
+|This is the same as the default behaviour; one Pod at a time is upgraded.
+
+|`Node`
+|This strategy will upgrade all Pods on a Node at the same time.
+
+|`NodeLabel`
+|This strategy will upgrade all Pods on all Nodes that have a matching value for a given Node label.
+
+
+|`Manual`
+|This strategy is the same as using the `OnDelete` update strategy for a StatefulSet; Pods must be manually deleted for the upgrade to be applied.
+|===
+
+=== Upgrade By Node
+
+By default, the Operator configures Coherence to be at least "machine safe",
+using the Node as the machine identifier. This means that it should be safe to
+lose all Pods on a Node. By upgrading multiple Pods at the same time, the overall time to perform a
+rolling upgrade is less than using the default one Pod at a time behaviour.
+
+[NOTE]
+====
+When using the `Node` strategy, where multiple Pods will be killed, the remaining cluster must have enough
+capacity to recover the data and backups from the Pods that are killed.
+
+For example, if a cluster of 18 Pods is distributed over three Nodes, each Node will be running six Pods.
+When upgrading by Node, six Pods will be killed at the same time, so there must be enough capacity in the
+remaining 12 Pods to hold all the data that was in the original 18 Pods.
+====
+
+The `Node` strategy is configured by setting the `rollingUpdateStrategy` field to `Node` as shown below:
+
+[source,yaml]
+.cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: test
+spec:
+  rollingUpdateStrategy: Node
+  image: my-app:1.0.0
+----
+
+=== Upgrade By Node Label
+
+In many production Kubernetes clusters, there is a concept of zones and fault domains, with each Node belonging to
+one of these zones and domains. Typically, Nodes are labelled to indicate which zone and domain they are in.
+For example, `topology.kubernetes.io/zone` is a standard label for the zone name.
+
+These labels are used by the Coherence Operator to configure the site and rack names for a Coherence cluster
+(see the documentation on <>).
+In a properly configured cluster that is site or rack safe, it is possible to upgrade all Pods in a site or rack
+at the same time.
+
+This is a more extreme version of the `Node` strategy.
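+
+For example, a rolling upgrade by zone could be configured by setting the `rollingUpdateStrategy` field to
+`NodeLabel` and naming the Node label to group Pods by in the `rollingUpdateLabel` field (shown here using the
+standard zone label; any of the labels used to set the Coherence site or rack value could be used):
+
+[source,yaml]
+.cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: test
+spec:
+  rollingUpdateStrategy: NodeLabel
+  rollingUpdateLabel: topology.kubernetes.io/zone
+  image: my-app:1.0.0
+----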
+
diff --git a/helm-charts/coherence-operator/templates/deployment.yaml b/helm-charts/coherence-operator/templates/deployment.yaml
index f15377d76..99c9bb4c6 100644
--- a/helm-charts/coherence-operator/templates/deployment.yaml
+++ b/helm-charts/coherence-operator/templates/deployment.yaml
@@ -274,6 +274,21 @@ spec:
       tolerations:
 {{ toYaml .Values.tolerations | indent 8 }}
 {{- end }}
+{{- if .Values.topologySpreadConstraints }}
+      topologySpreadConstraints:
+{{ toYaml .Values.topologySpreadConstraints | indent 8 }}
+{{- else }}
+      topologySpreadConstraints:
+        - maxSkew: 1
+          topologyKey: topology.kubernetes.io/zone
+          whenUnsatisfiable: ScheduleAnyway
+          labelSelector:
+            matchLabels:
+              control-plane: coherence
+              app.kubernetes.io/name: coherence-operator
+              app.kubernetes.io/instance: coherence-operator-manager
+              app.kubernetes.io/version: "3.4.0"
+{{- end }}
 {{- if .Values.affinity }}
       affinity:
 {{ toYaml .Values.affinity | indent 8 }}
diff --git a/helm-charts/coherence-operator/values.yaml b/helm-charts/coherence-operator/values.yaml
index b54104e14..6c159a117 100644
--- a/helm-charts/coherence-operator/values.yaml
+++ b/helm-charts/coherence-operator/values.yaml
@@ -93,6 +93,11 @@ securityContext:

 # ---------------------------------------------------------------------------
 # Pod scheduling values

+# topologySpreadConstraints controls how Pods are spread across the cluster.
+# If not specified the default is to spread Pods over nodes using the topology.kubernetes.io/zone label.
+# ref: https://kubernetes.io/docs/concepts/scheduling-eviction/topology-spread-constraints/
+topologySpreadConstraints:
+
 # affinity controls Pod scheduling preferences.
 # ref: https://kubernetes.io/docs/concepts/configuration/assign-pod-node/#affinity-and-anti-affinity
 affinity:
diff --git a/pkg/clients/clients.go b/pkg/clients/clients.go
index 3b5b5dfda..b4cea7a1a 100644
--- a/pkg/clients/clients.go
+++ b/pkg/clients/clients.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, 2021, Oracle and/or its affiliates.
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
  */
@@ -12,7 +12,6 @@ import (
 	"k8s.io/client-go/dynamic"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
-	ctrl "sigs.k8s.io/controller-runtime"
 )

 type ClientSet struct {
@@ -22,14 +21,6 @@ type ClientSet struct {
 	DiscoveryClient *discovery.DiscoveryClient
 }

-func New() (ClientSet, error) {
-	cfg, err := ctrl.GetConfig()
-	if err != nil {
-		return ClientSet{}, err
-	}
-	return NewForConfig(cfg)
-}
-
 func NewForConfig(cfg *rest.Config) (ClientSet, error) {
 	var err error
 	c := ClientSet{}
diff --git a/pkg/nodes/nodes.go b/pkg/nodes/nodes.go
new file mode 100644
index 000000000..1081a3c5d
--- /dev/null
+++ b/pkg/nodes/nodes.go
@@ -0,0 +1,71 @@
+/*
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates.
+ * Licensed under the Universal Permissive License v 1.0 as shown at
+ * http://oss.oracle.com/licenses/upl.
+ */ + +package nodes + +import ( + "context" + "github.com/go-logr/logr" + "github.com/oracle/coherence-operator/pkg/operator" + corev1 "k8s.io/api/core/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" +) + +// GetExactLabelForNode is a GET request that returns the node label on a k8s node +func GetExactLabelForNode(ctx context.Context, c kubernetes.Interface, name, label string, log logr.Logger) (string, error) { + var prefix []string + var labels []string + labels = append(labels, label) + value, _, err := GetLabelForNode(ctx, c, name, labels, prefix, log) + return value, err +} + +// GetLabelForNode is a GET request that returns the node label on a k8s node +func GetLabelForNode(ctx context.Context, c kubernetes.Interface, name string, labels, prefixLabels []string, log logr.Logger) (string, string, error) { + var value string + labelUsed := "" + var prefixUsed = "" + var err error + + if operator.IsNodeLookupEnabled() { + var node *corev1.Node + node, err = c.CoreV1().Nodes().Get(ctx, name, metav1.GetOptions{}) + if err == nil { + var ok bool + + prefixValue := "" + for _, label := range prefixLabels { + if prefix, ok := node.Labels[label]; ok && prefix != "" { + labelUsed = label + prefixValue = prefix + "-" + break + } + } + + for _, label := range labels { + if value, ok = node.Labels[label]; ok && value != "" { + labelUsed = label + value = prefixValue + value + break + } + } + } else { + if apierrors.IsNotFound(err) { + log.Info("GET query for node labels - NotFound", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value) + } else { + log.Error(err, "GET query for node labels - Error", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value) + } + value = "" + labelUsed = "" + } + } else { + log.Info("Node labels lookup disabled", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value) + value = "" + } + return value, labelUsed, err +} diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go index ff526b91c..616a64103 100644 --- a/pkg/operator/operator.go +++ b/pkg/operator/operator.go @@ -90,6 +90,11 @@ const ( // LabelHostName is the Node label for the Node's hostname. LabelHostName = "kubernetes.io/hostname" + + // LabelTestHostName is a label applied to Pods to set a testing host name + LabelTestHostName = "coherence.oracle.com/test_hostname" + // LabelTestHealthPort is a label applied to Pods to set a testing health check port + LabelTestHealthPort = "coherence.oracle.com/test_health_port" ) var setupLog = ctrl.Log.WithName("setup") diff --git a/pkg/probe/probe.go b/pkg/probe/probe.go index f9b48ff8b..91fba0ac1 100644 --- a/pkg/probe/probe.go +++ b/pkg/probe/probe.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -54,8 +54,15 @@ func (in *CoherenceProbe) SetGetPodHostName(fn func(pod corev1.Pod) string) { } func (in *CoherenceProbe) GetPodHostName(pod corev1.Pod) string { + hostName, found := pod.Labels[operator.LabelTestHostName] + if found { + return hostName + } if in.getPodHostName == nil { - return pod.Spec.Hostname + "." + pod.Spec.Subdomain + "." + pod.GetNamespace() + ".svc" + if pod.Status.PodIP == "" { + return pod.Spec.Hostname + "." + pod.Spec.Subdomain + "." 
+ pod.GetNamespace() + ".svc" + } + return pod.Status.PodIP } return in.getPodHostName(pod) } @@ -85,7 +92,7 @@ func (in *CoherenceProbe) IsStatusHA(ctx context.Context, deployment coh.Coheren spec, found := deployment.GetStatefulSetSpec() if found { p := spec.GetScalingProbe() - return in.ExecuteProbe(ctx, deployment, sts, p) + return in.ExecuteProbe(ctx, sts, p) } return true } @@ -138,36 +145,43 @@ func (in *CoherenceProbe) SuspendServices(ctx context.Context, deployment coh.Co } log.Info("Suspending Coherence services in StatefulSet "+sts.Name, "Namespace", ns, "Name", name) - if in.ExecuteProbe(ctx, deployment, sts, stsSpec.GetSuspendProbe()) { + if in.ExecuteProbe(ctx, sts, stsSpec.GetSuspendProbe()) { return ServiceSuspendSuccessful } return ServiceSuspendFailed } -func (in *CoherenceProbe) ExecuteProbe(ctx context.Context, deployment coh.CoherenceResource, sts *appsv1.StatefulSet, probe *coh.Probe) bool { - logger := log.WithValues("Namespace", deployment.GetNamespace(), "Name", deployment.GetName()) - list := corev1.PodList{} - +func (in *CoherenceProbe) GetPodsForStatefulSet(ctx context.Context, sts *appsv1.StatefulSet) (corev1.PodList, error) { + pods := corev1.PodList{} labels := client.MatchingLabels{} for k, v := range sts.Spec.Selector.MatchLabels { labels[k] = v } + err := in.Client.List(ctx, &pods, client.InNamespace(sts.GetNamespace()), labels) + return pods, err +} - err := in.Client.List(ctx, &list, client.InNamespace(deployment.GetNamespace()), labels) +func (in *CoherenceProbe) ExecuteProbe(ctx context.Context, sts *appsv1.StatefulSet, probe *coh.Probe) bool { + pods, err := in.GetPodsForStatefulSet(ctx, sts) if err != nil { log.Error(err, "Error getting list of Pods for StatefulSet "+sts.Name) return false } + return in.ExecuteProbeForSubSetOfPods(ctx, sts, probe, pods, pods) +} + +func (in *CoherenceProbe) ExecuteProbeForSubSetOfPods(ctx context.Context, sts *appsv1.StatefulSet, probe *coh.Probe, stsPods, pods corev1.PodList) bool { + logger := log.WithValues("Namespace", sts.GetNamespace(), "Name", sts.GetName()) // All Pods must be in the Running Phase - for _, pod := range list.Items { + for _, pod := range stsPods.Items { if ready, phase := in.IsPodReady(pod); !ready { logger.Info(fmt.Sprintf("Cannot execute probe, one or more Pods is not in a ready state - %s (%v) ", pod.Name, phase)) return false } } - count := int32(len(list.Items)) + count := int32(len(stsPods.Items)) switch { case count == 0: logger.Info("Cannot find any Pods for StatefulSet " + sts.Name) @@ -180,7 +194,7 @@ func (in *CoherenceProbe) ExecuteProbe(ctx context.Context, deployment coh.Coher return false } - for _, pod := range list.Items { + for _, pod := range pods.Items { if pod.Status.Phase == "Running" { if log.Enabled() { log.Info("Using pod " + pod.Name + " to execute probe") @@ -349,6 +363,13 @@ func (in *CoherenceProbe) findPort(pod corev1.Pod, port intstr.IntOrString) (int } func (in *CoherenceProbe) findPortInPod(pod corev1.Pod, name string) (int, error) { + if name == coh.PortNameHealth { + p, found := pod.Labels[operator.LabelTestHealthPort] + if found { + return strconv.Atoi(p) + } + } + for _, container := range pod.Spec.Containers { if container.Name == coh.ContainerNameCoherence { return in.findPortInContainer(pod, container, name) diff --git a/pkg/rest/rest.go b/pkg/rest/rest.go index 199196d45..be91a864e 100644 --- a/pkg/rest/rest.go +++ b/pkg/rest/rest.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2024, Oracle and/or its affiliates. 
+ * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -11,14 +11,13 @@ import ( "context" "fmt" v1 "github.com/oracle/coherence-operator/api/v1" - "github.com/oracle/coherence-operator/pkg/clients" onet "github.com/oracle/coherence-operator/pkg/net" + "github.com/oracle/coherence-operator/pkg/nodes" "github.com/oracle/coherence-operator/pkg/operator" "github.com/pkg/errors" apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/types" - k8s "k8s.io/client-go/kubernetes" + "k8s.io/client-go/kubernetes" "net" "net/http" ctrl "sigs.k8s.io/controller-runtime" @@ -71,12 +70,19 @@ func GetServerHostAndPort() string { } // NewServer will create a new REST server -func NewServer(c clients.ClientSet) Server { +func NewServer(c kubernetes.Interface) Server { + endpoints := make(map[string]func(w http.ResponseWriter, r *http.Request)) + return NewServerWithEndpoints(c, endpoints) +} + +// NewServerWithEndpoints will create a new REST server with additional endpoints +func NewServerWithEndpoints(c kubernetes.Interface, endpoints map[string]func(w http.ResponseWriter, r *http.Request)) Server { running := make(chan struct{}) if svr == nil { svr = &server{ - client: c.KubeClient, - running: running, + running: running, + endpoints: endpoints, + client: c, } } return svr @@ -84,11 +90,12 @@ func NewServer(c clients.ClientSet) Server { type server struct { listener net.Listener - client k8s.Interface + client kubernetes.Interface mgr ctrl.Manager ctx context.Context running chan struct{} httpServer *http.Server + endpoints map[string]func(w http.ResponseWriter, r *http.Request) } // blank assignment to verify that CoherenceReconciler implements manager.LeaderElectionRunnable @@ -100,28 +107,31 @@ var _ manager.LeaderElectionRunnable = &server{} var _ manager.Runnable = &server{} // SetupWithManager configures this server from the specified Manager. 
-func (s server) SetupWithManager(mgr ctrl.Manager) error { +func (s *server) SetupWithManager(mgr ctrl.Manager) error { s.mgr = mgr return mgr.Add(s) } -func (s server) Running() <-chan struct{} { +func (s *server) Running() <-chan struct{} { return s.running } -func (s server) NeedLeaderElection() bool { +func (s *server) NeedLeaderElection() bool { // The REST server does not require leadership return false } // Start starts this REST server -func (s server) Start(context.Context) error { +func (s *server) Start(context.Context) error { if s.listener != nil { log.Info("The REST server is already started", "listenAddress", s.listener.Addr().String()) return nil } mux := http.NewServeMux() + for path, endpoint := range s.endpoints { + mux.Handle(path, handler{fn: endpoint}) + } mux.Handle("/site/", handler{fn: s.getSiteLabelForNode}) mux.Handle("/rack/", handler{fn: s.getRackLabelForNode}) mux.Handle("/status/", handler{fn: s.getCoherenceStatus}) @@ -143,23 +153,23 @@ func (s server) Start(context.Context) error { return nil } -func (s server) GetAddress() net.Addr { +func (s *server) GetAddress() net.Addr { return s.listener.Addr() } -func (s server) GetPort() int32 { +func (s *server) GetPort() int32 { t, _ := net.ResolveTCPAddr(s.listener.Addr().Network(), s.listener.Addr().String()) return int32(t.Port) } -func (s server) Close() error { +func (s *server) Close() error { ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second) defer cancel() s.httpServer.SetKeepAlivesEnabled(false) return s.httpServer.Shutdown(ctx) } -func (s server) GetHostAndPort() string { +func (s *server) GetHostAndPort() string { var service string var port int32 @@ -198,18 +208,18 @@ func (s server) GetHostAndPort() string { } // getSiteLabelForNode is a GET request that returns the node label on a k8s node to use for a Coherence site value. -func (s server) getSiteLabelForNode(w http.ResponseWriter, r *http.Request) { +func (s *server) getSiteLabelForNode(w http.ResponseWriter, r *http.Request) { var prefix []string s.getLabelForNode(operator.GetSiteLabel(), prefix, w, r) } // getRackLabelForNode is a GET request that returns the node label on a k8s node to use for a Coherence rack value. -func (s server) getRackLabelForNode(w http.ResponseWriter, r *http.Request) { +func (s *server) getRackLabelForNode(w http.ResponseWriter, r *http.Request) { s.getLabelForNode(operator.GetRackLabel(), operator.GetSiteLabel(), w, r) } // getLabelForNode is a GET request that returns the node label on a k8s node to use for a Coherence rack value. 
-func (s server) getLabelForNode(labels, prefixLabels []string, w http.ResponseWriter, r *http.Request) { +func (s *server) getLabelForNode(labels, prefixLabels []string, w http.ResponseWriter, r *http.Request) { var value string labelUsed := "" var prefixUsed = "" @@ -224,53 +234,29 @@ func (s server) getLabelForNode(labels, prefixLabels []string, w http.ResponseWr pos := strings.LastIndex(path, "/") name := r.URL.Path[1+pos:] + logWithAddress := log.WithValues("remoteAddress", r.RemoteAddr) + if operator.IsNodeLookupEnabled() { - node, err := s.client.CoreV1().Nodes().Get(context.TODO(), name, metav1.GetOptions{}) - - if err == nil { - var ok bool - - queryLabel := r.URL.Query().Get("nodeLabel") - if queryLabel == "" { - prefixValue := "" - for _, label := range prefixLabels { - if prefix, ok := node.Labels[label]; ok && prefix != "" { - labelUsed = label - prefixValue = prefix + "-" - break - } - } - - for _, label := range labels { - if value, ok = node.Labels[label]; ok && value != "" { - labelUsed = label - value = prefixValue + value - break - } - } - } else { - value = node.Labels[queryLabel] - labelUsed = queryLabel - } - } else { - if apierrors.IsNotFound(err) { - log.Info("GET query for node labels - NotFound", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value, "remoteAddress", r.RemoteAddr) - } else { - log.Error(err, "GET query for node labels - Error", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value, "remoteAddress", r.RemoteAddr) - } + queryLabel := r.URL.Query().Get("nodeLabel") + if queryLabel != "" { + labels = []string{queryLabel} + prefixLabels = []string{} + } + value, labelUsed, err = nodes.GetLabelForNode(context.TODO(), s.client, name, labels, prefixLabels, logWithAddress) + if err != nil { + logWithAddress.Error(err, "Error obtaining node label", "node", name) value = "" - labelUsed = "" } } else { - log.Info("Node labels lookup disabled", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value, "remoteAddress", r.RemoteAddr) + logWithAddress.Info("Node labels lookup disabled", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value) value = "" } w.WriteHeader(http.StatusOK) if _, err = fmt.Fprint(w, value); err != nil { - log.Error(err, "Error writing value response for node "+name) + logWithAddress.Error(err, "Error writing value response for node "+name) } else { - log.Info("GET query for node labels", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value, "remoteAddress", r.RemoteAddr) + logWithAddress.Info("GET query for node labels", "node", name, "label", labelUsed, "prefix", prefixUsed, "value", value) } } @@ -280,7 +266,7 @@ func (s server) getLabelForNode(labels, prefixLabels []string, w http.ResponseWr // in namespace "foo". // By default, the request checks that the deployment has a status of Ready. // It is possible to pass in a different status using the ?phase= query parameter. 
-func (s server) getCoherenceStatus(w http.ResponseWriter, r *http.Request) { +func (s *server) getCoherenceStatus(w http.ResponseWriter, r *http.Request) { path := r.URL.Path // strip off any trailing slash if last := len(path) - 1; last >= 0 && path[last] == '/' { diff --git a/pkg/runner/cmd_operator.go b/pkg/runner/cmd_operator.go index 6a47458e7..f9558ba3b 100644 --- a/pkg/runner/cmd_operator.go +++ b/pkg/runner/cmd_operator.go @@ -81,7 +81,7 @@ func execute() error { cfg := ctrl.GetConfigOrDie() cs, err := clients.NewForConfig(cfg) if err != nil { - return errors.Wrap(err, "unable to create clientset") + return errors.Wrap(err, "unable to create client set") } // create the client here as we use it to install CRDs then inject it into the Manager @@ -144,20 +144,22 @@ func execute() error { // Set up the Coherence reconciler if err = (&controllers.CoherenceReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("Coherence"), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { + Client: mgr.GetClient(), + ClientSet: cs, + Log: ctrl.Log.WithName("controllers").WithName("Coherence"), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr, cs); err != nil { return errors.Wrap(err, "unable to create Coherence controller") } // Set up the CoherenceJob reconciler if operator.ShouldInstallJobCRD() { if err = (&controllers.CoherenceJobReconciler{ - Client: mgr.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("CoherenceJob"), - Scheme: mgr.GetScheme(), - }).SetupWithManager(mgr); err != nil { + Client: mgr.GetClient(), + ClientSet: cs, + Log: ctrl.Log.WithName("controllers").WithName("CoherenceJob"), + Scheme: mgr.GetScheme(), + }).SetupWithManager(mgr, cs); err != nil { return errors.Wrap(err, "unable to create CoherenceJob controller") } } @@ -174,7 +176,7 @@ func execute() error { cr = &webhook.CertReconciler{ Clientset: cs, } - if err := cr.SetupWithManager(handler, mgr); err != nil { + if err := cr.SetupWithManager(handler, mgr, cs); err != nil { return errors.Wrap(err, " unable to create webhook certificate controller") } @@ -187,7 +189,7 @@ func execute() error { } // Create the REST server - restServer := rest.NewServer(cs) + restServer := rest.NewServer(cs.KubeClient) if err := restServer.SetupWithManager(mgr); err != nil { return errors.Wrap(err, " unable to start REST server") } diff --git a/test/e2e/helper/e2e-helpers.go b/test/e2e/helper/e2e-helpers.go index 4ca984add..174044670 100644 --- a/test/e2e/helper/e2e-helpers.go +++ b/test/e2e/helper/e2e-helpers.go @@ -10,38 +10,22 @@ package helper import ( goctx "context" "encoding/json" - "flag" "fmt" - "github.com/go-logr/logr" . 
"github.com/onsi/gomega" coh "github.com/oracle/coherence-operator/api/v1" - "github.com/oracle/coherence-operator/controllers" - "github.com/oracle/coherence-operator/pkg/clients" "github.com/oracle/coherence-operator/pkg/operator" - oprest "github.com/oracle/coherence-operator/pkg/rest" - "github.com/spf13/cobra" - "github.com/spf13/pflag" - "github.com/spf13/viper" "golang.org/x/net/context" "io" appsv1 "k8s.io/api/apps/v1" batchv1 "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" - v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/api/meta" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/types" - "k8s.io/client-go/kubernetes/scheme" - "k8s.io/client-go/rest" "os" - ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/cache" "sigs.k8s.io/controller-runtime/pkg/client/apiutil" "sigs.k8s.io/controller-runtime/pkg/controller/controllerutil" - "sigs.k8s.io/controller-runtime/pkg/envtest" - logf "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/log/zap" "strings" "testing" @@ -61,254 +45,6 @@ var ( Timeout = time.Minute * 3 ) -// TestContext is a context for end-to-end tests -type TestContext struct { - Config *rest.Config - Client client.Client - KubeClient kubernetes.Interface - Manager ctrl.Manager - Logger logr.Logger - Context context.Context - testEnv *envtest.Environment - stop chan struct{} - namespaces []string -} - -func (in TestContext) Logf(format string, a ...interface{}) { - in.Logger.Info(fmt.Sprintf(format, a...)) -} - -func (in TestContext) CleanupAfterTest(t *testing.T) { - t.Cleanup(func() { - if t.Failed() { - // dump the logs if the test failed - DumpOperatorLogs(t, in) - } - in.Cleanup() - }) -} - -func (in TestContext) Cleanup() { - in.Logger.Info("tearing down the test environment") - ns := GetTestNamespace() - in.CleanupNamespace(ns) - clusterNS := GetTestClusterNamespace() - if clusterNS != ns { - in.CleanupNamespace(clusterNS) - } - clientNS := GetTestClientNamespace() - in.CleanupNamespace(clientNS) - for i := range in.namespaces { - _ = in.cleanAndDeleteNamespace(in.namespaces[i]) - } - in.namespaces = nil -} - -func (in TestContext) CleanupNamespace(ns string) { - in.Logger.Info("tearing down the test environment - namespace: " + ns) - if err := WaitForCoherenceCleanup(in, ns); err != nil { - in.Logf("error waiting for cleanup to complete: %+v", err) - } - DeletePersistentVolumes(in, ns) -} - -func (in TestContext) CreateNamespace(ns string) error { - n := corev1.Namespace{ - ObjectMeta: metav1.ObjectMeta{ - Name: ns, - Namespace: ns, - }, - Spec: corev1.NamespaceSpec{}, - } - _, err := in.KubeClient.CoreV1().Namespaces().Create(in.Context, &n, metav1.CreateOptions{}) - if err != nil { - in.namespaces = append(in.namespaces, ns) - } - return err -} - -func (in TestContext) DeleteNamespace(ns string) error { - for i := range in.namespaces { - if in.namespaces[i] == ns { - err := in.cleanAndDeleteNamespace(ns) - last := len(in.namespaces) - 1 - in.namespaces[i] = in.namespaces[last] // Copy last element to index i. - in.namespaces[last-1] = "" // Erase last element (write zero value). - in.namespaces = in.namespaces[:last] // Truncate slice. 
- return err - } - } - return nil -} - -func (in TestContext) cleanAndDeleteNamespace(ns string) error { - in.CleanupNamespace(ns) - return in.KubeClient.CoreV1().Namespaces().Delete(in.Context, ns, metav1.DeleteOptions{}) -} - -func (in TestContext) Close() { - in.Cleanup() - if in.stop != nil { - close(in.stop) - } - if err := in.testEnv.Stop(); err != nil { - in.Logf("error stopping test environment: %+v", err) - } -} - -// NewContext creates a new TestContext. -func NewContext(startController bool, watchNamespaces ...string) (TestContext, error) { - testLogger := zap.New(zap.UseDevMode(true), zap.WriteTo(os.Stdout)) - logf.SetLogger(testLogger) - - // create a dummy command - Cmd := &cobra.Command{ - Use: "manager", - Short: "Start the operator manager", - } - - // configure viper for the flags and env-vars - operator.SetupFlags(Cmd, viper.GetViper()) - flagSet := pflag.NewFlagSet("operator", pflag.ContinueOnError) - flagSet.AddGoFlagSet(flag.CommandLine) - if err := viper.BindPFlags(flagSet); err != nil { - return TestContext{}, err - } - - // We need a real cluster for these tests - useCluster := true - - testLogger.WithName("test").Info("bootstrapping test environment") - testEnv := &envtest.Environment{ - UseExistingCluster: &useCluster, - AttachControlPlaneOutput: true, - CRDs: []*v1.CustomResourceDefinition{}, - } - - var err error - k8sCfg, err := testEnv.Start() - if err != nil { - return TestContext{}, err - } - - err = corev1.AddToScheme(scheme.Scheme) - if err != nil { - return TestContext{}, err - } - err = coh.AddToScheme(scheme.Scheme) - if err != nil { - return TestContext{}, err - } - - cl, err := client.New(k8sCfg, client.Options{Scheme: scheme.Scheme}) - if err != nil { - return TestContext{}, err - } - - options := ctrl.Options{ - Scheme: scheme.Scheme, - } - - if len(watchNamespaces) == 1 { - // Watch a single namespace - options.NewCache = func(config *rest.Config, opts cache.Options) (cache.Cache, error) { - opts.DefaultNamespaces = map[string]cache.Config{ - watchNamespaces[0]: {}, - } - return cache.New(config, opts) - } - } else if len(watchNamespaces) > 1 { - // Watch a multiple namespaces - options.NewCache = func(config *rest.Config, opts cache.Options) (cache.Cache, error) { - nsMap := make(map[string]cache.Config) - for _, ns := range watchNamespaces { - nsMap[ns] = cache.Config{} - } - opts.DefaultNamespaces = nsMap - return cache.New(config, opts) - } - } - - k8sManager, err := ctrl.NewManager(k8sCfg, options) - if err != nil { - return TestContext{}, err - } - - k8sClient := k8sManager.GetClient() - - cs, err := clients.NewForConfig(k8sCfg) - if err != nil { - return TestContext{}, err - } - - v, err := operator.DetectKubernetesVersion(cs) - if err != nil { - return TestContext{}, err - } - - ctx := context.Background() - - var stop chan struct{} - - // Create the REST server - restServer := oprest.NewServer(cs) - if err := restServer.SetupWithManager(k8sManager); err != nil { - return TestContext{}, err - } - - if startController { - // Ensure CRDs exist - err = coh.EnsureCRDs(ctx, v, scheme.Scheme, cl) - if err != nil { - return TestContext{}, err - } - - // Create the Coherence controller - err = (&controllers.CoherenceReconciler{ - Client: k8sManager.GetClient(), - Log: ctrl.Log.WithName("controllers").WithName("Coherence"), - }).SetupWithManager(k8sManager) - if err != nil { - return TestContext{}, err - } - - // Create the CoherenceJob controller - err = (&controllers.CoherenceJobReconciler{ - Client: k8sManager.GetClient(), - Log: 
ctrl.Log.WithName("controllers").WithName("CoherenceJob"), - }).SetupWithManager(k8sManager) - if err != nil { - return TestContext{}, err - } - } - - // Start the manager, which will start the controller and REST server - stop = make(chan struct{}) - go func() { - err = k8sManager.Start(ctx) - }() - - k8sManager.GetCache().WaitForCacheSync(ctx) - <-restServer.Running() - - time.Sleep(5 * time.Second) - - if err != nil { - return TestContext{}, err - } - - return TestContext{ - Config: k8sCfg, - Client: k8sClient, - KubeClient: cs.KubeClient, - Manager: k8sManager, - Logger: testLogger.WithName("test"), - Context: ctx, - testEnv: testEnv, - stop: stop, - }, nil -} - // WaitForStatefulSetForDeployment waits for a StatefulSet to be created for the specified deployment. func WaitForStatefulSetForDeployment(ctx TestContext, namespace string, deployment *coh.Coherence, retryInterval, timeout time.Duration) (*appsv1.StatefulSet, error) { return WaitForStatefulSet(ctx, namespace, deployment.Name, deployment.Spec.GetReplicas(), retryInterval, timeout) @@ -376,6 +112,45 @@ func WaitForStatefulSet(ctx TestContext, namespace, stsName string, replicas int return sts, err } +// WaitForStatefulSetPodCondition waits for all Pods in a StatefulSet to have the specified condition. +func WaitForStatefulSetPodCondition(ctx TestContext, namespace, stsName string, replicas int32, c corev1.PodConditionType, retryInterval, timeout time.Duration) (*appsv1.StatefulSet, error) { + sts, err := WaitForStatefulSet(ctx, namespace, stsName, replicas, retryInterval, timeout) + if err != nil { + return nil, err + } + + err = wait.PollUntilContextTimeout(ctx.Context, retryInterval, timeout, true, func(context.Context) (done bool, err error) { + pods, err := ListPodsForStatefulSet(ctx, sts) + if err != nil { + if apierrors.IsNotFound(err) { + ctx.Logf("Waiting for Pods in StatefulSet %s to reach condition %s - NotFound", stsName, c) + return false, nil + } + ctx.Logf("Waiting for Pods in StatefulSet %s to reach condition %s - %s", stsName, c, err.Error()) + return false, err + } + + ready := true + count := 0 + for _, pod := range pods.Items { + for _, cond := range pod.Status.Conditions { + if cond.Type == c { + if cond.Status == corev1.ConditionTrue { + count++ + } else { + ready = false + } + } + } + } + + ctx.Logf("Waiting for Pods in StatefulSet %s to reach condition %s = (%d/%d)", stsName, c, count, len(pods.Items)) + return ready, nil + }) + + return sts, err +} + // WaitForJob waits for a Job to be created with the specified number of replicas. 
func WaitForJob(ctx TestContext, namespace, stsName string, replicas int32, retryInterval, timeout time.Duration) (*batchv1.Job, error) { var job *batchv1.Job @@ -882,6 +657,35 @@ func ListPodsWithLabelAndFieldSelector(ctx TestContext, namespace, labelSelector return list.Items, nil } +func ListPodsForStatefulSet(ctx TestContext, sts *appsv1.StatefulSet) (corev1.PodList, error) { + pods := corev1.PodList{} + var replicas int + if sts.Spec.Replicas == nil { + replicas = 1 + } else { + replicas = int(*sts.Spec.Replicas) + } + + name := types.NamespacedName{Namespace: sts.Namespace} + for i := 0; i < replicas; i++ { + name.Name = fmt.Sprintf("%s-%d", sts.Name, i) + pod := corev1.Pod{} + err := ctx.Client.Get(ctx.Context, name, &pod) + if err != nil { + if apierrors.IsNotFound(err) { + t := metav1.Now() + pod.Namespace = name.Namespace + pod.Name = name.Name + pod.DeletionTimestamp = &t + } else { + return pods, err + } + } + pods.Items = append(pods.Items, pod) + } + return pods, nil +} + // WaitForPodReady waits for a Pods to be ready. // //goland:noinspection GoUnusedExportedFunction @@ -2180,3 +1984,10 @@ func DeletePersistentVolumes(ctx TestContext, namespace string) { } } } + +func AddLoopbackTestHostnameLabel(c *coh.Coherence) { + if c.Spec.Labels == nil { + c.Spec.Labels = map[string]string{} + } + c.Spec.Labels[operator.LabelTestHostName] = "127.0.0.1" +} diff --git a/test/e2e/helper/proj_helpers.go b/test/e2e/helper/proj_helpers.go index fa4015ee0..f51f42c82 100644 --- a/test/e2e/helper/proj_helpers.go +++ b/test/e2e/helper/proj_helpers.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -82,6 +82,8 @@ func EnsureTestEnvVars() { ensureEnvVar("TEST_IMAGE_PULL_POLICY", "IfNotPresent") ensureEnvVar("TEST_SKIP_SITE", "false") + ensureEnvVar("K3D_OPERATOR_IMAGE", "k3d-myregistry.localhost:12345/oracle/coherence-operator:1.0.0") + ensureEnvVar("TEST_COMPATIBILITY_IMAGE", "ghcr.io/oracle/operator-test-compatibility:1.0.0") ensureEnvVar("TEST_APPLICATION_IMAGE_CLIENT", "ghcr.io/oracle/operator-test-client:1.0.0") ensureEnvVar("TEST_APPLICATION_IMAGE", "ghcr.io/oracle/operator-test:1.0.0") @@ -276,6 +278,33 @@ func FindTestLogsDir() (string, error) { return pd + string(os.PathSeparator) + testLogs, nil } +// FindToolsDir returns the build tools directory. +func FindToolsDir() (string, error) { + pd, err := FindProjectRootDir() + if err != nil { + return "", err + } + return filepath.Join(pd, "build", "tools"), nil +} + +// FindK8sApiToolsDir returns the k8s API build tools directory. +func FindK8sApiToolsDir() (string, error) { + pd, err := FindProjectRootDir() + if err != nil { + return "", err + } + return filepath.Join(pd, "build", "tools", "bin", "k8s", fmt.Sprintf("1.31.0-%s-%s", runtime.GOOS, runtime.GOARCH)), nil +} + +// FindRuntimeCrdDir returns the CRD directory under the runtime assets. +func FindRuntimeCrdDir() (string, error) { + pd, err := FindProjectRootDir() + if err != nil { + return "", err + } + return filepath.Join(pd, "pkg", "data", "assets"), nil +} + // FindTestCertsDir returns the test cert directory. 
func FindTestCertsDir() (string, error) { pd, err := FindProjectRootDir() diff --git a/test/e2e/helper/test_context.go b/test/e2e/helper/test_context.go new file mode 100644 index 000000000..302c1eff4 --- /dev/null +++ b/test/e2e/helper/test_context.go @@ -0,0 +1,307 @@ +/* + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package helper + +import ( + "flag" + "fmt" + "github.com/go-logr/logr" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/controllers" + "github.com/oracle/coherence-operator/pkg/clients" + "github.com/oracle/coherence-operator/pkg/operator" + oprest "github.com/oracle/coherence-operator/pkg/rest" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + "github.com/spf13/viper" + "golang.org/x/net/context" + corev1 "k8s.io/api/core/v1" + v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "k8s.io/utils/ptr" + "net/http" + "os" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/cache" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + "testing" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "sigs.k8s.io/controller-runtime/pkg/client" + + "time" +) + +// TestContext is a context for end-to-end tests +type TestContext struct { + Config *rest.Config + Client client.Client + KubeClient kubernetes.Interface + Manager ctrl.Manager + Logger logr.Logger + Context context.Context + testEnv *envtest.Environment + stop chan struct{} + Cancel context.CancelFunc + namespaces []string + RestServer oprest.Server + RestEndpoints map[string]func(w http.ResponseWriter, r *http.Request) +} + +func (in *TestContext) Logf(format string, a ...interface{}) { + in.Logger.Info(fmt.Sprintf(format, a...)) +} + +func (in *TestContext) CleanupAfterTest(t *testing.T) { + t.Cleanup(func() { + if t.Failed() { + // dump the logs if the test failed + DumpOperatorLogs(t, *in) + } + in.Cleanup() + }) +} + +func (in *TestContext) Cleanup() { + in.Logger.Info("tearing down the test environment") + ns := GetTestNamespace() + in.CleanupNamespace(ns) + clusterNS := GetTestClusterNamespace() + if clusterNS != ns { + in.CleanupNamespace(clusterNS) + } + clientNS := GetTestClientNamespace() + in.CleanupNamespace(clientNS) + for i := range in.namespaces { + _ = in.cleanAndDeleteNamespace(in.namespaces[i]) + } + in.namespaces = nil +} + +func (in *TestContext) CleanupNamespace(ns string) { + in.Logger.Info("tearing down the test environment - namespace: " + ns) + if err := WaitForCoherenceCleanup(*in, ns); err != nil { + in.Logf("error waiting for cleanup to complete: %+v", err) + } + DeletePersistentVolumes(*in, ns) +} + +func (in *TestContext) CreateNamespace(ns string) error { + n := corev1.Namespace{ + ObjectMeta: metav1.ObjectMeta{ + Name: ns, + Namespace: ns, + }, + Spec: corev1.NamespaceSpec{}, + } + _, err := in.KubeClient.CoreV1().Namespaces().Create(in.Context, &n, metav1.CreateOptions{}) + if err != nil { + in.namespaces = append(in.namespaces, ns) + } + return err +} + +func (in *TestContext) DeleteNamespace(ns string) error { + for i := range in.namespaces { + if in.namespaces[i] == ns { + err := in.cleanAndDeleteNamespace(ns) + last := len(in.namespaces) - 1 + in.namespaces[i] = in.namespaces[last] // Copy 
last element to index i. + in.namespaces[last-1] = "" // Erase last element (write zero value). + in.namespaces = in.namespaces[:last] // Truncate slice. + return err + } + } + return nil +} + +func (in *TestContext) cleanAndDeleteNamespace(ns string) error { + in.CleanupNamespace(ns) + return in.KubeClient.CoreV1().Namespaces().Delete(in.Context, ns, metav1.DeleteOptions{}) +} + +func (in *TestContext) Close() { + in.Cleanup() + if in.stop != nil { + close(in.stop) + } + if in.testEnv != nil { + if err := in.testEnv.Stop(); err != nil { + in.Logf("error stopping test environment: %+v", err) + } + } +} + +func (in *TestContext) Start() error { + var err error + + // Create the REST server + in.RestServer = oprest.NewServerWithEndpoints(in.KubeClient, in.RestEndpoints) + if err := in.RestServer.SetupWithManager(in.Manager); err != nil { + return err + } + + // Start the manager, which will start the controller and REST server + in.stop = make(chan struct{}) + go func() { + err = in.Manager.Start(in.Context) + }() + + in.Manager.GetCache().WaitForCacheSync(in.Context) + <-in.RestServer.Running() + + time.Sleep(5 * time.Second) + return err +} + +// NewContext creates a new TestContext. +func NewContext(startController bool, watchNamespaces ...string) (TestContext, error) { + var err error + var tc TestContext + + testEnv := &envtest.Environment{ + // We need a real cluster for these tests + UseExistingCluster: ptr.To(true), + AttachControlPlaneOutput: true, + CRDs: []*v1.CustomResourceDefinition{}, + } + + tc, err = NewContextWithTestEnv(testEnv, startController, watchNamespaces...) + if err != nil { + return TestContext{}, err + } + return tc, err +} + +// NewContextWithTestEnv creates a new TestContext. +func NewContextWithTestEnv(testEnv *envtest.Environment, startController bool, watchNamespaces ...string) (TestContext, error) { + testLogger := zap.New(zap.UseDevMode(true), zap.WriteTo(os.Stdout)) + logf.SetLogger(testLogger) + + // create a dummy command + Cmd := &cobra.Command{ + Use: "manager", + Short: "Start the operator manager", + } + + // configure viper for the flags and env-vars + operator.SetupFlags(Cmd, viper.GetViper()) + flagSet := pflag.NewFlagSet("operator", pflag.ContinueOnError) + flagSet.AddGoFlagSet(flag.CommandLine) + if err := viper.BindPFlags(flagSet); err != nil { + return TestContext{}, err + } + + testLogger.WithName("test").Info("bootstrapping test environment") + + var err error + + k8sCfg := ctrl.GetConfigOrDie() + + err = corev1.AddToScheme(scheme.Scheme) + if err != nil { + return TestContext{}, err + } + err = coh.AddToScheme(scheme.Scheme) + if err != nil { + return TestContext{}, err + } + + cl, err := client.New(k8sCfg, client.Options{Scheme: scheme.Scheme}) + if err != nil { + return TestContext{}, err + } + + options := ctrl.Options{ + Scheme: scheme.Scheme, + } + + if len(watchNamespaces) == 1 { + // Watch a single namespace + options.NewCache = func(config *rest.Config, opts cache.Options) (cache.Cache, error) { + opts.DefaultNamespaces = map[string]cache.Config{ + watchNamespaces[0]: {}, + } + return cache.New(config, opts) + } + } else if len(watchNamespaces) > 1 { + // Watch a multiple namespaces + options.NewCache = func(config *rest.Config, opts cache.Options) (cache.Cache, error) { + nsMap := make(map[string]cache.Config) + for _, ns := range watchNamespaces { + nsMap[ns] = cache.Config{} + } + opts.DefaultNamespaces = nsMap + return cache.New(config, opts) + } + } + + k8sManager, err := ctrl.NewManager(k8sCfg, options) + if err != nil { + 
return TestContext{}, err + } + + k8sClient := k8sManager.GetClient() + + cs, err := clients.NewForConfig(k8sCfg) + if err != nil { + return TestContext{}, err + } + + v, err := operator.DetectKubernetesVersion(cs) + if err != nil { + return TestContext{}, err + } + + ctx, cancel := context.WithCancel(context.Background()) + + var stop chan struct{} + + if startController { + // Ensure CRDs exist + err = coh.EnsureCRDs(ctx, v, scheme.Scheme, cl) + if err != nil { + return TestContext{}, err + } + + // Create the Coherence controller + err = (&controllers.CoherenceReconciler{ + Client: k8sManager.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("Coherence"), + }).SetupWithManager(k8sManager, cs) + if err != nil { + return TestContext{}, err + } + + // Create the CoherenceJob controller + err = (&controllers.CoherenceJobReconciler{ + Client: k8sManager.GetClient(), + Log: ctrl.Log.WithName("controllers").WithName("CoherenceJob"), + }).SetupWithManager(k8sManager, cs) + if err != nil { + return TestContext{}, err + } + } + + ep := make(map[string]func(w http.ResponseWriter, r *http.Request)) + return TestContext{ + Config: k8sCfg, + Client: k8sClient, + KubeClient: cs.KubeClient, + Manager: k8sManager, + Logger: testLogger.WithName("test"), + Context: ctx, + testEnv: testEnv, + stop: stop, + Cancel: cancel, + RestEndpoints: ep, + }, nil +} diff --git a/test/e2e/large-cluster/large_cluster_suite_test.go b/test/e2e/large-cluster/large_cluster_suite_test.go new file mode 100644 index 000000000..b6b03757f --- /dev/null +++ b/test/e2e/large-cluster/large_cluster_suite_test.go @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package large_cluster + +import ( + "fmt" + . "github.com/onsi/gomega" + cohv1 "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/test/e2e/helper" + appsv1 "k8s.io/api/apps/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + "net/http" + "os" + "testing" + "time" +) + +var testContext helper.TestContext +var nodeList *corev1.NodeList +var nodeMap map[string]corev1.Node +var clusterCount = 0 +var statusHA = true + +var zones = [...]string{"zone-1", "zone-2", "zone-3"} +var faultDomains = [...]string{"fd-1", "fd-2"} + +// The entry point for the test suite +func TestMain(m *testing.M) { + var err error + + helper.EnsureTestEnvVars() + + // Create a new TestContext + if testContext, err = helper.NewContext(true); err != nil { + fmt.Printf("Error: %+v", err) + os.Exit(1) + } + + testContext.RestEndpoints["/ha"] = isStatusHA + + if err = testContext.Start(); err != nil { + fmt.Printf("Error: %+v", err) + os.Exit(1) + } + + nodeMap = make(map[string]corev1.Node) + nodeList, err = testContext.KubeClient.CoreV1().Nodes().List(testContext.Context, metav1.ListOptions{}) + if err != nil { + fmt.Printf("Failed to get Node list from K8s. 
%+v", err) + os.Exit(1) + } + + zoneCount := len(zones) + fdCount := len(faultDomains) + for i, node := range nodeList.Items { + nodeMap[node.Name] = node + zone := zones[i%zoneCount] + fd := faultDomains[(i/zoneCount)%fdCount] + labels := node.Labels + labels["failure-domain.beta.kubernetes.io/region"] = "region-1" + labels["failure-domain.beta.kubernetes.io/zone"] = zone + labels["topology.kubernetes.io/region"] = "region-1" + labels["topology.kubernetes.io/zone"] = zone + labels["oci.oraclecloud.com/fault-domain"] = fd + node.Labels = labels + updated, err := testContext.KubeClient.CoreV1().Nodes().Update(testContext.Context, &node, metav1.UpdateOptions{}) + if err != nil { + fmt.Printf("Failed to label Node. %+v", err) + os.Exit(1) + } + nodeList.Items[i] = *updated + } + + exitCode := m.Run() + testContext.Logf("Tests completed with return code %d", exitCode) + testContext.Close() + os.Exit(exitCode) +} + +func SetStatusHA(ha bool) { + statusHA = ha +} + +func isStatusHA(w http.ResponseWriter, r *http.Request) { + if statusHA { + w.WriteHeader(http.StatusOK) + } else { + w.WriteHeader(http.StatusServiceUnavailable) + } + _, _ = fmt.Fprint(w, "") +} + +func GenerateClusterName() string { + clusterCount++ + return fmt.Sprintf("cluster-%d", clusterCount) +} + +func GetTestContext() helper.TestContext { + return testContext +} +func GetRestPort() int32 { + return testContext.RestServer.GetPort() +} + +// installSimpleDeployment installs a deployment and asserts that the underlying +// StatefulSet resources reach the correct state. +func installSimpleDeployment(t *testing.T, d cohv1.Coherence) (cohv1.Coherence, appsv1.StatefulSet) { + g := NewGomegaWithT(t) + helper.AddLoopbackTestHostnameLabel(&d) + err := testContext.Client.Create(testContext.Context, &d) + g.Expect(err).NotTo(HaveOccurred()) + return assertDeploymentEventuallyInDesiredState(t, d, d.GetReplicas()) +} + +// assertDeploymentEventuallyInDesiredState asserts that a Coherence resource exists and has the correct spec and that the +// underlying StatefulSet exists with the correct status and ready replicas. 
+func assertDeploymentEventuallyInDesiredState(t *testing.T, d cohv1.Coherence, replicas int32) (cohv1.Coherence, appsv1.StatefulSet) { + g := NewGomegaWithT(t) + + testContext.Logf("Asserting Coherence resource %s exists with %d replicas", d.Name, replicas) + + // create a DeploymentStateCondition that checks a deployment's replica count + condition := helper.ReplicaCountCondition(replicas) + + // wait for the deployment to match the condition + _, err := helper.WaitForCoherenceCondition(testContext, d.Namespace, d.Name, condition, time.Second*10, time.Minute*5) + g.Expect(err).NotTo(HaveOccurred()) + + testContext.Logf("Asserting StatefulSet %s exists with %d replicas", d.Name, replicas) + + // wait for the StatefulSet to have the required ready replicas + sts, err := helper.WaitForStatefulSet(testContext, d.Namespace, d.Name, replicas, time.Second*10, time.Minute*5) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(sts.Status.ReadyReplicas).To(Equal(replicas)) + + testContext.Logf("Asserting StatefulSet %s exist with %d replicas - Done!", d.Name, replicas) + + err = testContext.Client.Get(testContext.Context, types.NamespacedName{Namespace: d.Namespace, Name: d.Name}, &d) + g.Expect(err).NotTo(HaveOccurred()) + err = testContext.Client.Get(testContext.Context, types.NamespacedName{Namespace: d.Namespace, Name: d.Name}, sts) + g.Expect(err).NotTo(HaveOccurred()) + + return d, *sts +} diff --git a/test/e2e/large-cluster/rolling_upgrade_test.go b/test/e2e/large-cluster/rolling_upgrade_test.go new file mode 100644 index 000000000..f400a82d1 --- /dev/null +++ b/test/e2e/large-cluster/rolling_upgrade_test.go @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. + * Licensed under the Universal Permissive License v 1.0 as shown at + * http://oss.oracle.com/licenses/upl. + */ + +package large_cluster + +import ( + "fmt" + . 
"github.com/onsi/gomega" + coh "github.com/oracle/coherence-operator/api/v1" + "github.com/oracle/coherence-operator/pkg/operator" + "github.com/oracle/coherence-operator/test/e2e/helper" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/utils/ptr" + "slices" + "testing" + "time" +) + +func TestUpgradeByNodeWithOnePodPerNode(t *testing.T) { + testContext.CleanupAfterTest(t) + + ns := helper.GetTestNamespace() + + labels := map[string]string{} + labels[operator.LabelTestHostName] = "127.0.0.1" + labels[operator.LabelTestHealthPort] = fmt.Sprintf("%d", GetRestPort()) + + replicas := 3 + + c, _ := installSimpleDeployment(t, coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: GenerateClusterName(), + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Replicas: ptr.To(int32(replicas)), + Labels: labels, + ReadinessProbe: &coh.ReadinessProbeSpec{ + InitialDelaySeconds: ptr.To(int32(10)), + PeriodSeconds: ptr.To(int32(10)), + FailureThreshold: ptr.To(int32(20)), + }, + }, + RollingUpdateStrategy: ptr.To(coh.UpgradeByNode), + }, + }) + + nodeNameGetter := func(pod corev1.Pod) string { + return pod.Spec.NodeName + } + + DoUpgradeTest(t, c, nodeNameGetter) +} + +func TestUpgradeByNodeWithTwoPodsPerNode(t *testing.T) { + testContext.CleanupAfterTest(t) + + ns := helper.GetTestNamespace() + + labels := map[string]string{} + labels[operator.LabelTestHostName] = "127.0.0.1" + labels[operator.LabelTestHealthPort] = fmt.Sprintf("%d", GetRestPort()) + + replicas := len(nodeList.Items) * 2 + + c, _ := installSimpleDeployment(t, coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: GenerateClusterName(), + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Replicas: ptr.To(int32(replicas)), + Labels: labels, + ReadinessProbe: &coh.ReadinessProbeSpec{ + InitialDelaySeconds: ptr.To(int32(10)), + PeriodSeconds: ptr.To(int32(10)), + FailureThreshold: ptr.To(int32(20)), + }, + }, + RollingUpdateStrategy: ptr.To(coh.UpgradeByNode), + }, + }) + + nodeNameGetter := func(pod corev1.Pod) string { + return pod.Spec.NodeName + } + + DoUpgradeTest(t, c, nodeNameGetter) +} + +func TestUpgradeByNodeLabelTwoPodsPerNode(t *testing.T) { + testContext.CleanupAfterTest(t) + + ns := helper.GetTestNamespace() + + labels := map[string]string{} + labels[operator.LabelTestHostName] = "127.0.0.1" + labels[operator.LabelTestHealthPort] = fmt.Sprintf("%d", GetRestPort()) + + replicas := len(nodeList.Items) * 2 + + zoneLabel := "topology.kubernetes.io/zone" + + c, _ := installSimpleDeployment(t, coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: GenerateClusterName(), + }, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Replicas: ptr.To(int32(replicas)), + Labels: labels, + ReadinessProbe: &coh.ReadinessProbeSpec{ + InitialDelaySeconds: ptr.To(int32(10)), + PeriodSeconds: ptr.To(int32(10)), + FailureThreshold: ptr.To(int32(20)), + }, + }, + RollingUpdateStrategy: ptr.To(coh.UpgradeByNodeLabel), + RollingUpdateLabel: &zoneLabel, + }, + }) + + nodeZoneGetter := func(pod corev1.Pod) string { + node, ok := nodeMap[pod.Spec.NodeName] + if ok { + return node.Labels[zoneLabel] + } + return "" + } + + DoUpgradeTest(t, c, nodeZoneGetter) +} + +func DoUpgradeTest(t *testing.T, c coh.Coherence, idFunction func(corev1.Pod) string) { + var err error + g := NewWithT(t) + + replicas := 
int(c.GetReplicas()) + ns := c.Namespace + + before, err := helper.ListCoherencePodsForDeployment(testContext, c.Namespace, c.Name) + g.Expect(err).To(BeNil()) + g.Expect(len(before)).To(Equal(replicas)) + + if c.Spec.Labels == nil { + c.Spec.Labels = make(map[string]string) + } + c.Spec.Labels["test"] = "one" + + err = testContext.Client.Update(testContext.Context, &c) + g.Expect(err).NotTo(HaveOccurred()) + + _, err = helper.WaitForPodsWithLabel(testContext, ns, "test=one", int(replicas), time.Second*5, time.Minute*10) + g.Expect(err).NotTo(HaveOccurred()) + + // Wait for the final Pod to be ready, i.e. all Pods in ready state + _, err = helper.WaitForStatefulSetPodCondition(testContext, c.Namespace, c.Name, int32(replicas), corev1.PodReady, time.Second*10, time.Minute*5) + g.Expect(err).To(BeNil()) + + after, err := helper.ListCoherencePodsForDeployment(testContext, c.Namespace, c.Name) + g.Expect(err).To(BeNil()) + g.Expect(len(after)).To(Equal(replicas)) + + AssertPodRestartTimes(t, before, after, idFunction) +} + +func AssertPodRestartTimes(t *testing.T, before, after []corev1.Pod, idFunction func(corev1.Pod) string) { + g := NewWithT(t) + + // get the list of nodes the before pods were running on + nodeNames := make([]string, 0) + m := SplitPodsById(before, idFunction) + for name := range m { + nodeNames = append(nodeNames, name) + } + + // dump data to logs for analysis on failure + for _, pods := range m { + for _, pod := range pods { + ap, ok := FindPod(pod.Name, after) + g.Expect(ok).To(BeTrue()) + t.Logf(">>>> Pod %s beforeNode %s beforeId %s afterNode %s afterId %s scheduled %v ready %v", + pod.Name, pod.Spec.NodeName, idFunction(pod), ap.Spec.NodeName, idFunction(ap), GetPodScheduledTime(ap), GetPodReadyTime(ap)) + } + } + + // for each after node get min scheduled and max ready and store by before node + mapScheduled := make(map[string]time.Time) + mapReady := make(map[string]time.Time) + + for _, pod := range after { + scheduled := GetPodScheduledTime(pod) + ready := GetPodReadyTime(pod) + + bp, ok := FindPod(pod.Name, before) + g.Expect(ok).To(BeTrue()) + n := idFunction(bp) + + s, found := mapScheduled[n] + if !found || s.After(scheduled) { + mapScheduled[n] = scheduled + } + + r, found := mapReady[n] + if !found || r.Before(ready) { + mapReady[n] = ready + } + } + + // make sure none of the ranges overlap + for name, scheduled := range mapScheduled { + ready := mapReady[name] + for _, otherName := range nodeNames { + if otherName != name { + scheduledOther := mapScheduled[otherName] + readyOther := mapReady[otherName] + // compare scheduled time + i := scheduled.Compare(scheduledOther) + // must not be equal + g.Expect(i).NotTo(BeZero()) + // ready must be the same comparison as scheduled, + //i.e. 
if scheduled is before then ready must be before + g.Expect(ready.Compare(readyOther)).To(Equal(i), fmt.Sprintf("node %s scheduled and ready overlap with node %s", name, otherName)) + } + } + } +} + +func SplitPodsById(pods []corev1.Pod, idFunction func(corev1.Pod) string) map[string][]corev1.Pod { + m := make(map[string][]corev1.Pod) + for _, pod := range pods { + nodeName := idFunction(pod) + podsForNode, found := m[nodeName] + if !found { + podsForNode = make([]corev1.Pod, 0) + } + podsForNode = append(podsForNode, pod) + m[nodeName] = podsForNode + } + return m +} + +func SortPodsByScheduledTime(pods []corev1.Pod) { + sorter := func(a, b corev1.Pod) int { + aTime := GetPodScheduledTime(a) + bTime := GetPodScheduledTime(b) + return aTime.Compare(bTime) + } + slices.SortFunc(pods, sorter) +} + +func GetPodScheduledTime(pod corev1.Pod) time.Time { + for _, c := range pod.Status.Conditions { + if c.Type == corev1.PodScheduled && c.Status == corev1.ConditionTrue { + return c.LastTransitionTime.Time + } + } + return time.Time{} +} + +func GetPodReadyTime(pod corev1.Pod) time.Time { + for _, c := range pod.Status.Conditions { + if c.Type == corev1.PodReady && c.Status == corev1.ConditionTrue { + return c.LastTransitionTime.Time + } + } + return time.Time{} +} + +func FindPod(name string, pods []corev1.Pod) (corev1.Pod, bool) { + for _, pod := range pods { + if pod.Name == name { + return pod, true + } + } + return corev1.Pod{}, false +} From aae8cb18d17f9ad1489f19fa35c68c76737dfb6a Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Fri, 20 Sep 2024 17:19:15 +0300 Subject: [PATCH 2/9] Fixing broken test harness --- .github/workflows/build.yaml | 6 +- .github/workflows/coherence-matrix.yaml | 3 +- .github/workflows/compatibility-tests.yaml | 3 +- .github/workflows/istio-tests.yaml | 3 +- .github/workflows/k3d-tests.yaml | 16 +--- .github/workflows/k8s-matrix.yaml | 3 +- .github/workflows/minikube-matrix.yaml | 3 +- .github/workflows/prometheus-tests.yaml | 3 +- .github/workflows/tanzu-tests.yaml | 6 +- docs/applications/032_rolling_upgrade.adoc | 104 ++++++++++++++++++++- test/certification/suite_test.go | 2 +- test/coherence_compatibility/suite_test.go | 2 +- test/e2e/clients/suite_test.go | 2 +- test/e2e/compatibility/suite_test.go | 2 +- test/e2e/helm/suite_test.go | 2 +- test/e2e/helper/test_context.go | 43 ++++----- test/e2e/local/suite_test.go | 2 +- test/e2e/prometheus/suite_test.go | 2 +- test/e2e/remote/e2e_suite_test.go | 2 +- 19 files changed, 156 insertions(+), 53 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 93ebdf6f2..ee9ce0844 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -151,6 +151,7 @@ jobs: with: name: coherence-operator-manifests.tar.gz path: build/_output/coherence-operator-manifests.tar.gz + if-no-files-found: ignore - name: Upload Yaml uses: actions/upload-artifact@v4 @@ -158,6 +159,7 @@ jobs: with: name: coherence-operator.yaml path: build/_output/coherence-operator.yaml + if-no-files-found: ignore - name: Upload CRD uses: actions/upload-artifact@v4 @@ -165,12 +167,14 @@ jobs: with: name: coherence.oracle.com_coherence.yaml path: build/_output/manifests/crd/coherence.oracle.com_coherence.yaml + if-no-files-found: ignore - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output path: build/_output/test-logs + if-no-files-found: ignore - name: Deploy Snapshots & Docs if: ${{ github.ref == 'refs/heads/main' && success() }} diff --git 
a/.github/workflows/coherence-matrix.yaml b/.github/workflows/coherence-matrix.yaml index fc6caffa8..75574a527 100644 --- a/.github/workflows/coherence-matrix.yaml +++ b/.github/workflows/coherence-matrix.yaml @@ -213,7 +213,8 @@ jobs: make coherence-compatibility-test - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output-${{ matrix.matrixName }} path: build/_output/test-logs + if-no-files-found: ignore diff --git a/.github/workflows/compatibility-tests.yaml b/.github/workflows/compatibility-tests.yaml index 4ca14b088..cb28a8aa8 100644 --- a/.github/workflows/compatibility-tests.yaml +++ b/.github/workflows/compatibility-tests.yaml @@ -193,7 +193,8 @@ jobs: make compatibility-test - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output-${{ matrix.compatibilityVersion }} path: build/_output/test-logs + if-no-files-found: ignore diff --git a/.github/workflows/istio-tests.yaml b/.github/workflows/istio-tests.yaml index 3958a3907..7d96ea5b6 100644 --- a/.github/workflows/istio-tests.yaml +++ b/.github/workflows/istio-tests.yaml @@ -139,7 +139,8 @@ jobs: ISTIO_VERSION=${{ matrix.istioVersion }} make uninstall-istio - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output-${{ matrix.istioVersion }} path: build/_output/test-logs + if-no-files-found: ignore diff --git a/.github/workflows/k3d-tests.yaml b/.github/workflows/k3d-tests.yaml index 123de91d0..dcb2139c4 100644 --- a/.github/workflows/k3d-tests.yaml +++ b/.github/workflows/k3d-tests.yaml @@ -88,23 +88,17 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | sudo chown -R runner:runner /run/systemd/resolve/stub-resolv.conf sudo echo nameserver 8.8.8.8 > /run/systemd/resolve/stub-resolv.conf - - name: Start K3d Cluster + - name: Run K3d Tests shell: bash run: | echo "${{ secrets.GITHUB_TOKEN }}" | docker login ghcr.io -u $ --password-stdin + make build-operator make k3d kubectl version kubectl get nodes @@ -116,12 +110,10 @@ jobs: shell: bash timeout-minutes: 60 run: | - make build-operator - make kind-load - make e2e-prometheus-test + make e2e-k3d-test - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output path: build/_output/test-logs diff --git a/.github/workflows/k8s-matrix.yaml b/.github/workflows/k8s-matrix.yaml index 1113b511a..4d78cedd5 100644 --- a/.github/workflows/k8s-matrix.yaml +++ b/.github/workflows/k8s-matrix.yaml @@ -161,7 +161,8 @@ jobs: ./hack/k8s-certification.sh - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output-${{ matrix.matrixName }} path: build/_output/test-logs + if-no-files-found: ignore diff --git a/.github/workflows/minikube-matrix.yaml b/.github/workflows/minikube-matrix.yaml index e1011589d..96043159c 100644 --- a/.github/workflows/minikube-matrix.yaml +++ b/.github/workflows/minikube-matrix.yaml @@ -141,7 +141,8 @@ jobs: ./hack/k8s-certification.sh - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output-${{ matrix.matrixName }} path: build/_output/test-logs + if-no-files-found: ignore diff 
--git a/.github/workflows/prometheus-tests.yaml b/.github/workflows/prometheus-tests.yaml index 980126d75..fb90252fd 100644 --- a/.github/workflows/prometheus-tests.yaml +++ b/.github/workflows/prometheus-tests.yaml @@ -121,7 +121,8 @@ jobs: make e2e-prometheus-test - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output path: build/_output/test-logs + if-no-files-found: ignore diff --git a/.github/workflows/tanzu-tests.yaml b/.github/workflows/tanzu-tests.yaml index dd488b25e..5dda6de5d 100644 --- a/.github/workflows/tanzu-tests.yaml +++ b/.github/workflows/tanzu-tests.yaml @@ -140,13 +140,15 @@ jobs: make run-certification OPERATOR_NAMESPACE=coherence - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: tanzu-artifacts path: build/_output/tanzu + if-no-files-found: ignore - uses: actions/upload-artifact@v4 - if: failure() + if: ${{ failure() || cancelled() }} with: name: test-output path: build/_output/test-logs + if-no-files-found: ignore diff --git a/docs/applications/032_rolling_upgrade.adoc b/docs/applications/032_rolling_upgrade.adoc index 10b0bedec..0c64eb262 100644 --- a/docs/applications/032_rolling_upgrade.adoc +++ b/docs/applications/032_rolling_upgrade.adoc @@ -28,7 +28,6 @@ to kill the next Pod. The Coherence resource yaml has a field named `RollingUpdateStrategy` which can be used to override the default rolling upgrade strategy. The field can be set to one of the following values: - |=== |RollingUpdateStrategy |Description @@ -46,6 +45,32 @@ rolling upgrade strategy. The field can be set to one of the following values: |This strategy is the same as the `Manual` rolling upgrade configuration for a StatefulSet. |=== +The default "by Pod" strategy is the slowest but safest strategy. +For a very large cluster upgrading by Pod may take a long time. For example, if each Pod takes two minutes to be +rescheduled and become ready, and a cluster has 100 Pods, that will be 200 minutes, (over three hours) to upgrade. +In a lot of use cases the time taken to upgrade is not an issue, Coherence continues to serve requests while the +upgrade is in progress. But, sometimes a faster upgrade is required, which is where the other strategies can be used. + +=== Upgrade By Pod + +Upgrading by Pod is the default strategy described above. + +The `Pod` strategy is configured by omitting the `rollingUpdateStrategy` field, +or by setting the `rollingUpdateStrategy` field to `Pod` as shown below: + +[source,yaml] +.cluster.yaml +---- +apiVersion: coherence.oracle.com/v1 +kind: Coherence +metadata: + name: test +spec: + rollingUpdateStrategy: Pod + image: my-app:1.0.0 +---- + + === Upgrade By Node By default, the Operator configures Coherence to be at least "machine safe", @@ -79,6 +104,9 @@ spec: === Upgrade By Node Label +The `NodeLabel` strategy will perform a rolling upgrade by using a label on Nodes to group Nodes together. +Then all Pods on all the Nodes in a group (i.e. with the same label value) will be upgraded together. + In many production Kubernetes clusters, there is a concept of zones and fault domains, with each Node belonging to one of these zones and domains. Typically, Nodes are labelled to indicate which zone and domain they are in. For example the `topology.kubernetes.io/zone` is a standard label for the zone name. 
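Before relying on a topology label for an upgrade strategy, it is worth confirming that every Node in the cluster actually carries that label. A minimal check, assuming the standard zone label above is the one in use, might look like this:

[source,bash]
----
# List every Node together with the value of its zone label.
# A Node with an empty value in the label column is missing the label
# and could not be grouped correctly by a label based upgrade strategy.
kubectl get nodes -L topology.kubernetes.io/zone
----

Any Node shown without a value would need to be labelled before a label based strategy could group it with the rest of its zone.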
@@ -86,7 +114,77 @@ For example the `topology.kubernetes.io/zone` is a standard label for the zone n
 These labels are used by the Coherence Operator to configure the site and rack names for a Coherence cluster.
 (see the documentation on <>).
 In a properly configured cluster that is site or rack safe, it is possible to upgrade all Pods in a site or rack
-at the same time.
-
+at the same time. In a typical cloud Kubernetes cluster there may be three zones, so a rolling upgrade by zone (or site)
+would upgrade the cluster in three parts, which would be much faster than Pod by Pod. This is a more extreme version of the `Node` strategy.
+[NOTE]
+====
+When using the `NodeLabel` strategy where multiple Pods will be killed, the remaining cluster must have enough
+capacity to recover the data and backups from the Pods that are killed.
+
+For example, if a cluster of 18 Pods is distributed over Nodes in three zones, each zone will be running six Pods.
+When upgrading by Node label, six Pods will be killed at the same time, so there must be enough capacity in the
+remaining 12 Pods to hold all the data that was in the original 18 Pods.
+====
+
+The `NodeLabel` strategy is configured by setting the `rollingUpdateStrategy` field to `NodeLabel`
+and also setting the `rollingUpdateLabel` field to the name of the label to use.
+
+For example, to perform a rolling upgrade of all Pods by zone, the yaml below could be used:
+
+[source,yaml]
+.cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: test
+spec:
+  rollingUpdateStrategy: NodeLabel
+  rollingUpdateLabel: "topology.kubernetes.io/zone"
+  image: my-app:1.0.0
+----
+
+[CAUTION]
+====
+It is up to the customer to verify that the label used is appropriate, i.e. is one of the labels used for the
+Coherence site or rack configuration. It is also important to ensure that all Nodes in the cluster actually have
+the label.
+
+It is also up to the customer to verify that the Coherence cluster to be upgraded is site or rack safe before the
+upgrade begins. The Coherence Operator can determine that no services are endangered, but it cannot determine site
+or rack safety.
+====
+
+=== Manual Upgrade
+
+If the `rollingUpdateStrategy` is set to `Manual` then neither the Coherence Operator nor the StatefulSet controller in
+Kubernetes will upgrade the Pods.
+When the manual strategy is used the StatefulSet's `spec.updateStrategy.type` field is set to `OnDelete`.
+After updating a Coherence resource, the StatefulSet will be updated with the new state, but none of the Pods will be upgraded.
+Pods must then be manually deleted so that they are rescheduled with the new configuration.
+Pods can be deleted in any order and any number at a time.
+(see https://kubernetes.io/docs/concepts/workloads/controllers/statefulset/#update-strategies[StatefulSet Update Strategies]
+in the Kubernetes documentation).
+
+The `Manual` strategy is configured by setting the `rollingUpdateStrategy` field to `Manual` as shown below:
+
+[source,yaml]
+.cluster.yaml
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: test
+spec:
+  rollingUpdateStrategy: Manual
+  image: my-app:1.0.0
+----
+
+[CAUTION]
+====
+When using the manual upgrade strategy, the customer is in full control of the upgrade process.
+The Operator will not do anything. It is important that the customer understands how to perform
+a safe rolling upgrade if no data loss is desired.
+==== diff --git a/test/certification/suite_test.go b/test/certification/suite_test.go index 406817096..de0184d34 100644 --- a/test/certification/suite_test.go +++ b/test/certification/suite_test.go @@ -21,7 +21,7 @@ func TestMain(m *testing.M) { var err error // Create a new TestContext - DO NOT start any controllers. - if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/coherence_compatibility/suite_test.go b/test/coherence_compatibility/suite_test.go index f6826d760..3db945270 100644 --- a/test/coherence_compatibility/suite_test.go +++ b/test/coherence_compatibility/suite_test.go @@ -23,7 +23,7 @@ func TestMain(m *testing.M) { var err error // Create a new TestContext - DO NOT start any controllers. - if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/clients/suite_test.go b/test/e2e/clients/suite_test.go index 55356fc19..6ecfcff20 100644 --- a/test/e2e/clients/suite_test.go +++ b/test/e2e/clients/suite_test.go @@ -23,7 +23,7 @@ func TestMain(m *testing.M) { helper.EnsureTestEnvVars() - if testContext, err = helper.NewContext(true); err != nil { + if testContext, err = helper.NewStartedContext(true); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/compatibility/suite_test.go b/test/e2e/compatibility/suite_test.go index 28a91b8c4..26d5a7afe 100644 --- a/test/e2e/compatibility/suite_test.go +++ b/test/e2e/compatibility/suite_test.go @@ -20,7 +20,7 @@ func TestMain(m *testing.M) { var err error // Create a new TestContext - DO NOT start any controllers. - if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/helm/suite_test.go b/test/e2e/helm/suite_test.go index 7d0c63316..e2817f191 100644 --- a/test/e2e/helm/suite_test.go +++ b/test/e2e/helm/suite_test.go @@ -20,7 +20,7 @@ func TestMain(m *testing.M) { var err error // Create a new TestContext - DO NOT start any controllers. - if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/helper/test_context.go b/test/e2e/helper/test_context.go index 302c1eff4..8e0098c08 100644 --- a/test/e2e/helper/test_context.go +++ b/test/e2e/helper/test_context.go @@ -23,7 +23,6 @@ import ( v1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" - "k8s.io/utils/ptr" "net/http" "os" ctrl "sigs.k8s.io/controller-runtime" @@ -162,27 +161,17 @@ func (in *TestContext) Start() error { return err } -// NewContext creates a new TestContext. -func NewContext(startController bool, watchNamespaces ...string) (TestContext, error) { - var err error - var tc TestContext - - testEnv := &envtest.Environment{ - // We need a real cluster for these tests - UseExistingCluster: ptr.To(true), - AttachControlPlaneOutput: true, - CRDs: []*v1.CustomResourceDefinition{}, - } - - tc, err = NewContextWithTestEnv(testEnv, startController, watchNamespaces...) - if err != nil { - return TestContext{}, err +// NewStartedContext creates a new TestContext starts it. 
+func NewStartedContext(startController bool, watchNamespaces ...string) (TestContext, error) { + ctx, err := NewContext(startController, watchNamespaces...) + if err == nil { + err = ctx.Start() } - return tc, err + return ctx, err } -// NewContextWithTestEnv creates a new TestContext. -func NewContextWithTestEnv(testEnv *envtest.Environment, startController bool, watchNamespaces ...string) (TestContext, error) { +// NewContext creates a new TestContext. +func NewContext(startController bool, watchNamespaces ...string) (TestContext, error) { testLogger := zap.New(zap.UseDevMode(true), zap.WriteTo(os.Stdout)) logf.SetLogger(testLogger) @@ -200,12 +189,18 @@ func NewContextWithTestEnv(testEnv *envtest.Environment, startController bool, w return TestContext{}, err } + // We need a real cluster for these tests + useCluster := true + testLogger.WithName("test").Info("bootstrapping test environment") + testEnv := &envtest.Environment{ + UseExistingCluster: &useCluster, + AttachControlPlaneOutput: true, + CRDs: []*v1.CustomResourceDefinition{}, + } var err error - k8sCfg := ctrl.GetConfigOrDie() - err = corev1.AddToScheme(scheme.Scheme) if err != nil { return TestContext{}, err @@ -215,6 +210,11 @@ func NewContextWithTestEnv(testEnv *envtest.Environment, startController bool, w return TestContext{}, err } + k8sCfg, err := testEnv.Start() + if err != nil { + return TestContext{}, err + } + cl, err := client.New(k8sCfg, client.Options{Scheme: scheme.Scheme}) if err != nil { return TestContext{}, err @@ -292,6 +292,7 @@ func NewContextWithTestEnv(testEnv *envtest.Environment, startController bool, w } ep := make(map[string]func(w http.ResponseWriter, r *http.Request)) + return TestContext{ Config: k8sCfg, Client: k8sClient, diff --git a/test/e2e/local/suite_test.go b/test/e2e/local/suite_test.go index 9f4068b9c..e32dd29b7 100644 --- a/test/e2e/local/suite_test.go +++ b/test/e2e/local/suite_test.go @@ -23,7 +23,7 @@ func TestMain(m *testing.M) { helper.EnsureTestEnvVars() - if testContext, err = helper.NewContext(true); err != nil { + if testContext, err = helper.NewStartedContext(true); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/prometheus/suite_test.go b/test/e2e/prometheus/suite_test.go index 72b1d6ace..7c394cfb9 100644 --- a/test/e2e/prometheus/suite_test.go +++ b/test/e2e/prometheus/suite_test.go @@ -21,7 +21,7 @@ func TestMain(m *testing.M) { var err error // Create a new TestContext - DO NOT start any controllers. - if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } diff --git a/test/e2e/remote/e2e_suite_test.go b/test/e2e/remote/e2e_suite_test.go index 12bca5ce4..6699ad5be 100644 --- a/test/e2e/remote/e2e_suite_test.go +++ b/test/e2e/remote/e2e_suite_test.go @@ -26,7 +26,7 @@ func TestMain(m *testing.M) { helper.EnsureTestEnvVars() // Create a new TestContext - DO NOT start any controllers. 
- if testContext, err = helper.NewContext(false); err != nil { + if testContext, err = helper.NewStartedContext(false); err != nil { fmt.Printf("Error: %+v", err) os.Exit(1) } From 4efb5ed4f0e0b48b8ee80f86d7d3243dcbd308b3 Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Fri, 20 Sep 2024 17:35:41 +0300 Subject: [PATCH 3/9] Fix copyrights --- test/certification/suite_test.go | 2 +- test/coherence_compatibility/suite_test.go | 2 +- test/e2e/clients/suite_test.go | 2 +- test/e2e/compatibility/suite_test.go | 2 +- test/e2e/helm/suite_test.go | 2 +- test/e2e/local/suite_test.go | 2 +- test/e2e/prometheus/suite_test.go | 2 +- test/e2e/remote/e2e_suite_test.go | 2 +- 8 files changed, 8 insertions(+), 8 deletions(-) diff --git a/test/certification/suite_test.go b/test/certification/suite_test.go index de0184d34..d4c826395 100644 --- a/test/certification/suite_test.go +++ b/test/certification/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/coherence_compatibility/suite_test.go b/test/coherence_compatibility/suite_test.go index 3db945270..0631ab3a6 100644 --- a/test/coherence_compatibility/suite_test.go +++ b/test/coherence_compatibility/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, 2023, Oracle and/or its affiliates. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/e2e/clients/suite_test.go b/test/e2e/clients/suite_test.go index 6ecfcff20..d32d12e1d 100644 --- a/test/e2e/clients/suite_test.go +++ b/test/e2e/clients/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, Oracle and/or its affiliates. + * Copyright (c) 2021, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/e2e/compatibility/suite_test.go b/test/e2e/compatibility/suite_test.go index 26d5a7afe..dead82013 100644 --- a/test/e2e/compatibility/suite_test.go +++ b/test/e2e/compatibility/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/e2e/helm/suite_test.go b/test/e2e/helm/suite_test.go index e2817f191..8d436dafc 100644 --- a/test/e2e/helm/suite_test.go +++ b/test/e2e/helm/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/e2e/local/suite_test.go b/test/e2e/local/suite_test.go index e32dd29b7..2e5420a9c 100644 --- a/test/e2e/local/suite_test.go +++ b/test/e2e/local/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2021, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. 
*/ diff --git a/test/e2e/prometheus/suite_test.go b/test/e2e/prometheus/suite_test.go index 7c394cfb9..480d1d2f3 100644 --- a/test/e2e/prometheus/suite_test.go +++ b/test/e2e/prometheus/suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ diff --git a/test/e2e/remote/e2e_suite_test.go b/test/e2e/remote/e2e_suite_test.go index 6699ad5be..b28231be6 100644 --- a/test/e2e/remote/e2e_suite_test.go +++ b/test/e2e/remote/e2e_suite_test.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2023, Oracle and/or its affiliates. + * Copyright (c) 2019, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ From 1f212843a4bc4893cc0f5dd206d2ef13a7a06128 Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Fri, 20 Sep 2024 20:09:53 +0300 Subject: [PATCH 4/9] Fix builds --- .github/workflows/build.yaml | 7 ---- .github/workflows/coherence-matrix.yaml | 7 ---- .github/workflows/compatibility-tests.yaml | 7 ---- .github/workflows/doc-check.yaml | 7 ---- .github/workflows/istio-tests.yaml | 7 ---- .github/workflows/k8s-matrix.yaml | 7 ---- .github/workflows/minikube-matrix.yaml | 7 ---- .github/workflows/prometheus-tests.yaml | 7 ---- .github/workflows/release.yml | 29 +++++++++++---- .github/workflows/tanzu-tests.yaml | 7 ---- .github/workflows/trivy.yaml | 7 ---- Makefile | 41 +++++++++++++++++++--- 12 files changed, 58 insertions(+), 82 deletions(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index ee9ce0844..2b9bd6c39 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -88,13 +88,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/coherence-matrix.yaml b/.github/workflows/coherence-matrix.yaml index 75574a527..f9091e3fe 100644 --- a/.github/workflows/coherence-matrix.yaml +++ b/.github/workflows/coherence-matrix.yaml @@ -164,13 +164,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/compatibility-tests.yaml b/.github/workflows/compatibility-tests.yaml index cb28a8aa8..7ff535e41 100644 --- a/.github/workflows/compatibility-tests.yaml +++ b/.github/workflows/compatibility-tests.yaml @@ -147,13 +147,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/doc-check.yaml b/.github/workflows/doc-check.yaml index 1501b0ede..8a4da668d 100644 --- a/.github/workflows/doc-check.yaml +++ b/.github/workflows/doc-check.yaml @@ -56,13 +56,6 @@ jobs: key: 
${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/istio-tests.yaml b/.github/workflows/istio-tests.yaml index 7d96ea5b6..b7fe53d3e 100644 --- a/.github/workflows/istio-tests.yaml +++ b/.github/workflows/istio-tests.yaml @@ -94,13 +94,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/k8s-matrix.yaml b/.github/workflows/k8s-matrix.yaml index 4d78cedd5..f6f5ce7b0 100644 --- a/.github/workflows/k8s-matrix.yaml +++ b/.github/workflows/k8s-matrix.yaml @@ -130,13 +130,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/minikube-matrix.yaml b/.github/workflows/minikube-matrix.yaml index 96043159c..4deb588c3 100644 --- a/.github/workflows/minikube-matrix.yaml +++ b/.github/workflows/minikube-matrix.yaml @@ -110,13 +110,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/prometheus-tests.yaml b/.github/workflows/prometheus-tests.yaml index fb90252fd..603622627 100644 --- a/.github/workflows/prometheus-tests.yaml +++ b/.github/workflows/prometheus-tests.yaml @@ -88,13 +88,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/release.yml b/.github/workflows/release.yml index b3c21d0d3..5113fc3e5 100644 --- a/.github/workflows/release.yml +++ b/.github/workflows/release.yml @@ -49,13 +49,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Start KinD Cluster shell: bash run: | @@ -124,6 +117,28 @@ jobs: asset_name: coherence.oracle.com_coherence_small.yaml asset_content_type: text/plain + - name: Upload Release Job CRD + id: upload-release-crd + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ github.event.release.upload_url }} + asset_path: /tmp/coherence-operator/_output/manifests/crd/coherencejob.oracle.com_coherence.yaml + asset_name: 
coherence.oracle.com_coherence.yaml + asset_content_type: text/plain + + - name: Upload Release Small Job CRD + id: upload-release-small-crd + uses: actions/upload-release-asset@v1 + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + with: + upload_url: ${{ github.event.release.upload_url }} + asset_path: /tmp/coherence-operator/_output/manifests/crd-small/coherencejob.oracle.com_coherence.yaml + asset_name: coherence.oracle.com_coherence_small.yaml + asset_content_type: text/plain + - name: Upload Release Dashboards id: upload-release-dashboards uses: actions/upload-release-asset@v1 diff --git a/.github/workflows/tanzu-tests.yaml b/.github/workflows/tanzu-tests.yaml index 5dda6de5d..9683a4a68 100644 --- a/.github/workflows/tanzu-tests.yaml +++ b/.github/workflows/tanzu-tests.yaml @@ -92,13 +92,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/.github/workflows/trivy.yaml b/.github/workflows/trivy.yaml index a97560065..33dd8ef60 100644 --- a/.github/workflows/trivy.yaml +++ b/.github/workflows/trivy.yaml @@ -78,13 +78,6 @@ jobs: key: ${{ runner.os }}-m2-${{ hashFiles('**/pom.xml') }} restore-keys: ${{ runner.os }}-m2 - - name: Cache Tools - uses: actions/cache@v4 - with: - path: build/tools - key: ${{ runner.os }}-build-tools-${{ hashFiles('**/Makefile') }} - restore-keys: ${{ runner.os }}-build-tools - - name: Edit DNS Resolve shell: bash run: | diff --git a/Makefile b/Makefile index 591cea62d..4d77cfc45 100644 --- a/Makefile +++ b/Makefile @@ -1484,12 +1484,12 @@ tail-logs: ## Tail the Coherence Operator Pod logs (with follow) kubectl -n $(OPERATOR_NAMESPACE) logs $(POD) -c manager -f -$(BUILD_MANIFESTS_PKG): $(TOOLS_BIN)/kustomize +$(BUILD_MANIFESTS_PKG): $(TOOLS_BIN)/kustomize $(TOOLS_BIN)/yq rm -rf $(BUILD_MANIFESTS) || true mkdir -p $(BUILD_MANIFESTS)/crd - $(KUSTOMIZE) build config/crd > $(BUILD_MANIFESTS)/crd/coherence.oracle.com_coherence.yaml + $(KUSTOMIZE) build config/crd > $(BUILD_MANIFESTS)/crd/temp.yaml mkdir -p $(BUILD_MANIFESTS)/crd-small - $(KUSTOMIZE) build config/crd-small > $(BUILD_MANIFESTS)/crd-small/coherence.oracle.com_coherence.yaml + $(KUSTOMIZE) build config/crd-small > $(BUILD_MANIFESTS)/crd-small/temp.yaml cp -R config/default/ $(BUILD_MANIFESTS)/default cp -R config/manager/ $(BUILD_MANIFESTS)/manager cp -R config/rbac/ $(BUILD_MANIFESTS)/rbac @@ -1498,6 +1498,10 @@ $(BUILD_MANIFESTS_PKG): $(TOOLS_BIN)/kustomize cp config/namespace/namespace.yaml $(BUILD_OUTPUT)/coherence-operator.yaml $(KUSTOMIZE) build $(BUILD_DEPLOY)/default >> $(BUILD_OUTPUT)/coherence-operator.yaml $(SED) -e 's/name: coherence-operator-env-vars-.*/name: coherence-operator-env-vars/g' $(BUILD_OUTPUT)/coherence-operator.yaml + cd $(BUILD_MANIFESTS)/crd && $(TOOLS_BIN)/yq --no-doc -s '.metadata.name + ".yaml"' temp.yaml + rm $(BUILD_MANIFESTS)/crd/temp.yaml + cd $(BUILD_MANIFESTS)/crd-small && $(TOOLS_BIN)/yq --no-doc -s '.metadata.name + ".yaml"' temp.yaml + rm $(BUILD_MANIFESTS)/crd-small/temp.yaml # ---------------------------------------------------------------------------------------------------------------------- # Delete and re-create the test namespace @@ -1741,11 +1745,15 @@ k3d-load-coherence: $(TOOLS_BIN)/k3d ## Load the Coherence images into the k3d .PHONY: k3d-load-all k3d-load-all: 
$(TOOLS_BIN)/k3d k3d-load-operator k3d-load-coherence ## Load all the test images into the k3d cluster - +.PHONY: k3d-get k3d-get: $(TOOLS_BIN)/k3d ## Install k3d +K3D_PATH = ${PATH} $(TOOLS_BIN)/k3d: - export K3D_INSTALL_DIR=$(TOOLS_BIN) && export USE_SUDO=false && curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash + export K3D_INSTALL_DIR=$(TOOLS_BIN) \ + && export USE_SUDO=false \ + && export PATH="$(TOOLS_BIN):$(K3D_PATH)" \ + && curl -s https://raw.githubusercontent.com/k3d-io/k3d/main/install.sh | bash # ====================================================================================================================== # Targets related to running Minikube @@ -1801,6 +1809,29 @@ else rm minikube-linux-amd64 endif +# ---------------------------------------------------------------------------------------------------------------------- +# Install yq +# ---------------------------------------------------------------------------------------------------------------------- +YQ = $(TOOLS_BIN)/yq +YQ_VERSION = v4.44.3 + +.PHONY: yq-install +yq-install: $(TOOLS_BIN)/yq ## Install yq (defaults to the latest version, can be changed by setting YQ_VERSION) + $(YQ) version + +$(TOOLS_BIN)/yq: + mkdir -p $(TOOLS_BIN) || true +ifeq (Darwin, $(UNAME_S)) +ifeq (x86_64, $(UNAME_M)) + curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_darwin_amd64 -o $(TOOLS_BIN)/yq +else + curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_darwin_arm64 -o $(TOOLS_BIN)/yq +endif +else + curl -L https://github.com/mikefarah/yq/releases/download/${YQ_VERSION}/yq_linux_amd64 -o $(TOOLS_BIN)/yq +endif + chmod +x $(TOOLS_BIN)/yq + # ====================================================================================================================== # Kubernetes Cert Manager targets # ====================================================================================================================== From e2a7503f2c71fda9762415cf8aef2fceb281b01d Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Sat, 21 Sep 2024 08:04:04 +0300 Subject: [PATCH 5/9] Fix k3d build --- Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 4d77cfc45..4f1f871ba 100644 --- a/Makefile +++ b/Makefile @@ -1736,11 +1736,11 @@ k3d-stop: $(TOOLS_BIN)/k3d ## Stop a default k3d cluster .PHONY: k3d-load-operator k3d-load-operator: $(TOOLS_BIN)/k3d ## Load the Operator images into the k3d cluster - k3d image import $(OPERATOR_IMAGE) -c $(K3D_CLUSTER) + $(TOOLS_BIN)/k3d image import $(OPERATOR_IMAGE) -c $(K3D_CLUSTER) .PHONY: k3d-load-coherence k3d-load-coherence: $(TOOLS_BIN)/k3d ## Load the Coherence images into the k3d cluster - k3d image import $(COHERENCE_IMAGE) -c $(K3D_CLUSTER) + $(TOOLS_BIN)/k3d image import $(COHERENCE_IMAGE) -c $(K3D_CLUSTER) .PHONY: k3d-load-all k3d-load-all: $(TOOLS_BIN)/k3d k3d-load-operator k3d-load-coherence ## Load all the test images into the k3d cluster From 5a5b9c634871cb18439ee396a04f3dcd43a2f956 Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Mon, 23 Sep 2024 11:40:10 +0300 Subject: [PATCH 6/9] Enhance site and rack configuration --- api/v1/coherenceresourcespec_types.go | 4 +- api/v1/constants.go | 4 +- docs/coherence/021_member_identity.adoc | 71 ++++++++- pkg/runner/cmd_server.go | 2 +- pkg/runner/run_details.go | 7 +- pkg/runner/runner.go | 167 ++++++++++++-------- pkg/runner/runner_coherence_test.go | 201 ++++++++++++++++++++++++ pkg/runner/runner_test.go | 16 +- 
pkg/runner/test-rack.txt | 2 + pkg/runner/test-site.txt | 2 + 10 files changed, 403 insertions(+), 73 deletions(-) create mode 100644 pkg/runner/test-rack.txt create mode 100644 pkg/runner/test-site.txt diff --git a/api/v1/coherenceresourcespec_types.go b/api/v1/coherenceresourcespec_types.go index 502f2a80c..bbbb24d6b 100644 --- a/api/v1/coherenceresourcespec_types.go +++ b/api/v1/coherenceresourcespec_types.go @@ -793,7 +793,6 @@ func (in *CoherenceResourceSpec) CreateCoherenceContainer(deployment CoherenceRe Name: ContainerNameCoherence, Image: cohImage, Command: cmd, - Env: in.Env, Ports: []corev1.ContainerPort{ { Name: PortNameCoherence, @@ -826,7 +825,8 @@ func (in *CoherenceResourceSpec) CreateCoherenceContainer(deployment CoherenceRe c.ImagePullPolicy = *in.ImagePullPolicy } - c.Env = append(c.Env, in.CreateDefaultEnv(deployment)...) + c.Env = in.CreateDefaultEnv(deployment) + c.Env = append(c.Env, in.Env...) forceExit := deployment.IsForceExit() if forceExit { diff --git a/api/v1/constants.go b/api/v1/constants.go index 074793c58..e1d2f7d74 100644 --- a/api/v1/constants.go +++ b/api/v1/constants.go @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, 2023, Oracle and/or its affiliates. + * Copyright (c) 2020, 2024, Oracle and/or its affiliates. * Licensed under the Universal Permissive License v 1.0 as shown at * http://oss.oracle.com/licenses/upl. */ @@ -193,7 +193,9 @@ const ( EnvVarCohPodUID = "COH_POD_UID" EnvVarCohSkipSite = "COH_SKIP_SITE" EnvVarCohSite = "COH_SITE_INFO_LOCATION" + EnvVarCoherenceSite = "COHERENCE_SITE" EnvVarCohRack = "COH_RACK_INFO_LOCATION" + EnvVarCoherenceRack = "COHERENCE_RACK" EnvVarCohRole = "COH_ROLE" EnvVarCohUtilDir = "COH_UTIL_DIR" EnvVarCohUtilLibDir = "COH_UTIL_LIB_DIR" diff --git a/docs/coherence/021_member_identity.adoc b/docs/coherence/021_member_identity.adoc index fb87a5cf2..044124c77 100644 --- a/docs/coherence/021_member_identity.adoc +++ b/docs/coherence/021_member_identity.adoc @@ -1,6 +1,6 @@ /////////////////////////////////////////////////////////////////////////////// - Copyright (c) 2021, Oracle and/or its affiliates. + Copyright (c) 2021, 2024, Oracle and/or its affiliates. Licensed under the Universal Permissive License v 1.0 as shown at http://oss.oracle.com/licenses/upl. @@ -35,7 +35,32 @@ Ideally this would be on a member with a different site; failing that, a differe You should not usually need to change the default values applied for the `member` and `machine` names, but you may need to change the values used for the site, or rack. The labels used for the `site` and `rack` are standard k8s labels but -the k8s cluster being used may not have these labels set +the k8s cluster being used may not have these labels set. + +[NOTE] +==== +If the Coherence site is specified but no value is set for rack, the Operator will configure the +rack value to be the same as the site. Coherence will not set the site if any of the identity values +below it are missing (i.e. rack, machine, member). +==== + +[IMPORTANT] +==== +*Maintaining Site and Rack Safety* + +The details below show alternate approaches to set the Coherence site and rack identity. +If site and rack are set to a fixed value for the deployment, then all Coherence members in that +deployment will have the same value. This means it would be impossible for Coherence to become +site or rack safe. + +A work-around for this would be to use multiple Coherence deployments configured with the same cluster name +and each having different site and rack values. 
+For example, if a Kubernetes cluster has two fault domains, two separate Coherence resources could
+be created with different site and rack values. Each Coherence resource would then have different
+Pod scheduling rules, so that each Coherence deployment is targeted at only one fault domain.
+All the Pods in the two Coherence deployments would form a single Cluster, and because there will be Pods with
+different site and rack values, Coherence will be able to reach site safety.
+====

 === Apply Node Labels

@@ -47,6 +72,48 @@ For example the command below labels the node in Docker dDesktop on MacOS to "tw
 kubectl label node docker-desktop topology.kubernetes.io/zone=twighlight-zone
 ----

+=== Specify Site and Rack Using Environment Variables
+
+The site and rack values can be set by setting the `COHERENCE_SITE` and `COHERENCE_RACK` environment variables.
+
+If these values are set then the Operator will not set the `coherence.site` or `coherence.rack` system properties
+as Coherence will read the environment variable values.
+
+For example, the yaml below will set the site to `test-site` and the rack to `test-rack`:
+
+[source,yaml]
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: my-cluster
+spec:
+  env:
+    - name: COHERENCE_SITE
+      value: test-site
+    - name: COHERENCE_RACK
+      value: test-rack
+----
+
+Site and rack environment variables will be expanded if they reference other variables.
+
+For example, the yaml below will set the site to the value of the `MY_SITE` environment variable,
+and rack to the value of the `MY_RACK` environment variable.
+
+[source,yaml]
+----
+apiVersion: coherence.oracle.com/v1
+kind: Coherence
+metadata:
+  name: my-cluster
+spec:
+  env:
+    - name: COHERENCE_SITE
+      value: "${MY_SITE}"
+    - name: COHERENCE_RACK
+      value: "${MY_RACK}"
+----
+
 === Specify Site and Rack Using System Properties

 The site and rack values can be specified as system properties as part of the Coherence deployment yaml.
diff --git a/pkg/runner/cmd_server.go b/pkg/runner/cmd_server.go
index 083d3f6a5..fece45e53 100644
--- a/pkg/runner/cmd_server.go
+++ b/pkg/runner/cmd_server.go
@@ -48,7 +48,7 @@ func server(details *RunDetails, _ *cobra.Command) {
 	fi, err := os.Stat(jibMainClassFileName)
 	mainCls := ""
 	if err == nil && (fi.Size() != 0) {
-		mainCls = readFirstLineFromFile(jibMainClassFileName, fi)
+		mainCls, _ = readFirstLineFromFile(fi.Name())
 	}
 	if !found && (len(mainCls) != 0) {
 		mc = mainCls
diff --git a/pkg/runner/run_details.go b/pkg/runner/run_details.go
index 86d05bd37..70e97e401 100644
--- a/pkg/runner/run_details.go
+++ b/pkg/runner/run_details.go
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, 2023, Oracle and/or its affiliates.
+ * Copyright (c) 2021, 2024, Oracle and/or its affiliates.
  * Licensed under the Universal Permissive License v 1.0 as shown at
  * http://oss.oracle.com/licenses/upl.
*/ @@ -17,7 +17,8 @@ import ( ) func NewRunDetails(v *viper.Viper) *RunDetails { - skipSite := v.GetString(v1.EnvVarCohSkipSite) + skipSiteVar := v.GetString(v1.EnvVarCohSkipSite) + skipSite := strings.ToLower(skipSiteVar) != "true" details := &RunDetails{ env: v, @@ -27,7 +28,7 @@ func NewRunDetails(v *viper.Viper) *RunDetails { AppType: strings.ToLower(v.GetString(v1.EnvVarAppType)), Dir: v.GetString(v1.EnvVarCohAppDir), MainClass: DefaultMain, - GetSite: strings.ToLower(skipSite) != "true", + GetSite: skipSite, } // add any Classpath items diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 71c842f94..66565d900 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -322,7 +322,7 @@ func createCommand(details *RunDetails) (string, *exec.Cmd, error) { appDir := details.getenvOrDefault(v1.EnvVarCohAppDir, "/app") fi, e := os.Stat(appDir + "/jib-classpath-file") if e == nil && (fi.Size() != 0) { - clsPath := readFirstLineFromFile(appDir+"/jib-classpath-file", fi) + clsPath, _ := readFirstLineFromFile(fi.Name()) if len(clsPath) != 0 { details.addClasspath(clsPath) } @@ -495,26 +495,26 @@ func createCommand(details *RunDetails) (string, *exec.Cmd, error) { } } - max := details.Getenv(v1.EnvVarJvmMaxRAMPercentage) - if max != "" { - q, err := resource.ParseQuantity(max) + maxRam := details.Getenv(v1.EnvVarJvmMaxRAMPercentage) + if maxRam != "" { + q, err := resource.ParseQuantity(maxRam) if err == nil { d := q.AsDec() details.addArg("-XX:MaxRAMPercentage=" + d.String()) } else { - log.Info("ERROR: MaxRAMPercentage is not a valid resource.Quantity", "Value", max, "Error", err.Error()) + log.Info("ERROR: MaxRAMPercentage is not a valid resource.Quantity", "Value", maxRam, "Error", err.Error()) os.Exit(1) } } - min := details.Getenv(v1.EnvVarJvmMinRAMPercentage) - if min != "" { - q, err := resource.ParseQuantity(min) + minRam := details.Getenv(v1.EnvVarJvmMinRAMPercentage) + if minRam != "" { + q, err := resource.ParseQuantity(minRam) if err == nil { d := q.AsDec() details.addArg("-XX:MinRAMPercentage=" + d.String()) } else { - log.Info("ERROR: MinRAMPercentage is not a valid resource.Quantity", "Value", min, "Error", err.Error()) + log.Info("ERROR: MinRAMPercentage is not a valid resource.Quantity", "Value", minRam, "Error", err.Error()) os.Exit(1) } } @@ -623,22 +623,25 @@ func createJavaCommand(javaCmd string, details *RunDetails) (*exec.Cmd, error) { return _createJavaCommand(javaCmd, details, args) } -func readFirstLineFromFile(fqfn string, fi os.FileInfo) string { - log.Info(fmt.Sprintf("%s size=%d", fi.Name(), fi.Size())) - file, _ := os.Open(fqfn) - scanner := bufio.NewScanner(file) - scanner.Split(bufio.ScanLines) - var text []string - for scanner.Scan() { - text = append(text, scanner.Text()) +func readFirstLineFromFile(path string) (string, error) { + st, err := os.Stat(maybeStripFileScheme(path)) + if err != nil { + return "", err } - closeFile(file, log) - - if len(text) == 0 { - return "" + if st.IsDir() { + return "", fmt.Errorf("%s is a directory", path) + } + file, err := os.Open(st.Name()) + if err != nil { + return "", err } - log.Info(fmt.Sprintf("First Line of the %s:\n%s\n", fi.Name(), text[0])) - return text[0] + defer closeFile(file, log) + rd := bufio.NewReader(file) + data, _, err := rd.ReadLine() + if err != nil { + return "", err + } + return string(data), nil } func createSpringBootCommand(javaCmd string, details *RunDetails) (*exec.Cmd, error) { @@ -743,64 +746,102 @@ func createGraalCommand(details *RunDetails) (*exec.Cmd, error) { // Set the 
Coherence site and rack values func configureSiteAndRack(details *RunDetails) { - log.Info("Configuring Coherence site and rack") + var err error if !details.GetSite { return } - var site string - - siteLocation := details.ExpandEnv(details.Getenv(v1.EnvVarCohSite)) - log.Info("Configuring Coherence site", "url", siteLocation) - if siteLocation != "" { - switch { - case strings.ToLower(siteLocation) == "http://": - site = "" - case strings.HasPrefix(siteLocation, "http://"): - // do http get - site = httpGetWithBackoff(siteLocation, details) - default: - st, err := os.Stat(siteLocation) - if err == nil && !st.IsDir() { - d, err := os.ReadFile(siteLocation) + log.Info("Configuring Coherence site and rack") + + site := details.Getenv(v1.EnvVarCoherenceSite) + if site == "" { + siteLocation := details.ExpandEnv(details.Getenv(v1.EnvVarCohSite)) + log.Info("Configuring Coherence site", "url", siteLocation) + if siteLocation != "" { + switch { + case strings.ToLower(siteLocation) == "http://": + site = "" + case strings.HasPrefix(siteLocation, "http://"): + // do http get + site = httpGetWithBackoff(siteLocation, details) + case strings.HasPrefix(siteLocation, "https://"): + // https not supported + log.Info("Cannot read site URI, https is not supported", "URI", siteLocation) + default: + site, err = readFirstLineFromFile(siteLocation) if err != nil { - site = string(d) + log.Error(err, "error reading site info", "Location", siteLocation) } } } - } - if site != "" { - details.addArg("-Dcoherence.site=" + site) + if site != "" { + details.addArg("-Dcoherence.site=" + site) + } + } else { + expanded := details.ExpandEnv(site) + if expanded != site { + log.Info("Coherence site property set from expanded "+v1.EnvVarCoherenceSite+" environment variable", v1.EnvVarCoherenceSite, site, "Site", expanded) + site = expanded + if strings.TrimSpace(site) != "" { + details.addArg("-Dcoherence.site=" + site) + } + } else { + log.Info("Coherence site property not set as "+v1.EnvVarCoherenceSite+" environment variable is set", "Site", site) + } } - var rack string - - rackLocation := details.ExpandEnv(details.Getenv(v1.EnvVarCohRack)) - log.Info("Configuring Coherence rack", "url", rackLocation) - if rackLocation != "" { - switch { - case strings.ToLower(rackLocation) == "http://": - rack = "" - case strings.HasPrefix(rackLocation, "http://"): - // do http get - rack = httpGetWithBackoff(rackLocation, details) - default: - st, err := os.Stat(rackLocation) - if err == nil && !st.IsDir() { - d, err := os.ReadFile(rackLocation) + rack := details.Getenv(v1.EnvVarCoherenceRack) + if rack == "" { + rackLocation := details.ExpandEnv(details.Getenv(v1.EnvVarCohRack)) + log.Info("Configuring Coherence rack", "url", rackLocation) + if rackLocation != "" { + switch { + case strings.ToLower(rackLocation) == "http://": + rack = "" + case strings.HasPrefix(rackLocation, "http://"): + // do http get + rack = httpGetWithBackoff(rackLocation, details) + case strings.HasPrefix(rackLocation, "https://"): + // https not supported + log.Info("Cannot read rack URI, https is not supported", "URI", rackLocation) + default: + rack, err = readFirstLineFromFile(rackLocation) if err != nil { - rack = string(d) + log.Error(err, "error reading site info", "Location", rackLocation) } } } + + if rack != "" { + details.addArg("-Dcoherence.rack=" + rack) + } else if site != "" { + details.addArg("-Dcoherence.rack=" + site) + } + } else { + expanded := details.ExpandEnv(rack) + if expanded != rack { + log.Info("Coherence site property set from 
expanded "+v1.EnvVarCoherenceRack+" environment variable", v1.EnvVarCoherenceRack, rack, "Rack", expanded) + rack = expanded + if len(rack) == 0 { + // if the expanded COHERENCE_RACK value is blank then set rack to site as + // the rack cannot be blank if site is set + rack = site + } + if strings.TrimSpace(rack) != "" { + details.addArg("-Dcoherence.rack=" + rack) + } + } else { + log.Info("Coherence rack property not set as "+v1.EnvVarCoherenceRack+" environment variable is set", "Rack", rack) + } } +} - if rack != "" { - details.addArg("-Dcoherence.rack=" + rack) - } else if site != "" { - details.addArg("-Dcoherence.rack=" + site) +func maybeStripFileScheme(uri string) string { + if strings.HasPrefix(uri, "file://") { + return strings.TrimPrefix(uri, "file://") } + return uri } // httpGetWithBackoff does a http get for the specified url with retry back-off for errors. diff --git a/pkg/runner/runner_coherence_test.go b/pkg/runner/runner_coherence_test.go index 571069883..8a2766204 100644 --- a/pkg/runner/runner_coherence_test.go +++ b/pkg/runner/runner_coherence_test.go @@ -9,6 +9,7 @@ package runner import ( . "github.com/onsi/gomega" coh "github.com/oracle/coherence-operator/api/v1" + corev1 "k8s.io/api/core/v1" "k8s.io/apimachinery/pkg/api/resource" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/utils/ptr" @@ -475,3 +476,203 @@ func TestCoherenceDefaultIpMonitor(t *testing.T) { g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) } + +func TestCoherenceSiteEAndRackEnvVarSet(t *testing.T) { + g := NewGomegaWithT(t) + + d := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Env: []corev1.EnvVar{ + { + Name: coh.EnvVarCoherenceSite, + Value: "test-site", + }, + { + Name: coh.EnvVarCoherenceRack, + Value: "test-rack", + }, + }, + }, + }, + } + + args := []string{"server", "--dry-run"} + env := EnvVarsFromDeploymentWithSkipSite(d, false) + + expectedCommand := GetJavaCommand() + // site and rack system properties should not be set + expectedArgs := RemoveArg(GetMinimalExpectedArgs(), "-Dcoherence.site") + expectedArgs = RemoveArg(expectedArgs, "-Dcoherence.rack") + + e, err := ExecuteWithArgsAndNewViper(env, args) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(e).NotTo(BeNil()) + g.Expect(e.OsCmd).NotTo(BeNil()) + + g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir)) + g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) + g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) +} + +func TestCoherenceSiteAndRackEnvVarSetFromOtherEnvVar(t *testing.T) { + g := NewGomegaWithT(t) + + d := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Env: []corev1.EnvVar{ + { + Name: coh.EnvVarCoherenceSite, + Value: "${TEST_SITE_VAR}", + }, + { + Name: "TEST_SITE_VAR", + Value: "test-site", + }, + { + Name: coh.EnvVarCoherenceRack, + Value: "${TEST_RACK_VAR}", + }, + { + Name: "TEST_RACK_VAR", + Value: "test-rack", + }, + }, + }, + }, + } + + args := []string{"server", "--dry-run"} + env := EnvVarsFromDeploymentWithSkipSite(d, false) + + expectedCommand := GetJavaCommand() + expectedArgs := append(GetMinimalExpectedArgs(), "-Dcoherence.site=test-site") + expectedArgs = append(expectedArgs, "-Dcoherence.rack=test-rack") + + e, err := ExecuteWithArgsAndNewViper(env, args) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(e).NotTo(BeNil()) + 
g.Expect(e.OsCmd).NotTo(BeNil()) + + g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir)) + g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) + g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) +} + +func TestCoherenceSiteAndRackEnvVarSetFromOtherEnvVarWhenRackIsMissing(t *testing.T) { + g := NewGomegaWithT(t) + + d := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Env: []corev1.EnvVar{ + { + Name: coh.EnvVarCoherenceSite, + Value: "${TEST_SITE_VAR}", + }, + { + Name: "TEST_SITE_VAR", + Value: "test-site", + }, + { + Name: coh.EnvVarCoherenceRack, + Value: "${TEST_RACK_VAR}", + }, + }, + }, + }, + } + + args := []string{"server", "--dry-run"} + env := EnvVarsFromDeploymentWithSkipSite(d, false) + + expectedCommand := GetJavaCommand() + expectedArgs := append(GetMinimalExpectedArgs(), "-Dcoherence.site=test-site") + // rack should be set to site + expectedArgs = append(expectedArgs, "-Dcoherence.rack=test-site") + + e, err := ExecuteWithArgsAndNewViper(env, args) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(e).NotTo(BeNil()) + g.Expect(e.OsCmd).NotTo(BeNil()) + + g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir)) + g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) + g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) +} + +func TestCoherenceSiteFromFile(t *testing.T) { + g := NewGomegaWithT(t) + + d := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Env: []corev1.EnvVar{ + { + Name: coh.EnvVarCohSite, + Value: "test-site.txt", + }, + }, + }, + }, + } + + args := []string{"server", "--dry-run"} + env := EnvVarsFromDeploymentWithSkipSite(d, false) + + expectedCommand := GetJavaCommand() + expectedArgs := append(GetMinimalExpectedArgs(), "-Dcoherence.site=site-from-file") + expectedArgs = append(expectedArgs, "-Dcoherence.rack=site-from-file") + + e, err := ExecuteWithArgsAndNewViper(env, args) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(e).NotTo(BeNil()) + g.Expect(e.OsCmd).NotTo(BeNil()) + + g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir)) + g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) + g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) +} + +func TestCoherenceSiteAndRackFromFile(t *testing.T) { + g := NewGomegaWithT(t) + + d := &coh.Coherence{ + ObjectMeta: metav1.ObjectMeta{Name: "test"}, + Spec: coh.CoherenceStatefulSetResourceSpec{ + CoherenceResourceSpec: coh.CoherenceResourceSpec{ + Env: []corev1.EnvVar{ + { + Name: coh.EnvVarCohSite, + Value: "test-site.txt", + }, + { + Name: coh.EnvVarCohRack, + Value: "test-rack.txt", + }, + }, + }, + }, + } + + args := []string{"server", "--dry-run"} + env := EnvVarsFromDeploymentWithSkipSite(d, false) + + expectedCommand := GetJavaCommand() + expectedArgs := append(GetMinimalExpectedArgs(), "-Dcoherence.site=site-from-file") + expectedArgs = append(expectedArgs, "-Dcoherence.rack=rack-from-file") + + e, err := ExecuteWithArgsAndNewViper(env, args) + g.Expect(err).NotTo(HaveOccurred()) + g.Expect(e).NotTo(BeNil()) + g.Expect(e.OsCmd).NotTo(BeNil()) + + g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir)) + g.Expect(e.OsCmd.Path).To(Equal(expectedCommand)) + g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs)) +} diff --git a/pkg/runner/runner_test.go b/pkg/runner/runner_test.go index ee9ed2420..225ee294b 100644 --- a/pkg/runner/runner_test.go +++ b/pkg/runner/runner_test.go @@ -88,6 +88,16 @@ func ReplaceArg(args []string, toReplace, 
replaceWith string) []string { return args } +func RemoveArg(args []string, toRemove string) []string { + result := make([]string, 0, len(args)) + for _, a := range args { + if a != toRemove { + result = append(result, a) + } + } + return result +} + func GetJavaArg() string { var javaCmd = "java" javaHome, found := os.LookupEnv("JAVA_HOME") @@ -185,9 +195,13 @@ func GetJavaCommand() string { } func EnvVarsFromDeployment(d *coh.Coherence) map[string]string { + return EnvVarsFromDeploymentWithSkipSite(d, true) +} + +func EnvVarsFromDeploymentWithSkipSite(d *coh.Coherence, skipSite bool) map[string]string { envVars := make(map[string]string) envVars[coh.EnvVarCohAppDir] = TestAppDir - envVars[coh.EnvVarCohSkipSite] = "true" + envVars[coh.EnvVarCohSkipSite] = fmt.Sprintf("%t", skipSite) if d.Spec.JVM == nil { d.Spec.JVM = &coh.JVMSpec{} diff --git a/pkg/runner/test-rack.txt b/pkg/runner/test-rack.txt new file mode 100644 index 000000000..3248ecd8e --- /dev/null +++ b/pkg/runner/test-rack.txt @@ -0,0 +1,2 @@ +rack-from-file +should not see this line \ No newline at end of file diff --git a/pkg/runner/test-site.txt b/pkg/runner/test-site.txt new file mode 100644 index 000000000..d84e7795a --- /dev/null +++ b/pkg/runner/test-site.txt @@ -0,0 +1,2 @@ +site-from-file +should not see this line \ No newline at end of file From bdae89ccbead483dea86b669f30e859d20832d4c Mon Sep 17 00:00:00 2001 From: Jonathan Knight Date: Mon, 23 Sep 2024 12:46:21 +0300 Subject: [PATCH 7/9] Fix broken code --- pkg/runner/cmd_server.go | 5 +++-- pkg/runner/runner.go | 8 ++++--- pkg/runner/runner_coherence_test.go | 34 ----------------------------- 3 files changed, 8 insertions(+), 39 deletions(-) diff --git a/pkg/runner/cmd_server.go b/pkg/runner/cmd_server.go index fece45e53..b00a48d3f 100644 --- a/pkg/runner/cmd_server.go +++ b/pkg/runner/cmd_server.go @@ -12,6 +12,7 @@ import ( "github.com/spf13/cobra" "k8s.io/apimachinery/pkg/api/resource" "os" + "path/filepath" "strconv" "strings" ) @@ -44,11 +45,11 @@ func server(details *RunDetails, _ *cobra.Command) { // otherwise run Coherence DefaultMain. mc, found := details.lookupEnv(v1.EnvVarAppMainClass) appDir := details.getenvOrDefault(v1.EnvVarCohAppDir, "/app") - jibMainClassFileName := appDir + "/jib-main-class-file" + jibMainClassFileName := filepath.Join(appDir, "jib-main-class-file") fi, err := os.Stat(jibMainClassFileName) mainCls := "" if err == nil && (fi.Size() != 0) { - mainCls, _ = readFirstLineFromFile(fi.Name()) + mainCls, _ = readFirstLineFromFile(jibMainClassFileName) } if !found && (len(mainCls) != 0) { mc = mainCls diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go index 66565d900..4bf49327e 100644 --- a/pkg/runner/runner.go +++ b/pkg/runner/runner.go @@ -23,6 +23,7 @@ import ( "net/http" "os" "os/exec" + "path/filepath" ctrl "sigs.k8s.io/controller-runtime" "strconv" "strings" @@ -320,9 +321,10 @@ func createCommand(details *RunDetails) (string, *exec.Cmd, error) { // are running a Spring Boot application. 
 	if !details.isBuildPacks() && details.AppType != AppTypeSpring && details.isEnvTrueOrBlank(v1.EnvVarJvmClasspathJib) {
 		appDir := details.getenvOrDefault(v1.EnvVarCohAppDir, "/app")
-		fi, e := os.Stat(appDir + "/jib-classpath-file")
+		cpFile := filepath.Join(appDir, "jib-classpath-file")
+		fi, e := os.Stat(cpFile)
 		if e == nil && (fi.Size() != 0) {
-			clsPath, _ := readFirstLineFromFile(fi.Name())
+			clsPath, _ := readFirstLineFromFile(cpFile)
 			if len(clsPath) != 0 {
 				details.addClasspath(clsPath)
 			}
@@ -631,7 +633,7 @@ func readFirstLineFromFile(path string) (string, error) {
 	if st.IsDir() {
 		return "", fmt.Errorf("%s is a directory", path)
 	}
-	file, err := os.Open(st.Name())
+	file, err := os.Open(path)
 	if err != nil {
 		return "", err
 	}
diff --git a/pkg/runner/runner_coherence_test.go b/pkg/runner/runner_coherence_test.go
index 8a2766204..d2af76187 100644
--- a/pkg/runner/runner_coherence_test.go
+++ b/pkg/runner/runner_coherence_test.go
@@ -605,40 +605,6 @@ func TestCoherenceSiteAndRackEnvVarSetFromOtherEnvVarWhenRackIsMissing(t *testin
 	g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs))
 }
 
-func TestCoherenceSiteFromFile(t *testing.T) {
-	g := NewGomegaWithT(t)
-
-	d := &coh.Coherence{
-		ObjectMeta: metav1.ObjectMeta{Name: "test"},
-		Spec: coh.CoherenceStatefulSetResourceSpec{
-			CoherenceResourceSpec: coh.CoherenceResourceSpec{
-				Env: []corev1.EnvVar{
-					{
-						Name:  coh.EnvVarCohSite,
-						Value: "test-site.txt",
-					},
-				},
-			},
-		},
-	}
-
-	args := []string{"server", "--dry-run"}
-	env := EnvVarsFromDeploymentWithSkipSite(d, false)
-
-	expectedCommand := GetJavaCommand()
-	expectedArgs := append(GetMinimalExpectedArgs(), "-Dcoherence.site=site-from-file")
-	expectedArgs = append(expectedArgs, "-Dcoherence.rack=site-from-file")
-
-	e, err := ExecuteWithArgsAndNewViper(env, args)
-	g.Expect(err).NotTo(HaveOccurred())
-	g.Expect(e).NotTo(BeNil())
-	g.Expect(e.OsCmd).NotTo(BeNil())
-
-	g.Expect(e.OsCmd.Dir).To(Equal(TestAppDir))
-	g.Expect(e.OsCmd.Path).To(Equal(expectedCommand))
-	g.Expect(e.OsCmd.Args).To(ConsistOf(expectedArgs))
-}
-
 func TestCoherenceSiteAndRackFromFile(t *testing.T) {
 	g := NewGomegaWithT(t)
 

From d1ef44995742fa103f241ca549e26954183462a5 Mon Sep 17 00:00:00 2001
From: Jonathan Knight
Date: Mon, 23 Sep 2024 14:27:33 +0300
Subject: [PATCH 8/9] Fix broken file reader

---
 pkg/runner/runner.go | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/pkg/runner/runner.go b/pkg/runner/runner.go
index 4bf49327e..03b714664 100644
--- a/pkg/runner/runner.go
+++ b/pkg/runner/runner.go
@@ -626,24 +626,22 @@ func createJavaCommand(javaCmd string, details *RunDetails) (*exec.Cmd, error) {
 }
 
 func readFirstLineFromFile(path string) (string, error) {
-	st, err := os.Stat(maybeStripFileScheme(path))
-	if err != nil {
-		return "", err
-	}
-	if st.IsDir() {
-		return "", fmt.Errorf("%s is a directory", path)
-	}
-	file, err := os.Open(path)
+	file, err := os.Open(maybeStripFileScheme(path))
 	if err != nil {
 		return "", err
 	}
 	defer closeFile(file, log)
-	rd := bufio.NewReader(file)
-	data, _, err := rd.ReadLine()
-	if err != nil {
-		return "", err
+
+	scanner := bufio.NewScanner(file)
+	scanner.Split(bufio.ScanLines)
+	var text []string
+	for scanner.Scan() {
+		text = append(text, scanner.Text())
+	}
+	if len(text) == 0 {
+		return "", nil
 	}
-	return string(data), nil
+	return text[0], nil
 }
 
 func createSpringBootCommand(javaCmd string, details *RunDetails) (*exec.Cmd, error) {

From 198efac6e2ae1c35396d67f36b6b09347848b280 Mon Sep 17 00:00:00 2001
From: Jonathan Knight
Date: Mon, 23 Sep 2024 15:53:14 +0300
Subject: [PATCH 9/9] Use topology.kubernetes.io/subzone Node label for
 Coherence rack if the label is present on the node.

---
 pkg/operator/operator.go | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 616a64103..a20de6d09 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -87,6 +87,8 @@ const (
 	// LabelOciNodeFaultDomain is the OCI Node label for the fault domain.
 	LabelOciNodeFaultDomain = "oci.oraclecloud.com/fault-domain"
 
+	// LabelTopologySubZone is the k8s topology label for sub-zone.
+	LabelTopologySubZone = "topology.kubernetes.io/subzone"
 	// LabelHostName is the Node label for the Node's hostname.
 	LabelHostName = "kubernetes.io/hostname"
 
@@ -104,7 +106,7 @@ var currentViper *viper.Viper
 var (
 	operatorVersion   = "999.0.0"
 	DefaultSiteLabels = []string{corev1.LabelTopologyZone, corev1.LabelFailureDomainBetaZone}
-	DefaultRackLabels = []string{LabelOciNodeFaultDomain, corev1.LabelTopologyZone, corev1.LabelFailureDomainBetaZone}
+	DefaultRackLabels = []string{LabelTopologySubZone, LabelOciNodeFaultDomain, corev1.LabelTopologyZone, corev1.LabelFailureDomainBetaZone}
 )
 
 func SetupOperatorManagerFlags(cmd *cobra.Command, v *viper.Viper) {
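
Editor's note, not part of the patch series: the DefaultRackLabels change above makes the
topology.kubernetes.io/subzone label the first candidate the operator considers when it works
out a Node's rack value. The sketch below is purely illustrative of that ordered-lookup idea;
defaultRackLabels and firstLabelValue are hypothetical names, not the operator's real API, and
the real label-resolution code is not shown in this patch.

// Illustrative sketch only (hypothetical helper, assumed names): resolve a rack
// value by taking the first matching label from an ordered candidate list.
package main

import "fmt"

// defaultRackLabels mirrors the precedence introduced by the patch: sub-zone first,
// then the OCI fault domain, then the zone labels.
var defaultRackLabels = []string{
	"topology.kubernetes.io/subzone",
	"oci.oraclecloud.com/fault-domain",
	"topology.kubernetes.io/zone",
	"failure-domain.beta.kubernetes.io/zone",
}

// firstLabelValue returns the value of the first candidate key present on the
// node's labels, or "" when none of the candidates match.
func firstLabelValue(nodeLabels map[string]string, candidates []string) string {
	for _, key := range candidates {
		if v, ok := nodeLabels[key]; ok && v != "" {
			return v
		}
	}
	return ""
}

func main() {
	nodeLabels := map[string]string{
		"topology.kubernetes.io/zone":    "us-ashburn-1-AD-1",
		"topology.kubernetes.io/subzone": "fd-2",
	}
	// Prints "fd-2": the sub-zone label wins; if it were absent the zone value would be used.
	fmt.Println(firstLabelValue(nodeLabels, defaultRackLabels))
}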