From 7caa29af51d16f2dd401f14f515cd17dbdd799a3 Mon Sep 17 00:00:00 2001 From: Bastian Krol Date: Thu, 10 Oct 2024 07:59:57 +0200 Subject: [PATCH] feat(checkrules): manage check rules via the operator --- .golangci.yml | 4 + cmd/main.go | 34 +- config/rbac/role.yaml | 8 + go.mod | 1 + go.sum | 2 + helm-chart/dash0-operator/README.md | 41 +- .../templates/operator/cluster-roles.yaml | 38 +- .../__snapshot__/cluster-roles_test.yaml.snap | 8 + .../otelcolresources/collector_config_maps.go | 2 +- .../operator_configuration_controller.go | 39 +- .../operator_configuration_controller_test.go | 179 ++++- .../perses_dashboards_controller.go | 635 ++++++------------ .../perses_dashboards_controller_test.go | 173 ++--- .../controller/prometheus_rules_controller.go | 589 ++++++++++++++++ .../prometheus_rules_controller_test.go | 569 ++++++++++++++++ .../controller/third_party_crd_common.go | 533 +++++++++++++++ .../self_monitoring_and_api_access.go | 2 +- internal/dash0/util/constants.go | 1 + test-resources/bin/test-cleanup.sh | 7 +- .../bin/test-scenario-01-aum-operator-cr.sh | 10 +- .../bin/test-scenario-02-operator-cr-aum.sh | 10 +- test-resources/bin/util | 15 +- .../prometheusrule/prometheusrule.yaml | 44 ++ test/e2e/application_under_test.go | 2 +- test/e2e/collector.go | 2 +- test/e2e/container_images.go | 2 +- test/e2e/dash0_monitoring_resource.go | 2 +- .../dash0_operator_configuration_resource.go | 2 +- test/e2e/e2e_test.go | 6 +- test/e2e/events.go | 1 - test/e2e/labels.go | 2 +- test/e2e/operator.go | 12 +- test/e2e/run_command.go | 7 +- test/e2e/setup_teardown.go | 2 +- test/e2e/spans.go | 3 +- test/e2e/util.go | 14 + test/e2e/verify_instrumentation.go | 2 +- ...monitoring.coreos.com_prometheusrules.yaml | 141 ++++ .../crds/perses.dev_persesdashboards.yaml | 258 +++++++ test/util/operator_resource.go | 60 ++ test/util/third_party_resource.go | 73 ++ 41 files changed, 2876 insertions(+), 659 deletions(-) create mode 100644 internal/dash0/controller/prometheus_rules_controller.go create mode 100644 internal/dash0/controller/prometheus_rules_controller_test.go create mode 100644 internal/dash0/controller/third_party_crd_common.go create mode 100644 test-resources/customresources/prometheusrule/prometheusrule.yaml create mode 100644 test/e2e/util.go create mode 100644 test/util/crds/monitoring.coreos.com_prometheusrules.yaml create mode 100644 test/util/crds/perses.dev_persesdashboards.yaml create mode 100644 test/util/third_party_resource.go diff --git a/.golangci.yml b/.golangci.yml index 836c8892..e71beb79 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -38,3 +38,7 @@ linters: - unconvert - unparam - unused +linters-settings: + errcheck: + exclude-functions: + - fmt.Fprintf \ No newline at end of file diff --git a/cmd/main.go b/cmd/main.go index 43feff56..b64e76f3 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -17,6 +17,7 @@ import ( "github.com/go-logr/logr" persesv1alpha1 "github.com/perses/perses-operator/api/v1alpha1" + prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" semconv "go.opentelemetry.io/collector/semconv/v1.27.0" otelmetric "go.opentelemetry.io/otel/metric" appsv1 "k8s.io/api/apps/v1" @@ -105,9 +106,10 @@ func init() { utilruntime.Must(clientgoscheme.AddToScheme(scheme)) utilruntime.Must(dash0v1alpha1.AddToScheme(scheme)) - // for perses dashboard controller, prometheus scrape config controller etc. + // required for Perses dashboard controller and Prometheus rules controller. utilruntime.Must(apiextensionsv1.AddToScheme(scheme)) utilruntime.Must(persesv1alpha1.AddToScheme(scheme)) + utilruntime.Must(prometheusv1.AddToScheme(scheme)) } func main() { @@ -529,16 +531,30 @@ func startDash0Controllers( metricNamePrefix, &setupLog, ) + prometheusRuleCrdReconciler := &controller.PrometheusRuleCrdReconciler{ + AuthToken: envVars.selfMonitoringAndApiAuthToken, + } + if err := prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, startupTasksK8sClient, &setupLog); err != nil { + return fmt.Errorf("unable to set up the Prometheus rule reconciler: %w", err) + } + prometheusRuleCrdReconciler.InitializeSelfMonitoringMetrics( + meter, + metricNamePrefix, + &setupLog, + ) operatorConfigurationReconciler := &controller.OperatorConfigurationReconciler{ - Client: k8sClient, - Clientset: clientset, - PersesDashboardCrdReconciler: persesDashboardCrdReconciler, - Scheme: mgr.GetScheme(), - Recorder: mgr.GetEventRecorderFor("dash0-operator-configuration-controller"), - DeploymentSelfReference: deploymentSelfReference, - Images: images, - DevelopmentMode: developmentMode, + Client: k8sClient, + Clientset: clientset, + ApiClients: []controller.ApiClient{ + persesDashboardCrdReconciler, + prometheusRuleCrdReconciler, + }, + Scheme: mgr.GetScheme(), + Recorder: mgr.GetEventRecorderFor("dash0-operator-configuration-controller"), + DeploymentSelfReference: deploymentSelfReference, + Images: images, + DevelopmentMode: developmentMode, } if err := operatorConfigurationReconciler.SetupWithManager(mgr); err != nil { return fmt.Errorf("unable to set up the operator configuration reconciler: %w", err) diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 180edd7a..7df2fe96 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -65,6 +65,14 @@ rules: - delete - get - list +- apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + verbs: + - get + - list + - watch - apiGroups: - operator.dash0.com resources: diff --git a/go.mod b/go.mod index 7a03830b..71e6d694 100644 --- a/go.mod +++ b/go.mod @@ -12,6 +12,7 @@ require ( github.com/onsi/ginkgo/v2 v2.20.2 github.com/onsi/gomega v1.34.2 github.com/perses/perses-operator v0.0.0-20240402153734-4ccf03f6c8e6 + github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1 github.com/wI2L/jsondiff v0.6.0 go.opentelemetry.io/collector/pdata v1.17.0 go.opentelemetry.io/collector/semconv v0.111.0 diff --git a/go.sum b/go.sum index 7aeaf990..7680bdf7 100644 --- a/go.sum +++ b/go.sum @@ -115,6 +115,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1 h1:XGoEXT6WTTihO+MD8MAao+YaQIH905HbK0WK2lyo28k= +github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1/go.mod h1:D0KY8md81DQKdaR/cXwnhoWB3MYYyc/UjvqE8GFkIvA= github.com/prometheus/client_golang v1.20.0 h1:jBzTZ7B099Rg24tny+qngoynol8LtVYlA2bqx3vEloI= github.com/prometheus/client_golang v1.20.0/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE= github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= diff --git a/helm-chart/dash0-operator/README.md b/helm-chart/dash0-operator/README.md index 3f51b6d1..386bd5c6 100644 --- a/helm-chart/dash0-operator/README.md +++ b/helm-chart/dash0-operator/README.md @@ -561,13 +561,8 @@ ways to achieve this: ```console kubectl apply --server-side -f https://raw.githubusercontent.com/perses/perses-operator/main/config/crd/bases/perses.dev_persesdashboards.yaml ``` -* Alternatively, install the full Perses operator: Go to and follow the installation - instructions there. - -Note that the custom resource definition needs to be installed before the Dash0 operator is started. -If you have installed the Dash0 operator before installing the Perses dashboard custom resource definition, you need to -restart the Dash0 operator once, for example by deleting the operator's controller pod: -`kubectl --namespace dash0-system delete pod -l app.kubernetes.io/component=controller` +* Alternatively, install the full Perses operator: Go to and follow the + installation instructions there. With the prerequisites in place, you can manage Dash0 dashboards via the operator. The Dash0 operator will watch for Perses dashboard resources in the cluster and synchronize them with the Dash0 backend: @@ -579,3 +574,35 @@ The dashboards created by the operator will be in read-only mode in the Dash0 UI If the Dash0 operator configuration resource has the `dataset` property set, the operator will create the dashboards in that specified dataset, otherwise they will be created in the `default` dataset. + +## Managing Dash0 Check Rules with the Operator + +You can manage your Dash0 check rules via the Dash0 Kubernetes operator. + +Pre-requisites for this feature: +* A Dash0 operator configuration resource has to be installed in the cluster. +* The operator configuration resource must have the `apiEndpoint` property. +* The operator configuration resource must have a Dash0 export configured with authorization + (either `token` or `secret-ref`). + +Furthermore, the custom resource definition for Prometheus rules needs to be installed in the cluster. There are two +ways to achieve this: +* Install the Prometheus rules custom resource definition with the following command: +```console +kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.77.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml +``` +* Alternatively, install the full kube-prometheus stack Helm chart: Go to + and follow the + installation instructions there. + +With the prerequisites in place, you can manage Dash0 check rules via the operator. +The Dash0 operator will watch for Prometheus rule resources in the cluster and synchronize them with the Dash0 backend: +* When a new Prometheus rule resource is created, the operator will create corresponding check rules via Dash0's API. +* When a Prometheus rule resource is changed, the operator will update the corresponding check rules via Dash0's API. +* When a Prometheus rule resource is deleted, the operator will delete the corresponding check rules via Dash0's API. + +Note that a Prometheus rule resource can contain multiple groups, and each of those groups can have multiple rules. +The Dash0 operator will create individual check rules for each rule in each group. + +If the Dash0 operator configuration resource has the `dataset` property set, the operator will create the rules +in that specified dataset, otherwise they will be created in the `default` dataset. diff --git a/helm-chart/dash0-operator/templates/operator/cluster-roles.yaml b/helm-chart/dash0-operator/templates/operator/cluster-roles.yaml index 792eae3f..bace1e3a 100644 --- a/helm-chart/dash0-operator/templates/operator/cluster-roles.yaml +++ b/helm-chart/dash0-operator/templates/operator/cluster-roles.yaml @@ -10,7 +10,7 @@ metadata: rules: -# Permissions required to watch for the foreign CRD (Perses dashboards, Prometheus scrape configs). +# Permissions required to watch for the third-party CRD (Perses dashboards, Prometheus check rules): - apiGroups: - apiextensions.k8s.io resources: @@ -20,7 +20,7 @@ rules: - list - watch -# Permissions required to instrument workloads in the apps API group. +# Permissions required to instrument workloads in the apps API group: - apiGroups: - apps resources: @@ -35,7 +35,7 @@ rules: - update - watch -# Permissions required to instrument workloads in the batch API group. +# Permissions required to instrument workloads in the batch API group: - apiGroups: - batch resources: @@ -48,7 +48,7 @@ rules: - update - watch -# Pmrmissions required top create a Dash0 operator configuration resources +# Pmrmissions required to create a Dash0 operator configuration resource: - apiGroups: - "" resources: @@ -57,7 +57,7 @@ rules: - get # Permissions required to queue events to report about the operator's actions, and to attach dangling events to their -# respective involved objects. +# respective involved objects: - apiGroups: - "" resources: @@ -75,7 +75,7 @@ rules: - get # Permissions required to automatically restart (i.e. delete) pods when instrumenting replicasets that are not part of a -# higher order workload (e.g. a deployment, daemonset). +# higher order workload (e.g. a deployment, daemonset): - apiGroups: - "" resources: @@ -85,7 +85,7 @@ rules: - get - list -# Permissions required to watch for the Perses dashboard resources. +# Permissions required to watch Perses dashboard resources: - apiGroups: - perses.dev resources: @@ -95,7 +95,17 @@ rules: - list - watch -# Permissions required to manage the Dash0 monitoring resource, its finalizers and status. +# Permissions required to watch Prometheus rule resources: +- apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + verbs: + - get + - list + - watch + +# Permissions required to manage the Dash0 monitoring resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -110,7 +120,7 @@ rules: - update - watch -# Permissions required to manage the Dash0 monitoring resource, its finalizers and status. +# Permissions required to manage the Dash0 monitoring resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -118,7 +128,7 @@ rules: verbs: - update -# Permissions required to manage the Dash0 monitoring resource, its finalizers and status. +# Permissions required to manage the Dash0 monitoring resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -128,7 +138,7 @@ rules: - patch - update -# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status. +# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -143,7 +153,7 @@ rules: - update - watch -# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status. +# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -151,7 +161,7 @@ rules: verbs: - update -# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status. +# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status: - apiGroups: - operator.dash0.com resources: @@ -161,7 +171,7 @@ rules: - patch - update -# Permissions required to manage OTel collector resources. +# Permissions required to manage OTel collector resources: - apiGroups: - "" resources: diff --git a/helm-chart/dash0-operator/tests/operator/__snapshot__/cluster-roles_test.yaml.snap b/helm-chart/dash0-operator/tests/operator/__snapshot__/cluster-roles_test.yaml.snap index c8f1048b..da3e5e13 100644 --- a/helm-chart/dash0-operator/tests/operator/__snapshot__/cluster-roles_test.yaml.snap +++ b/helm-chart/dash0-operator/tests/operator/__snapshot__/cluster-roles_test.yaml.snap @@ -82,6 +82,14 @@ cluster roles should match snapshot: - get - list - watch + - apiGroups: + - monitoring.coreos.com + resources: + - prometheusrules + verbs: + - get + - list + - watch - apiGroups: - operator.dash0.com resources: diff --git a/internal/backendconnection/otelcolresources/collector_config_maps.go b/internal/backendconnection/otelcolresources/collector_config_maps.go index 6bc78f57..596f42f5 100644 --- a/internal/backendconnection/otelcolresources/collector_config_maps.go +++ b/internal/backendconnection/otelcolresources/collector_config_maps.go @@ -121,7 +121,7 @@ func ConvertExportSettingsToExporterList(export dash0v1alpha1.Export) ([]OtlpExp Name: util.AuthorizationHeaderName, Value: authHeaderValue, }} - if d0.Dataset != "" && d0.Dataset != "default" { + if d0.Dataset != "" && d0.Dataset != util.DatasetDefault { headers = append(headers, dash0v1alpha1.Header{ Name: util.Dash0DatasetHeaderName, Value: d0.Dataset, diff --git a/internal/dash0/controller/operator_configuration_controller.go b/internal/dash0/controller/operator_configuration_controller.go index 9938b66b..be2c2adc 100644 --- a/internal/dash0/controller/operator_configuration_controller.go +++ b/internal/dash0/controller/operator_configuration_controller.go @@ -25,14 +25,14 @@ import ( type OperatorConfigurationReconciler struct { client.Client - Clientset *kubernetes.Clientset - PersesDashboardCrdReconciler *PersesDashboardCrdReconciler - Scheme *runtime.Scheme - Recorder record.EventRecorder - DeploymentSelfReference *appsv1.Deployment - DanglingEventsTimeouts *util.DanglingEventsTimeouts - Images util.Images - DevelopmentMode bool + Clientset *kubernetes.Clientset + ApiClients []ApiClient + Scheme *runtime.Scheme + Recorder record.EventRecorder + DeploymentSelfReference *appsv1.Deployment + DanglingEventsTimeouts *util.DanglingEventsTimeouts + Images util.Images + DevelopmentMode bool } const ( @@ -138,6 +138,9 @@ func (r *OperatorConfigurationReconciler) Reconcile(ctx context.Context, req ctr if resourceDeleted { logger.Info("Reconciling the deletion of the operator configuration resource", "name", req.Name) + for _, apiClient := range r.ApiClients { + apiClient.RemoveApiEndpointAndDataset() + } if err = r.removeSelfMonitoringAndApiAccessAndUpdate(ctx); err != nil { logger.Error(err, "cannot disable self-monitoring/API access of the controller deployment, requeuing reconcile request.") return ctrl.Result{ @@ -163,16 +166,20 @@ func (r *OperatorConfigurationReconciler) Reconcile(ctx context.Context, req ctr if resource.HasDash0ApiAccessConfigured() { dataset := resource.Spec.Export.Dash0.Dataset if dataset == "" { - dataset = "default" + dataset = util.DatasetDefault + } + for _, apiClient := range r.ApiClients { + apiClient.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: resource.Spec.Export.Dash0.ApiEndpoint, + Dataset: dataset, + }, &logger) } - r.PersesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ - Endpoint: resource.Spec.Export.Dash0.ApiEndpoint, - Dataset: dataset, - }, &logger) } else { - logger.Info("Settings required for managing dashboards via the operator are missing, the operator will not " + - "update dashboards in Dash0.") - r.PersesDashboardCrdReconciler.RemoveApiEndpointAndDataset() + logger.Info("Settings required for managing dashboards or check rules via the operator are missing, the " + + "operator will not update dashboards nor check rules in Dash0.") + for _, apiClient := range r.ApiClients { + apiClient.RemoveApiEndpointAndDataset() + } } currentSelfMonitoringAndApiAccessConfiguration, err := diff --git a/internal/dash0/controller/operator_configuration_controller_test.go b/internal/dash0/controller/operator_configuration_controller_test.go index 9454b723..ff55d01b 100644 --- a/internal/dash0/controller/operator_configuration_controller_test.go +++ b/internal/dash0/controller/operator_configuration_controller_test.go @@ -8,6 +8,7 @@ import ( "fmt" "reflect" + "github.com/go-logr/logr" json "github.com/json-iterator/go" "github.com/wI2L/jsondiff" appsv1 "k8s.io/api/apps/v1" @@ -33,6 +34,13 @@ type SelfMonitoringAndApiAccessTestConfig struct { expectK8sClientUpdate bool } +type ApiClientSetRemoveTestConfig struct { + operatorConfigurationResourceSpec dash0v1alpha1.Dash0OperatorConfigurationSpec + dataset string + expectSetApiEndpointAndDataset bool + expectRemoveApiEndpointAndDataset bool +} + type SelfMonitoringTestConfig struct { createExport func() dash0v1alpha1.Export verify func(Gomega, selfmonitoringapiaccess.SelfMonitoringAndApiAccessConfiguration, *appsv1.Deployment) @@ -40,6 +48,8 @@ type SelfMonitoringTestConfig struct { var ( reconciler *OperatorConfigurationReconciler + apiClient1 *DummyApiClient + apiClient2 *DummyApiClient ) var _ = Describe("The operation configuration resource controller", Ordered, func() { @@ -51,6 +61,11 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun EnsureOperatorNamespaceExists(ctx, k8sClient) }) + BeforeEach(func() { + apiClient1 = &DummyApiClient{} + apiClient2 = &DummyApiClient{} + }) + Describe("updates the controller deployment", func() { AfterEach(func() { RemoveOperatorConfigurationResource(ctx, k8sClient) @@ -209,7 +224,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithToken, expectK8sClientUpdate: false, }), - // | no self-monitoring, but token | no sm, secret-ref | no sm, auth via secret-ref | Entry("no self-monitoring, token -> no self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithToken, @@ -217,7 +231,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, expectK8sClientUpdate: true, }), - // | no self-monitoring, but secret-ref | no sm, token | no sm, auth via token | Entry("no self-monitoring, secret-ref -> no self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, @@ -225,7 +238,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | no self-monitoring, but secret-ref | no sm, secret-ref | no sm, auth via secret-ref | Entry("no self-monitoring, secret-ref -> no self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, @@ -233,7 +245,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, expectK8sClientUpdate: false, }), - // | no self-monitoring, but token | with sm, token | has sm, auth via token | Entry("no self-monitoring, token -> self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithToken, @@ -241,7 +252,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | no self-monitoring, but token | with sm, secret-ref | has sm, auth via secret-ref | Entry("no self-monitoring, token -> self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithToken, @@ -249,7 +259,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, expectK8sClientUpdate: true, }), - // | no self-monitoring, but secret-ref | with sm, token | has sm, auth via token | Entry("no self-monitoring, secret-ref -> self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, @@ -257,7 +266,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | no self-monitoring, but secret-ref | with sm, secret-ref | has sm, auth via secret-ref | Entry("no self-monitoring, secret-ref -> self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, @@ -274,7 +282,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithoutAuth, expectK8sClientUpdate: true, }), - // | self-monitoring with secret-ref | no sm, no auth | no sm, no auth | Entry("self-monitoring, secret-ref -> no self-monitoring, no auth", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, @@ -282,7 +289,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithoutAuth, expectK8sClientUpdate: true, }), - // | self-monitoring with token | no sm, token | no sm, auth via token | Entry("self-monitoring, token -> no self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithToken, @@ -290,7 +296,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | self-monitoring with token | no sm, secret-ref | no sm, auth via secret-ref | Entry("self-monitoring, token -> no self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithToken, @@ -298,7 +303,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, expectK8sClientUpdate: true, }), - // | self-monitoring with secret-ref | no sm, token | no sm, auth via token | Entry("self-monitoring, secret-ref -> no self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, @@ -306,7 +310,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | self-monitoring with secret-ref | no sm, secret-ref | no sm, auth via secret-ref | Entry("self-monitoring, secret-ref -> no self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, @@ -314,7 +317,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithoutSelfMonitoringWithSecretRef, expectK8sClientUpdate: true, }), - // | self-monitoring with token | with sm, token | has sm, auth via token | Entry("self-monitoring, token -> self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithToken, @@ -322,7 +324,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithToken, expectK8sClientUpdate: false, }), - // | self-monitoring with token | with sm, secret-ref | has sm, auth via secret-ref | Entry("self-monitoring, token -> self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithToken, @@ -330,7 +331,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, expectK8sClientUpdate: true, }), - // | self-monitoring with secret-ref | with sm, token | has sm, auth via token | Entry("self-monitoring, secret-ref -> self-monitoring, token", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, @@ -338,7 +338,6 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun expectedControllerDeploymentAfterReconcile: CreateControllerDeploymentWithSelfMonitoringWithToken, expectK8sClientUpdate: true, }), - // | self-monitoring with secret-ref | with sm, secret-ref | has sm, auth via secret-ref | Entry("self-monitoring, secret-ref -> self-monitoring, secret-ref", SelfMonitoringAndApiAccessTestConfig{ existingControllerDeployment: CreateControllerDeploymentWithSelfMonitoringWithSecretRef, @@ -349,6 +348,105 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun ) }) + Describe("updates all registered API clients", func() { + AfterEach(func() { + RemoveOperatorConfigurationResource(ctx, k8sClient) + }) + + DescribeTable("by settings or removing the API config", func(config ApiClientSetRemoveTestConfig) { + controllerDeployment = EnsureControllerDeploymentExists( + ctx, + k8sClient, + CreateControllerDeploymentWithoutSelfMonitoringWithoutAuth(), + ) + reconciler = createReconciler(controllerDeployment) + + operatorConfigurationResource := CreateOperatorConfigurationResourceWithSpec( + ctx, + k8sClient, + config.operatorConfigurationResourceSpec, + ) + + expectedDataset := "default" + if config.dataset != "" { + operatorConfigurationResource.Spec.Export.Dash0.Dataset = config.dataset + Expect(k8sClient.Update(ctx, operatorConfigurationResource)).To(Succeed()) + expectedDataset = config.dataset + } + + triggerOperatorConfigurationReconcileRequest(ctx, reconciler) + verifyOperatorConfigurationResourceIsAvailable(ctx) + + for _, apiClient := range []*DummyApiClient{apiClient1, apiClient2} { + if config.expectSetApiEndpointAndDataset { + Expect(apiClient.setCalls).To(Equal(1)) + Expect(apiClient.removeCalls).To(Equal(0)) + Expect(apiClient.apiConfig.Endpoint).To(Equal(ApiEndpointTest)) + Expect(apiClient.apiConfig.Dataset).To(Equal(expectedDataset)) + } + if config.expectRemoveApiEndpointAndDataset { + Expect(apiClient.setCalls).To(Equal(0)) + Expect(apiClient.removeCalls).To(Equal(1)) + Expect(apiClient.apiConfig).To(BeNil()) + } + } + }, + + // | operator config resource | expected calls | + // |-------------------------------------------|----------------| + // | no Dash0 export | remove | + // | no API endpoint, token | remove | + // | no API endpoint, secret-ref | remove | + // | API endpoint, token | set | + // | API endpoint, secret-ref | set | + + // | no Dash0 export | remove | + Entry("no API endpoint, no Dash0 export", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceWithoutExport, + expectSetApiEndpointAndDataset: false, + expectRemoveApiEndpointAndDataset: true, + }), + // | no API endpoint, token | remove | + Entry("no API endpoint, Dash0 export with token", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithoutApiEndpointWithToken, + expectSetApiEndpointAndDataset: false, + expectRemoveApiEndpointAndDataset: true, + }), + // | no API endpoint, secret-ref | remove | + Entry("no API endpoint, Dash0 export with secret-ref", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithoutApiEndpointWithSecretRef, + expectSetApiEndpointAndDataset: false, + expectRemoveApiEndpointAndDataset: true, + }), + // | API endpoint, token | set | + Entry("API endpoint, Dash0 export with token", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithApiEndpointWithToken, + expectSetApiEndpointAndDataset: true, + expectRemoveApiEndpointAndDataset: false, + }), + // | API endpoint, secret-ref | set | + Entry("API endpoint, Dash0 export with secret-ref", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithApiEndpointWithSecretRef, + expectSetApiEndpointAndDataset: true, + expectRemoveApiEndpointAndDataset: false, + }), + // | API endpoint, token, custom dataset | set | + Entry("API endpoint, Dash0 export with token, custom dataset", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithApiEndpointWithToken, + dataset: "custom-dataset", + expectSetApiEndpointAndDataset: true, + expectRemoveApiEndpointAndDataset: false, + }), + // | API endpoint, secret-ref, custom dataset | set | + Entry("API endpoint, Dash0 export with secret-ref, custom dataset", ApiClientSetRemoveTestConfig{ + operatorConfigurationResourceSpec: OperatorConfigurationResourceDash0ExportWithApiEndpointWithSecretRef, + dataset: "custom-dataset", + expectSetApiEndpointAndDataset: true, + expectRemoveApiEndpointAndDataset: false, + }), + ) + }) + Describe("when creating the operator configuration resource", func() { BeforeEach(func() { @@ -713,6 +811,12 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun triggerOperatorConfigurationReconcileRequest(ctx, reconciler) VerifyOperatorConfigurationResourceByNameDoesNotExist(ctx, k8sClient, Default, resource.Name) + for _, apiClient := range []*DummyApiClient{apiClient1, apiClient2} { + Expect(apiClient.setCalls).To(Equal(0)) + Expect(apiClient.removeCalls).To(Equal(1)) + Expect(apiClient.apiConfig).To(BeNil()) + } + Eventually(func(g Gomega) { updatedDeployment := LoadOperatorDeploymentOrFail(ctx, k8sClient, g) selfMonitoringAndApiAccessConfiguration, err := @@ -766,6 +870,12 @@ var _ = Describe("The operation configuration resource controller", Ordered, fun triggerOperatorConfigurationReconcileRequest(ctx, reconciler) VerifyOperatorConfigurationResourceByNameDoesNotExist(ctx, k8sClient, Default, resource.Name) + for _, apiClient := range []*DummyApiClient{apiClient1, apiClient2} { + Expect(apiClient.setCalls).To(Equal(0)) + Expect(apiClient.removeCalls).To(Equal(1)) + Expect(apiClient.apiConfig).To(BeNil()) + } + Consistently(func(g Gomega) { updatedDeployment := LoadOperatorDeploymentOrFail(ctx, k8sClient, g) selfMonitoringAndApiAccessConfiguration, err := @@ -797,17 +907,18 @@ func cleanUpDeploymentSpecForDiff(spec *appsv1.DeploymentSpec) { } func createReconciler(controllerDeployment *appsv1.Deployment) *OperatorConfigurationReconciler { - persesDashboardCrdReconciler := &PersesDashboardCrdReconciler{ - persesDashboardReconciler: &PersesDashboardReconciler{}, - } + return &OperatorConfigurationReconciler{ - Client: k8sClient, - Clientset: clientset, - Recorder: recorder, - PersesDashboardCrdReconciler: persesDashboardCrdReconciler, - DeploymentSelfReference: controllerDeployment, - DanglingEventsTimeouts: &DanglingEventsTimeoutsTest, - Images: TestImages, + Client: k8sClient, + Clientset: clientset, + Recorder: recorder, + ApiClients: []ApiClient{ + apiClient1, + apiClient2, + }, + DeploymentSelfReference: controllerDeployment, + DanglingEventsTimeouts: &DanglingEventsTimeoutsTest, + Images: TestImages, } } @@ -932,3 +1043,19 @@ func verifyNoSelfMonitoringButAuthTokenEnvVarFromSecretRef( g.Expect(container.Env).To( ContainElement(MatchEnvVarValueFrom("SELF_MONITORING_AND_API_AUTH_TOKEN", "secret-ref", "key"))) } + +type DummyApiClient struct { + setCalls int + removeCalls int + apiConfig *ApiConfig +} + +func (c *DummyApiClient) SetApiEndpointAndDataset(apiConfig *ApiConfig, _ *logr.Logger) { + c.setCalls++ + c.apiConfig = apiConfig +} + +func (c *DummyApiClient) RemoveApiEndpointAndDataset() { + c.removeCalls++ + c.apiConfig = nil +} diff --git a/internal/dash0/controller/perses_dashboards_controller.go b/internal/dash0/controller/perses_dashboards_controller.go index 5533fba0..a37d5aef 100644 --- a/internal/dash0/controller/perses_dashboards_controller.go +++ b/internal/dash0/controller/perses_dashboards_controller.go @@ -8,30 +8,20 @@ import ( "context" "encoding/json" "fmt" - "io" "net/http" + "net/url" "strings" "sync/atomic" - "time" "github.com/go-logr/logr" otelmetric "go.opentelemetry.io/otel/metric" - corev1 "k8s.io/api/core/v1" - apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" - apierrors "k8s.io/apimachinery/pkg/api/errors" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apimachinery/pkg/types" - "k8s.io/apimachinery/pkg/util/wait" "k8s.io/client-go/util/workqueue" - "k8s.io/utils/ptr" ctrl "sigs.k8s.io/controller-runtime" - "sigs.k8s.io/controller-runtime/pkg/builder" "sigs.k8s.io/controller-runtime/pkg/client" - "sigs.k8s.io/controller-runtime/pkg/controller" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" - "sigs.k8s.io/controller-runtime/pkg/predicate" "sigs.k8s.io/controller-runtime/pkg/reconcile" "github.com/dash0hq/dash0-operator/internal/dash0/util" @@ -47,129 +37,99 @@ type PersesDashboardCrdReconciler struct { //+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch -type ApiConfig struct { - Endpoint string - Dataset string +var ( + persesDashboardCrdReconcileRequestMetric otelmetric.Int64Counter + persesDashboardReconcileRequestMetric otelmetric.Int64Counter +) + +func (r *PersesDashboardCrdReconciler) Manager() ctrl.Manager { + return r.mgr } -type validationResult struct { - namespace string - name string - url string - origin string - authToken string +func (r *PersesDashboardCrdReconciler) GetAuthToken() string { + return r.AuthToken } -var ( - // persesDashboardCrdReconcileRequestMetric otelmetric.Int64Counter - persesDashboardReconcileRequestMetric otelmetric.Int64Counter +func (r *PersesDashboardCrdReconciler) KindDisplayName() string { + return "Perses dashboard" +} - retrySettings = wait.Backoff{ - Duration: 5 * time.Second, - Factor: 1.5, - Steps: 3, - } -) +func (r *PersesDashboardCrdReconciler) Group() string { + return "perses.dev" +} -func (r *PersesDashboardCrdReconciler) SetupWithManager( - ctx context.Context, - mgr ctrl.Manager, - startupK8sClient client.Client, - logger *logr.Logger, -) error { - if r.AuthToken == "" { - logger.Info("No Dash0 auth token has been provided via the operator configuration resource. The operator " + - "will not watch for Perses dashboard resources.") - return nil - } +func (r *PersesDashboardCrdReconciler) Kind() string { + return "PersesDashboard" +} - kubeSystemNamespace := &corev1.Namespace{} - if err := startupK8sClient.Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystemNamespace); err != nil { - msg := "unable to get the kube-system namespace uid" - logger.Error(err, msg) - return fmt.Errorf("%s: %w", msg, err) - } +func (r *PersesDashboardCrdReconciler) Version() string { + return "v1alpha1" +} - r.mgr = mgr - r.persesDashboardReconciler = &PersesDashboardReconciler{ - pseudoClusterUid: kubeSystemNamespace.UID, - httpClient: &http.Client{}, - authToken: r.AuthToken, - } +func (r *PersesDashboardCrdReconciler) QualifiedKind() string { + return "persesdashboards.perses.dev" +} - if err := startupK8sClient.Get(ctx, client.ObjectKey{ - Name: "persesdashboards.perses.dev", - }, &apiextensionsv1.CustomResourceDefinition{}); err != nil { - if !apierrors.IsNotFound(err) { - logger.Error(err, "unable to call get the persesdashboards.perses.dev custom resource definition") - return err - } - } else { - r.persesDashboardCrdExists.Store(true) - r.maybeStartWatchingPersesDashboardResources(true, logger) - } +func (r *PersesDashboardCrdReconciler) ControllerName() string { + return "dash0_perses_dashboard_crd_controller" +} - controllerBuilder := ctrl.NewControllerManagedBy(mgr). - Named("dash0_perses_dashboard_crd_controller"). - Watches( - &apiextensionsv1.CustomResourceDefinition{}, - // Deliberately not using a convenience mechanism like &handler.EnqueueRequestForObject{} (which would - // feed all events into the Reconcile method) here, since using the lower-level TypedEventHandler interface - // directly allows us to distinguish between create and delete events more easily. - r, - builder.WithPredicates(makeFilterPredicate())) - if r.skipNameValidation { - controllerBuilder = controllerBuilder.WithOptions(controller.TypedOptions[reconcile.Request]{ - SkipNameValidation: ptr.To(true), - }) - } - if err := controllerBuilder.Complete(r); err != nil { - logger.Error(err, "unable to build the controller for the Perses Dashboard CRD reconciler") - return err - } +func (r *PersesDashboardCrdReconciler) DoesCrdExist() *atomic.Bool { + return &r.persesDashboardCrdExists +} - return nil +func (r *PersesDashboardCrdReconciler) SetCrdExists(exists bool) { + r.persesDashboardCrdExists.Store(exists) } -//+kubebuilder:rbac:groups=perses.dev,resources=persesdashboards,verbs=get;list;watch +func (r *PersesDashboardCrdReconciler) SkipNameValidation() bool { + return r.skipNameValidation +} -func makeFilterPredicate() predicate.Funcs { - return predicate.Funcs{ - CreateFunc: func(e event.CreateEvent) bool { - return isPersesDashboardCrd(e.Object) - }, - UpdateFunc: func(e event.UpdateEvent) bool { - // We are not interested in updates, but we still need to define a filter predicate for it, otherwise _all_ - // update events for CRDs would be passed to our event handler. We always return false to ignore update - // events entirely. Same for generic events. - return false - }, - DeleteFunc: func(e event.DeleteEvent) bool { - return isPersesDashboardCrd(e.Object) - }, - GenericFunc: func(e event.GenericEvent) bool { - return false - }, +func (r *PersesDashboardCrdReconciler) CreateResourceReconciler( + pseudoClusterUid types.UID, + authToken string, + httpClient *http.Client, +) { + r.persesDashboardReconciler = &PersesDashboardReconciler{ + pseudoClusterUid: pseudoClusterUid, + authToken: authToken, + httpClient: httpClient, } } -func isPersesDashboardCrd(crd client.Object) bool { - if crdCasted, ok := crd.(*apiextensionsv1.CustomResourceDefinition); ok { - return crdCasted.Spec.Group == "perses.dev" && - crdCasted.Spec.Names.Kind == "PersesDashboard" - } else { - return false - } +func (r *PersesDashboardCrdReconciler) ResourceReconciler() ThirdPartyResourceReconciler { + return r.persesDashboardReconciler } +func (r *PersesDashboardCrdReconciler) SetupWithManager( + ctx context.Context, + mgr ctrl.Manager, + startupK8sClient client.Client, + logger *logr.Logger, +) error { + r.mgr = mgr + return SetupThirdPartyCrdReconcilerWithManager( + ctx, + startupK8sClient, + r, + logger, + ) +} + +//+kubebuilder:rbac:groups=perses.dev,resources=persesdashboards,verbs=get;list;watch + func (r *PersesDashboardCrdReconciler) Create( ctx context.Context, _ event.TypedCreateEvent[client.Object], _ workqueue.TypedRateLimitingInterface[reconcile.Request], ) { + if persesDashboardCrdReconcileRequestMetric != nil { + persesDashboardCrdReconcileRequestMetric.Add(ctx, 1) + } logger := log.FromContext(ctx) r.persesDashboardCrdExists.Store(true) - r.maybeStartWatchingPersesDashboardResources(false, &logger) + maybeStartWatchingThirdPartyResources(r, false, &logger) } func (r *PersesDashboardCrdReconciler) Update( @@ -186,6 +146,9 @@ func (r *PersesDashboardCrdReconciler) Delete( _ event.TypedDeleteEvent[client.Object], _ workqueue.TypedRateLimitingInterface[reconcile.Request], ) { + if persesDashboardCrdReconcileRequestMetric != nil { + persesDashboardCrdReconcileRequestMetric.Add(ctx, 1) + } logger := log.FromContext(ctx) logger.Info("The PersesDashboard custom resource definition has been deleted.") r.persesDashboardCrdExists.Store(false) @@ -218,18 +181,15 @@ func (r *PersesDashboardCrdReconciler) InitializeSelfMonitoringMetrics( metricNamePrefix string, logger *logr.Logger, ) { - // Note: The persesDashboardCrdReconcileRequestMetric is unused until we actually implement watching the - // PersesDashboard _CRD_, see comment above in SetupWithManager. - - // reconcileRequestMetricName := fmt.Sprintf("%s%s", metricNamePrefix, "persesdashboardcrd.reconcile_requests") - // var err error - // if persesDashboardCrdReconcileRequestMetric, err = meter.Int64Counter( - // reconcileRequestMetricName, - // otelmetric.WithUnit("1"), - // otelmetric.WithDescription("Counter for persesdashboard CRD reconcile requests"), - // ); err != nil { - // logger.Error(err, "Cannot initialize the metric %s.") - // } + reconcileRequestMetricName := fmt.Sprintf("%s%s", metricNamePrefix, "persesdashboardcrd.reconcile_requests") + var err error + if persesDashboardCrdReconcileRequestMetric, err = meter.Int64Counter( + reconcileRequestMetricName, + otelmetric.WithUnit("1"), + otelmetric.WithDescription("Counter for persesdashboard CRD reconcile requests"), + ); err != nil { + logger.Error(err, "Cannot initialize the metric %s.") + } r.persesDashboardReconciler.InitializeSelfMonitoringMetrics( meter, @@ -247,7 +207,7 @@ func (r *PersesDashboardCrdReconciler) SetApiEndpointAndDataset( return } r.persesDashboardReconciler.apiConfig.Store(apiConfig) - r.maybeStartWatchingPersesDashboardResources(false, logger) + maybeStartWatchingThirdPartyResources(r, false, logger) } func (r *PersesDashboardCrdReconciler) RemoveApiEndpointAndDataset() { @@ -259,72 +219,8 @@ func (r *PersesDashboardCrdReconciler) RemoveApiEndpointAndDataset() { r.persesDashboardReconciler.apiConfig.Store(nil) } -func (r *PersesDashboardCrdReconciler) maybeStartWatchingPersesDashboardResources(isStartup bool, logger *logr.Logger) { - if r.persesDashboardReconciler.isWatching { - // we are already watching, do not start a second watch - return - } - - if !r.persesDashboardCrdExists.Load() { - logger.Info("The persesdashboards.perses.dev custom resource definition does not exist in this cluster, the " + - "operator will not watch for Perses dashboard resources.") - return - } - - apiConfig := r.persesDashboardReconciler.apiConfig.Load() - if !isValidApiConfig(apiConfig) { - if !isStartup { - // Silently ignore this missing precondition if it happens during the startup of the operator. It will - // be remedied automatically once the operator configuration resource is reconciled for the first time. - logger.Info("The persesdashboards.perses.dev custom resource definition is present in this " + - "cluster, but no Dash0 API endpoint been provided via the operator configuration resource, or the " + - "operator configuration resource has not been reconciled yet. The operator will not watch for Perses " + - "dashboard resources. (If there is an operator configuration resource with an API endpoint present in " + - "the cluster, it will be reconciled in a few seconds and this message can be safely ignored.)") - } - return - } - - logger.Info("The persesdashboards.perses.dev custom resource definition is present in this " + - "cluster, and a Dash0 API endpoint has been provided. The operator will watch for Perses dashboard resources.") - r.startWatchingPersesDashboardResources(logger) -} - -func (r *PersesDashboardCrdReconciler) startWatchingPersesDashboardResources( - logger *logr.Logger, -) { - logger.Info("Setting up a watch for Perses dashboard custom resources.") - - unstructuredGvkForPersesDashboards := &unstructured.Unstructured{} - unstructuredGvkForPersesDashboards.SetGroupVersionKind(schema.GroupVersionKind{ - Kind: "PersesDashboard", - Group: "perses.dev", - Version: "v1alpha1", - }) - - controllerBuilder := ctrl.NewControllerManagedBy(r.mgr). - Named("dash0_perses_dashboard_controller"). - Watches( - unstructuredGvkForPersesDashboards, - // Deliberately not using a convenience mechanism like &handler.EnqueueRequestForObject{} (which would - // feed all events into the Reconcile method) here, since using the lower-level TypedEventHandler interface - // directly allows us to distinguish between create and delete events more easily. - r.persesDashboardReconciler, - ) - if r.skipNameValidation { - controllerBuilder = controllerBuilder.WithOptions(controller.TypedOptions[reconcile.Request]{ - SkipNameValidation: ptr.To(true), - }) - } - if err := controllerBuilder.Complete(r.persesDashboardReconciler); err != nil { - logger.Error(err, "unable to create a new controller for watching Perses Dashboards") - return - } - r.persesDashboardReconciler.isWatching = true -} - type PersesDashboardReconciler struct { - isWatching bool + isWatching atomic.Bool pseudoClusterUid types.UID httpClient *http.Client apiConfig atomic.Pointer[ApiConfig] @@ -347,6 +243,38 @@ func (r *PersesDashboardReconciler) InitializeSelfMonitoringMetrics( } } +func (r *PersesDashboardReconciler) KindDisplayName() string { + return "Perses dashboard" +} + +func (r *PersesDashboardReconciler) ShortName() string { + return "dashboard" +} + +func (r *PersesDashboardReconciler) IsWatching() *atomic.Bool { + return &r.isWatching +} + +func (r *PersesDashboardReconciler) SetIsWatching(isWatching bool) { + r.isWatching.Store(isWatching) +} + +func (r *PersesDashboardReconciler) GetAuthToken() string { + return r.authToken +} + +func (r *PersesDashboardReconciler) GetApiConfig() *atomic.Pointer[ApiConfig] { + return &r.apiConfig +} + +func (r *PersesDashboardReconciler) ControllerName() string { + return "dash0_perses_dashboard_controller" +} + +func (r *PersesDashboardReconciler) HttpClient() *http.Client { + return r.httpClient +} + func (r *PersesDashboardReconciler) Create( ctx context.Context, e event.TypedCreateEvent[client.Object], @@ -364,8 +292,17 @@ func (r *PersesDashboardReconciler) Create( "name", e.Object.GetName(), ) - if err := r.UpsertDashboard(e.Object.(*unstructured.Unstructured), &logger); err != nil { - logger.Error(err, "unable to upsert the dashboard") + + if err := util.RetryWithCustomBackoff( + "create dashboard", + func() error { + return upsertViaApi(r, e.Object.(*unstructured.Unstructured), &logger) + }, + retrySettings, + true, + &logger, + ); err != nil { + logger.Error(err, "failed to create the dashboard") } } @@ -387,15 +324,17 @@ func (r *PersesDashboardReconciler) Update( e.ObjectNew.GetName(), ) - _ = util.RetryWithCustomBackoff( - "upsert dashboard", + if err := util.RetryWithCustomBackoff( + "update dashboard", func() error { - return r.UpsertDashboard(e.ObjectNew.(*unstructured.Unstructured), &logger) + return upsertViaApi(r, e.ObjectNew.(*unstructured.Unstructured), &logger) }, retrySettings, true, &logger, - ) + ); err != nil { + logger.Error(err, "failed to update the dashboard") + } } func (r *PersesDashboardReconciler) Delete( @@ -416,15 +355,17 @@ func (r *PersesDashboardReconciler) Delete( e.Object.GetName(), ) - _ = util.RetryWithCustomBackoff( + if err := util.RetryWithCustomBackoff( "delete dashboard", func() error { - return r.DeleteDashboard(e.Object.(*unstructured.Unstructured), &logger) + return deleteViaApi(r, e.Object.(*unstructured.Unstructured), &logger) }, retrySettings, true, &logger, - ) + ); err != nil { + logger.Error(err, "failed to delete the dashboard") + } } func (r *PersesDashboardReconciler) Generic( @@ -445,268 +386,94 @@ func (r *PersesDashboardReconciler) Reconcile( return reconcile.Result{}, nil } -func (r *PersesDashboardReconciler) UpsertDashboard( - persesDashboard *unstructured.Unstructured, +func (r *PersesDashboardReconciler) MapResourceToHttpRequests( + preconditionChecksResult *preconditionValidationResult, + action apiAction, logger *logr.Logger, -) error { - apiConfig := r.apiConfig.Load() - valResult, executeRequest := r.validateConfigAndRenderUrl( - persesDashboard, - apiConfig, - logger, - ) - if !executeRequest { - return nil - } +) ([]*http.Request, error) { + dashboardUrl := r.renderDashboardUrl(preconditionChecksResult) - specRaw := persesDashboard.Object["spec"] - if specRaw == nil { - logger.Info("Perses dashboard has no spec, the dashboard will not be updated in Dash0.") - return nil - } - spec, ok := specRaw.(map[string]interface{}) - if !ok { - logger.Info("Perses dashboard spec is not a map, the dashboard will not be updated in Dash0.") - return nil - } - displayRaw := spec["display"] - if displayRaw == nil { - spec["display"] = map[string]interface{}{} - displayRaw = spec["display"] - } - display, ok := displayRaw.(map[string]interface{}) - if !ok { - logger.Info("Perses dashboard spec.display is not a map, the dashboard will not be updated in Dash0.") - return nil - } - - displayName, ok := display["name"] - if !ok || displayName == "" { - // Let the dashboard name default to the perses dashboard resource's namespace + name, if unset. - display["name"] = fmt.Sprintf("%s/%s", valResult.namespace, valResult.name) - } - - // Remove all unnecessary metadata (labels & annotations), we basically only need the dashboard spec. - serializedDashboard, _ := json.Marshal( - map[string]interface{}{ - "kind": "PersesDashboard", - "spec": spec, - }) - requestPayload := bytes.NewBuffer(serializedDashboard) - - req, err := http.NewRequest( - http.MethodPut, - valResult.url, - requestPayload, - ) - if err != nil { - logger.Error(err, "unable to create a new HTTP request to upsert the dashboard") - return err - } - req.Header.Set("Content-Type", "application/json") - req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", valResult.authToken)) - logger.Info(fmt.Sprintf("Updating/creating dashboard %s in Dash0", valResult.origin)) - res, err := r.httpClient.Do(req) - if err != nil { - logger.Error(err, fmt.Sprintf("unable to execute the HTTP request to update the dashboard %s", valResult.origin)) - return err - } - - if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { - return r.handleNon2xxStatusCode(res, valResult.origin, logger) - } - - // http status code was 2xx, discard the response body and close it - defer func() { - _, _ = io.Copy(io.Discard, res.Body) - _ = res.Body.Close() - }() + var req *http.Request + var err error - return nil -} + //nolint:ineffassign + actionLabel := "?" + switch action { + case upsert: + actionLabel = "upsert" + spec := preconditionChecksResult.spec + displayRaw := spec["display"] + if displayRaw == nil { + spec["display"] = map[string]interface{}{} + displayRaw = spec["display"] + } + display, ok := displayRaw.(map[string]interface{}) + if !ok { + logger.Info("Perses dashboard spec.display is not a map, the dashboard will not be updated in Dash0.") + return nil, nil + } + displayName, ok := display["name"] + if !ok || displayName == "" { + // Let the dashboard name default to the perses dashboard resource's namespace + name, if unset. + display["name"] = fmt.Sprintf("%s/%s", preconditionChecksResult.k8sNamespace, preconditionChecksResult.k8sName) + } -func (r *PersesDashboardReconciler) DeleteDashboard( - persesDashboard *unstructured.Unstructured, - logger *logr.Logger, -) error { - apiConfig := r.apiConfig.Load() - valResult, executeRequest := r.validateConfigAndRenderUrl( - persesDashboard, - apiConfig, - logger, - ) - if !executeRequest { - return nil + // Remove all unnecessary metadata (labels & annotations), we basically only need the dashboard spec. + serializedDashboard, _ := json.Marshal( + map[string]interface{}{ + "kind": "PersesDashboard", + "spec": spec, + }) + requestPayload := bytes.NewBuffer(serializedDashboard) + + req, err = http.NewRequest( + http.MethodPut, + dashboardUrl, + requestPayload, + ) + case delete: + actionLabel = "delete" + req, err = http.NewRequest( + http.MethodDelete, + dashboardUrl, + nil, + ) + default: + logger.Error(fmt.Errorf("unknown API action: %d", action), "unknown API action") + return nil, nil } - req, err := http.NewRequest( - http.MethodDelete, - valResult.url, - nil, - ) if err != nil { - logger.Error(err, "unable to create a new HTTP request to delete the dashboard") - return err - } - req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", valResult.authToken)) - logger.Info(fmt.Sprintf("Deleting dashboard %s in Dash0", valResult.origin)) - res, err := r.httpClient.Do(req) - if err != nil { - logger.Error(err, fmt.Sprintf("unable to execute the HTTP request to delete the dashboard %s", valResult.origin)) - return err + logger.Error(err, fmt.Sprintf("unable to create a new HTTP request to %s the dashboard", actionLabel)) + return nil, err } - if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { - return r.handleNon2xxStatusCode(res, valResult.origin, logger) + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", preconditionChecksResult.authToken)) + if action == upsert { + req.Header.Set("Content-Type", "application/json") } - // http status code was 2xx, discard the response body and close it - defer func() { - _, _ = io.Copy(io.Discard, res.Body) - _ = res.Body.Close() - }() - - return nil + return []*http.Request{req}, nil } -func (r *PersesDashboardReconciler) validateConfigAndRenderUrl( - persesDashboard *unstructured.Unstructured, - apiConfig *ApiConfig, - logger *logr.Logger, -) (*validationResult, bool) { - if !isValidApiConfig(apiConfig) { - logger.Info("No Dash0 API endpoint has been provided via the operator configuration resource, the dashboard " + - "will not be updated in Dash0.") - return nil, false - } - if r.authToken == "" { - logger.Info("No auth token is set on the controller deployment, the dashboard will not be updated " + - "in Dash0.") - return nil, false - } - - dataset := apiConfig.Dataset - if dataset == "" { - dataset = "default" - } - - namespace, name, ok := readNamespaceAndName(persesDashboard, logger) - if !ok { - return nil, false - } - - dashboardUrl, dashboardOrigin := r.renderDashboardUrl( - apiConfig.Endpoint, - namespace, - name, - dataset, - ) - return &validationResult{ - namespace: namespace, - name: name, - url: dashboardUrl, - origin: dashboardOrigin, - authToken: r.authToken, - }, true -} - -func isValidApiConfig(apiConfig *ApiConfig) bool { - return apiConfig != nil && apiConfig.Endpoint != "" -} - -func (r *PersesDashboardReconciler) renderDashboardUrl( - dash0ApiEndpoint string, - namespace string, - name string, - dataset string, -) (string, string) { +func (r *PersesDashboardReconciler) renderDashboardUrl(preconditionCheckResult *preconditionValidationResult) string { dashboardOrigin := fmt.Sprintf( // we deliberately use _ as the separator, since that is an illegal character in Kubernetes names. This avoids // any potential naming collisions (e.g. namespace="abc" & name="def-ghi" vs. namespace="abc-def" & name="ghi"). "dash0-operator_%s_%s_%s_%s", r.pseudoClusterUid, - dataset, - namespace, - name, + urlEncodePathSegment(preconditionCheckResult.dataset), + preconditionCheckResult.k8sNamespace, + preconditionCheckResult.k8sName, ) - if !strings.HasSuffix(dash0ApiEndpoint, "/") { - dash0ApiEndpoint += "/" + if !strings.HasSuffix(preconditionCheckResult.apiEndpoint, "/") { + preconditionCheckResult.apiEndpoint += "/" } return fmt.Sprintf( "%sapi/dashboards/%s?dataset=%s", - dash0ApiEndpoint, + preconditionCheckResult.apiEndpoint, dashboardOrigin, - dataset, - ), dashboardOrigin -} - -func (r *PersesDashboardReconciler) handleNon2xxStatusCode( - res *http.Response, - dashboardOrigin string, - logger *logr.Logger, -) error { - defer func() { - _ = res.Body.Close() - }() - responseBody, readErr := io.ReadAll(res.Body) - if readErr != nil { - readBodyErr := fmt.Errorf("unable to read the API response payload after receiving status code %d when "+ - "trying to udpate/create/delete the dashboard %s", res.StatusCode, dashboardOrigin) - logger.Error(readBodyErr, "unable to read the API response payload") - return readBodyErr - } - - statusCodeErr := fmt.Errorf( - "unexpected status code %d when updating/creating/deleting the dashboard %s, response body is %s", - res.StatusCode, - dashboardOrigin, - string(responseBody), + url.QueryEscape(preconditionCheckResult.dataset), ) - logger.Error(statusCodeErr, "unexpected status code") - return statusCodeErr -} - -func readNamespaceAndName(persesDashboard *unstructured.Unstructured, logger *logr.Logger) (string, string, bool) { - metadataRaw := persesDashboard.Object["metadata"] - if metadataRaw == nil { - logger.Info("Perses dashboard payload has no metadata section, the dashboard will not be updated in Dash0.") - return "", "", false - } - metadata, ok := metadataRaw.(map[string]interface{}) - if !ok { - logger.Info("Perses dashboard payload metadata section is not a map, the dashboard will not be updated in " + - "Dash0.") - return "", "", false - } - namespace, ok := readStringAttribute(metadata, "namespace", logger) - if !ok { - return "", "", false - } - name, ok := readStringAttribute(metadata, "name", logger) - if !ok { - return "", "", false - } - return namespace, name, true -} - -func readStringAttribute(metadata map[string]interface{}, attributeName string, logger *logr.Logger) (string, bool) { - valueRaw := metadata[attributeName] - if valueRaw == nil { - logger.Info(fmt.Sprintf("Perses dashboard has no attribute metadata.%s, the dashboard will not be updated in "+ - "Dash0.", attributeName)) - return "", false - } - value, ok := valueRaw.(string) - if !ok { - logger.Info(fmt.Sprintf("Perses dashboard metadata.%s is not a string, the dashboard will not be updated "+ - "in Dash0.", attributeName)) - return "", false - } - if value == "" { - logger.Info(fmt.Sprintf("Perses dashboard has no attribute metadata.%s, the dashboard will not be updated in "+ - "Dash0.", attributeName)) - return "", false - } - return value, true } diff --git a/internal/dash0/controller/perses_dashboards_controller_test.go b/internal/dash0/controller/perses_dashboards_controller_test.go index a1e5ba63..71ead7ba 100644 --- a/internal/dash0/controller/perses_dashboards_controller_test.go +++ b/internal/dash0/controller/perses_dashboards_controller_test.go @@ -6,21 +6,21 @@ package controller import ( "context" "encoding/json" + "fmt" "time" - "github.com/h2non/gock" persesv1alpha1 "github.com/perses/perses-operator/api/v1alpha1" apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" - "k8s.io/apimachinery/pkg/types" "sigs.k8s.io/controller-runtime/pkg/client" "sigs.k8s.io/controller-runtime/pkg/controller/controllertest" "sigs.k8s.io/controller-runtime/pkg/event" "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/reconcile" + "github.com/h2non/gock" . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" @@ -28,12 +28,12 @@ import ( ) var ( - crdReconciler *PersesDashboardCrdReconciler - crd *apiextensionsv1.CustomResourceDefinition + persesDashboardCrdReconciler *PersesDashboardCrdReconciler + persesDashboardCrd *apiextensionsv1.CustomResourceDefinition - crdQualifiedName = types.NamespacedName{ - Name: "persesdashboards.perses.dev", - } + dashboardApiBasePath = "/api/dashboards/" + + defaultExpectedPathDashboard = fmt.Sprintf("%s.*%s", dashboardApiBasePath, "dash0-operator_.*_test-dataset_test-namespace_test-dashboard") ) var _ = Describe("The Perses dashboard controller", Ordered, func() { @@ -52,78 +52,78 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { }) It("does not create a Perses dashboard resource reconciler if there is no auth token", func() { - createCrdReconcilerWithoutAuthToken() + createPersesDashboardCrdReconcilerWithoutAuthToken() ensurePersesDashboardCrdExists(ctx) - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) - Expect(crdReconciler.persesDashboardReconciler).To(BeNil()) - crdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler).To(BeNil()) + persesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ Endpoint: ApiEndpointTest, Dataset: DatasetTest, }, &logger) - Expect(crdReconciler.persesDashboardReconciler).To(BeNil()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler).To(BeNil()) }) It("does not start watching Perses dashboards if the CRD does not exist and the API endpoint has not been provided", func() { - createCrdReconcilerWithAuthToken() - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeFalse()) + createPersesDashboardCrdReconcilerWithAuthToken() + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeFalse()) }) It("does not start watching Perses dashboards if the API endpoint has been provided but the CRD does not exist", func() { - createCrdReconcilerWithAuthToken() - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) - crdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + createPersesDashboardCrdReconcilerWithAuthToken() + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + persesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ Endpoint: ApiEndpointTest, Dataset: DatasetTest, }, &logger) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeFalse()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeFalse()) }) It("does not start watching Perses dashboards if the CRD exists but the API endpoint has not been provided", func() { - createCrdReconcilerWithAuthToken() + createPersesDashboardCrdReconcilerWithAuthToken() ensurePersesDashboardCrdExists(ctx) - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeFalse()) + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeFalse()) }) It("starts watching Perses dashboards if the CRD exists and the API endpoint has been provided", func() { - createCrdReconcilerWithAuthToken() + createPersesDashboardCrdReconcilerWithAuthToken() ensurePersesDashboardCrdExists(ctx) - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeFalse()) - crdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeFalse()) + persesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ Endpoint: ApiEndpointTest, Dataset: DatasetTest, }, &logger) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeTrue()) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeTrue()) }) It("starts watching Perses dashboards if API endpoint is provided and the CRD is created later on", func() { - createCrdReconcilerWithAuthToken() - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + createPersesDashboardCrdReconcilerWithAuthToken() + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) // provide the API endpoint first - crdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + persesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ Endpoint: ApiEndpointTest, Dataset: DatasetTest, }, &logger) // create the CRD a bit later time.Sleep(100 * time.Millisecond) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeFalse()) - dashboardCrd := ensurePersesDashboardCrdExists(ctx) + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeFalse()) + ensurePersesDashboardCrdExists(ctx) // watches are not triggered in unit tests - crdReconciler.Create( + persesDashboardCrdReconciler.Create( ctx, event.TypedCreateEvent[client.Object]{ - Object: dashboardCrd, + Object: persesDashboardCrd, }, &controllertest.TypedQueue[reconcile.Request]{}, ) // verify that the controller starts watching when it sees the CRD being created Eventually(func(g Gomega) { - g.Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeTrue()) + g.Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeTrue()) }).Should(Succeed()) }) }) @@ -132,27 +132,27 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { var persesDashboardReconciler *PersesDashboardReconciler BeforeAll(func() { - createCrdReconcilerWithAuthToken() + createPersesDashboardCrdReconcilerWithAuthToken() ensurePersesDashboardCrdExists(ctx) - Expect(crdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(persesDashboardCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) }) BeforeEach(func() { - crdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + persesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ Endpoint: ApiEndpointTest, Dataset: DatasetTest, }, &logger) - Expect(crdReconciler.persesDashboardReconciler.isWatching).To(BeTrue()) - persesDashboardReconciler = crdReconciler.persesDashboardReconciler + Expect(persesDashboardCrdReconciler.persesDashboardReconciler.isWatching.Load()).To(BeTrue()) + persesDashboardReconciler = persesDashboardCrdReconciler.persesDashboardReconciler }) AfterAll(func() { ensurePersesDashboardCrdDoesNotExist(ctx) }) - It("creates a Perses dashboard resource", func() { - expectPutRequest() + It("creates a dashboard", func() { + expectDashboardPutRequest(defaultExpectedPathDashboard) defer gock.Off() dashboardResource := createDashboardResource() @@ -167,8 +167,8 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { Expect(gock.IsDone()).To(BeTrue()) }) - It("updates a Perses dashboard resource", func() { - expectPutRequest() + It("updates a dashboard", func() { + expectDashboardPutRequest(defaultExpectedPathDashboard) defer gock.Off() dashboardResource := createDashboardResource() @@ -183,8 +183,8 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { Expect(gock.IsDone()).To(BeTrue()) }) - It("deletes a Perses dashboard resource", func() { - expectDeleteRequest() + It("deletes a dashboard", func() { + expectDashboardDeleteRequest(defaultExpectedPathDashboard) defer gock.Off() dashboardResource := createDashboardResource() @@ -200,10 +200,10 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { }) It("it ignores Perses dashboard resource changes if API endpoint is not configured", func() { - expectPutRequest() + expectDashboardPutRequest(defaultExpectedPathDashboard) defer gock.Off() - crdReconciler.RemoveApiEndpointAndDataset() + persesDashboardCrdReconciler.RemoveApiEndpointAndDataset() dashboardResource := createDashboardResource() persesDashboardReconciler.Create( @@ -219,16 +219,16 @@ var _ = Describe("The Perses dashboard controller", Ordered, func() { }) }) -func createCrdReconcilerWithoutAuthToken() { - crdReconciler = &PersesDashboardCrdReconciler{ +func createPersesDashboardCrdReconcilerWithoutAuthToken() { + persesDashboardCrdReconciler = &PersesDashboardCrdReconciler{ // We create the controller multiple times in tests, this option is required, otherwise the controller // runtime will complain. skipNameValidation: true, } } -func createCrdReconcilerWithAuthToken() { - crdReconciler = &PersesDashboardCrdReconciler{ +func createPersesDashboardCrdReconcilerWithAuthToken() { + persesDashboardCrdReconciler = &PersesDashboardCrdReconciler{ AuthToken: AuthorizationTokenTest, // We create the controller multiple times in tests, this option is required, otherwise the controller @@ -237,18 +237,20 @@ func createCrdReconcilerWithAuthToken() { } } -func expectPutRequest() { +func expectDashboardPutRequest(expectedPath string) { gock.New(ApiEndpointTest). - Put("/api/dashboards/.*"). + Put(expectedPath). MatchParam("dataset", DatasetTest). + Times(1). Reply(200). JSON(map[string]string{}) } -func expectDeleteRequest() { +func expectDashboardDeleteRequest(expectedPath string) { gock.New(ApiEndpointTest). - Delete("/api/dashboards/.*"). + Delete(expectedPath). MatchParam("dataset", DatasetTest). + Times(1). Reply(200). JSON(map[string]string{}) } @@ -273,62 +275,16 @@ func createDashboardResource() unstructured.Unstructured { return unstructuredObject } -func ensurePersesDashboardCrdExists(ctx context.Context) *apiextensionsv1.CustomResourceDefinition { - crd_ := EnsureKubernetesObjectExists( +func ensurePersesDashboardCrdExists(ctx context.Context) { + persesDashboardCrd = EnsurePersesDashboardCrdExists( ctx, k8sClient, - crdQualifiedName, - &apiextensionsv1.CustomResourceDefinition{}, - &apiextensionsv1.CustomResourceDefinition{ - TypeMeta: metav1.TypeMeta{ - APIVersion: "apiextensions.k8s.io/v1", - Kind: "CustomResourceDefinition", - }, - ObjectMeta: metav1.ObjectMeta{ - Name: "persesdashboards.perses.dev", - }, - Spec: apiextensionsv1.CustomResourceDefinitionSpec{ - Group: "perses.dev", - Names: apiextensionsv1.CustomResourceDefinitionNames{ - Kind: "PersesDashboard", - ListKind: "PersesDashboardList", - Plural: "persesdashboards", - Singular: "persesdashboard", - }, - Scope: "Namespaced", - Versions: []apiextensionsv1.CustomResourceDefinitionVersion{ - { - Name: "v1alpha1", - Schema: &apiextensionsv1.CustomResourceValidation{ - OpenAPIV3Schema: &apiextensionsv1.JSONSchemaProps{ - Type: "object", - Properties: map[string]apiextensionsv1.JSONSchemaProps{ - "apiVersion": {Type: "string"}, - "kind": {Type: "string"}, - "metadata": {Type: "object"}, - "spec": {Type: "object"}, - }, - Required: []string{ - "kind", - "spec", - }, - }, - }, - Served: true, - Storage: true, - }, - }, - }, - }, ) - - crd = crd_.(*apiextensionsv1.CustomResourceDefinition) - return crd } func ensurePersesDashboardCrdDoesNotExist(ctx context.Context) { - if crd != nil { - err := k8sClient.Delete(ctx, crd, &client.DeleteOptions{ + if persesDashboardCrd != nil { + err := k8sClient.Delete(ctx, persesDashboardCrd, &client.DeleteOptions{ GracePeriodSeconds: new(int64), }) if err != nil && apierrors.IsNotFound(err) { @@ -338,10 +294,11 @@ func ensurePersesDashboardCrdDoesNotExist(ctx context.Context) { } Eventually(func(g Gomega) { - err := k8sClient.Get(ctx, crdQualifiedName, &apiextensionsv1.CustomResourceDefinition{}) + err := k8sClient.Get(ctx, PersesDashboardCrdQualifiedName, &apiextensionsv1.CustomResourceDefinition{}) g.Expect(err).To(HaveOccurred()) g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) }).Should(Succeed()) - } + persesDashboardCrd = nil + } } diff --git a/internal/dash0/controller/prometheus_rules_controller.go b/internal/dash0/controller/prometheus_rules_controller.go new file mode 100644 index 00000000..493d3c5a --- /dev/null +++ b/internal/dash0/controller/prometheus_rules_controller.go @@ -0,0 +1,589 @@ +// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "bytes" + "context" + "encoding/json" + "fmt" + "net/http" + "net/url" + "strconv" + "strings" + "sync/atomic" + + "github.com/go-logr/logr" + prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + otelmetric "go.opentelemetry.io/otel/metric" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/client-go/util/workqueue" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + "sigs.k8s.io/yaml" + + "github.com/dash0hq/dash0-operator/internal/dash0/util" +) + +type PrometheusRuleCrdReconciler struct { + AuthToken string + mgr ctrl.Manager + skipNameValidation bool + prometheusRuleReconciler *PrometheusRuleReconciler + prometheusRuleCrdExists atomic.Bool +} + +//+kubebuilder:rbac:groups=apiextensions.k8s.io,resources=customresourcedefinitions,verbs=get;list;watch + +type CheckRule struct { + Name string `json:"name"` + Expression string `json:"expression"` + For string `json:"for,omitempty"` + Interval string `json:"interval,omitempty"` + KeepFiringFor string `json:"keepFiringFor,omitempty"` + Annotations map[string]string `json:"annotations"` // √ + Labels map[string]string `json:"labels"` +} + +var ( + prometheusRuleCrdReconcileRequestMetric otelmetric.Int64Counter + prometheusRuleReconcileRequestMetric otelmetric.Int64Counter +) + +func (r *PrometheusRuleCrdReconciler) Manager() ctrl.Manager { + return r.mgr +} + +func (r *PrometheusRuleCrdReconciler) GetAuthToken() string { + return r.AuthToken +} + +func (r *PrometheusRuleCrdReconciler) KindDisplayName() string { + return "Prometheus rule" +} + +func (r *PrometheusRuleCrdReconciler) Group() string { + return "monitoring.coreos.com" +} + +func (r *PrometheusRuleCrdReconciler) Kind() string { + return "PrometheusRule" +} + +func (r *PrometheusRuleCrdReconciler) Version() string { + return "v1" +} + +func (r *PrometheusRuleCrdReconciler) QualifiedKind() string { + return "prometheusrules.monitoring.coreos.com" +} + +func (r *PrometheusRuleCrdReconciler) ControllerName() string { + return "dash0_prometheus_rule_crd_controller" +} + +func (r *PrometheusRuleCrdReconciler) DoesCrdExist() *atomic.Bool { + return &r.prometheusRuleCrdExists +} + +func (r *PrometheusRuleCrdReconciler) SetCrdExists(exists bool) { + r.prometheusRuleCrdExists.Store(exists) +} + +func (r *PrometheusRuleCrdReconciler) SkipNameValidation() bool { + return r.skipNameValidation +} + +func (r *PrometheusRuleCrdReconciler) CreateResourceReconciler( + pseudoClusterUid types.UID, + authToken string, + httpClient *http.Client, +) { + r.prometheusRuleReconciler = &PrometheusRuleReconciler{ + pseudoClusterUid: pseudoClusterUid, + authToken: authToken, + httpClient: httpClient, + } +} + +func (r *PrometheusRuleCrdReconciler) ResourceReconciler() ThirdPartyResourceReconciler { + return r.prometheusRuleReconciler +} + +func (r *PrometheusRuleCrdReconciler) SetupWithManager( + ctx context.Context, + mgr ctrl.Manager, + startupK8sClient client.Client, + logger *logr.Logger, +) error { + r.mgr = mgr + return SetupThirdPartyCrdReconcilerWithManager( + ctx, + startupK8sClient, + r, + logger, + ) +} + +//+kubebuilder:rbac:groups=monitoring.coreos.com,resources=prometheusrules,verbs=get;list;watch + +func (r *PrometheusRuleCrdReconciler) Create( + ctx context.Context, + _ event.TypedCreateEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + if prometheusRuleCrdReconcileRequestMetric != nil { + prometheusRuleCrdReconcileRequestMetric.Add(ctx, 1) + } + logger := log.FromContext(ctx) + r.prometheusRuleCrdExists.Store(true) + maybeStartWatchingThirdPartyResources(r, false, &logger) +} + +func (r *PrometheusRuleCrdReconciler) Update( + context.Context, + event.TypedUpdateEvent[client.Object], + workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + // should not be called, we are not interested in updates + // note: update is called twice prior to delete, it is also called twice after an actual create +} + +func (r *PrometheusRuleCrdReconciler) Delete( + ctx context.Context, + _ event.TypedDeleteEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + if prometheusRuleCrdReconcileRequestMetric != nil { + prometheusRuleCrdReconcileRequestMetric.Add(ctx, 1) + } + logger := log.FromContext(ctx) + logger.Info("The PrometheusRule custom resource definition has been deleted.") + r.prometheusRuleCrdExists.Store(false) + + // Known issue: We would need to stop the watch for the Prometheus rule resources here, but the controller-runtime + // does not provide any API to stop a watch. + // An error will be logged every ten seconds until the controller process is restarted. +} + +func (r *PrometheusRuleCrdReconciler) Generic( + context.Context, + event.TypedGenericEvent[client.Object], + workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + // Should not be called, we are not interested in generic events. +} + +func (r *PrometheusRuleCrdReconciler) Reconcile( + _ context.Context, + _ reconcile.Request, +) (reconcile.Result, error) { + // Reconcile should not be called for the PrometheusRuleCrdReconciler CRD, as we are using the + // TypedEventHandler interface directly when setting up the watch. We still need to implement the method, as the + // controller builder's Complete method requires implementing the Reconciler interface. + return reconcile.Result{}, nil +} + +func (r *PrometheusRuleCrdReconciler) InitializeSelfMonitoringMetrics( + meter otelmetric.Meter, + metricNamePrefix string, + logger *logr.Logger, +) { + reconcileRequestMetricName := fmt.Sprintf("%s%s", metricNamePrefix, "prometheusrulecrd.reconcile_requests") + var err error + if prometheusRuleCrdReconcileRequestMetric, err = meter.Int64Counter( + reconcileRequestMetricName, + otelmetric.WithUnit("1"), + otelmetric.WithDescription("Counter for PrometheusRule CRD reconcile requests"), + ); err != nil { + logger.Error(err, "Cannot initialize the metric %s.") + } + + r.prometheusRuleReconciler.InitializeSelfMonitoringMetrics( + meter, + metricNamePrefix, + logger, + ) +} + +func (r *PrometheusRuleCrdReconciler) SetApiEndpointAndDataset( + apiConfig *ApiConfig, + logger *logr.Logger) { + if r.prometheusRuleReconciler == nil { + // If no auth token has been set via environment variable, we do not even create the prometheusRuleReconciler, + // hence this nil check is necessary. + return + } + r.prometheusRuleReconciler.apiConfig.Store(apiConfig) + maybeStartWatchingThirdPartyResources(r, false, logger) +} + +func (r *PrometheusRuleCrdReconciler) RemoveApiEndpointAndDataset() { + if r.prometheusRuleReconciler == nil { + // If no auth token has been set via environment variable, we do not even create the prometheusRuleReconciler, + // hence this nil check is necessary. + return + } + r.prometheusRuleReconciler.apiConfig.Store(nil) +} + +type PrometheusRuleReconciler struct { + isWatching atomic.Bool + pseudoClusterUid types.UID + httpClient *http.Client + apiConfig atomic.Pointer[ApiConfig] + authToken string +} + +func (r *PrometheusRuleReconciler) InitializeSelfMonitoringMetrics( + meter otelmetric.Meter, + metricNamePrefix string, + logger *logr.Logger, +) { + reconcileRequestMetricName := fmt.Sprintf("%s%s", metricNamePrefix, "prometheusrule.reconcile_requests") + var err error + if prometheusRuleReconcileRequestMetric, err = meter.Int64Counter( + reconcileRequestMetricName, + otelmetric.WithUnit("1"), + otelmetric.WithDescription("Counter for prometheus rule reconcile requests"), + ); err != nil { + logger.Error(err, "Cannot initialize the metric %s.") + } +} + +func (r *PrometheusRuleReconciler) KindDisplayName() string { + return "Prometheus rule" +} + +func (r *PrometheusRuleReconciler) ShortName() string { + return "rule" +} + +func (r *PrometheusRuleReconciler) IsWatching() *atomic.Bool { + return &r.isWatching +} + +func (r *PrometheusRuleReconciler) SetIsWatching(isWatching bool) { + r.isWatching.Store(isWatching) +} + +func (r *PrometheusRuleReconciler) GetAuthToken() string { + return r.authToken +} + +func (r *PrometheusRuleReconciler) GetApiConfig() *atomic.Pointer[ApiConfig] { + return &r.apiConfig +} + +func (r *PrometheusRuleReconciler) ControllerName() string { + return "dash0_prometheus_rule_controller" +} + +func (r *PrometheusRuleReconciler) HttpClient() *http.Client { + return r.httpClient +} + +func (r *PrometheusRuleReconciler) Create( + ctx context.Context, + e event.TypedCreateEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + if prometheusRuleReconcileRequestMetric != nil { + prometheusRuleReconcileRequestMetric.Add(ctx, 1) + } + + logger := log.FromContext(ctx) + logger.Info( + "Detected a new Prometheus rule resource", + "namespace", + e.Object.GetNamespace(), + "name", + e.Object.GetName(), + ) + + if err := util.RetryWithCustomBackoff( + "create rule(s)", + func() error { + return upsertViaApi(r, e.Object.(*unstructured.Unstructured), &logger) + }, + retrySettings, + true, + &logger, + ); err != nil { + logger.Error(err, "failed to create the rule(s)") + } +} + +func (r *PrometheusRuleReconciler) Update( + ctx context.Context, + e event.TypedUpdateEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + if prometheusRuleReconcileRequestMetric != nil { + prometheusRuleReconcileRequestMetric.Add(ctx, 1) + } + + logger := log.FromContext(ctx) + logger.Info( + "Detected a change for a Prometheus rule resource", + "namespace", + e.ObjectNew.GetNamespace(), + "name", + e.ObjectNew.GetName(), + ) + + if err := util.RetryWithCustomBackoff( + "update rule(s)", + func() error { + return upsertViaApi(r, e.ObjectNew.(*unstructured.Unstructured), &logger) + }, + retrySettings, + true, + &logger, + ); err != nil { + logger.Error(err, "failed to update the rule(s)") + } +} + +func (r *PrometheusRuleReconciler) Delete( + ctx context.Context, + e event.TypedDeleteEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + if prometheusRuleReconcileRequestMetric != nil { + prometheusRuleReconcileRequestMetric.Add(ctx, 1) + } + + logger := log.FromContext(ctx) + logger.Info( + "Detected the deletion of a Prometheus rule resource", + "namespace", + e.Object.GetNamespace(), + "name", + e.Object.GetName(), + ) + + if err := util.RetryWithCustomBackoff( + "delete rule(s)", + func() error { + return deleteViaApi(r, e.Object.(*unstructured.Unstructured), &logger) + }, + retrySettings, + true, + &logger, + ); err != nil { + logger.Error(err, "failed to delete the rule(s)") + } +} + +func (r *PrometheusRuleReconciler) Generic( + _ context.Context, + _ event.TypedGenericEvent[client.Object], + _ workqueue.TypedRateLimitingInterface[reconcile.Request], +) { + // ignoring generic events +} + +func (r *PrometheusRuleReconciler) Reconcile( + context.Context, + reconcile.Request, +) (reconcile.Result, error) { + // Reconcile should not be called on the PrometheusRuleReconciler, as we are using the TypedEventHandler interface + // directly when setting up the watch. We still need to implement the method, as the controller builder's Complete + // method requires implementing the Reconciler interface. + return reconcile.Result{}, nil +} + +func (r *PrometheusRuleReconciler) MapResourceToHttpRequests( + preconditionChecksResult *preconditionValidationResult, + action apiAction, + logger *logr.Logger, +) ([]*http.Request, error) { + specRaw := preconditionChecksResult.spec + + specAsYaml, err := yaml.Marshal(specRaw) + if err != nil { + logger.Error(err, "unable to marshal the Prometheus rule spec") + return nil, err + } + ruleSpec := prometheusv1.PrometheusRuleSpec{} + if err = yaml.Unmarshal(specAsYaml, &ruleSpec); err != nil { + logger.Error(err, "unable to unmarshal the Prometheus rule spec") + return nil, err + } + + urlPrefix := r.renderUrlPrefix(preconditionChecksResult) + requests := make([]*http.Request, 0) + for _, group := range ruleSpec.Groups { + for ruleIdx, rule := range group.Rules { + checkRuleUrl := fmt.Sprintf( + "%s_%s_%d?dataset=%s", + urlPrefix, + urlEncodePathSegment(group.Name), + ruleIdx, + url.QueryEscape(preconditionChecksResult.dataset), + ) + if request, ok := convertRuleToRequest( + checkRuleUrl, + action, + rule, + preconditionChecksResult, + group.Name, + group.Interval, + logger, + ); ok { + requests = append(requests, request) + } + } + } + + return requests, nil +} + +func (r *PrometheusRuleReconciler) renderUrlPrefix(preconditionCheckResult *preconditionValidationResult) string { + + ruleOriginPrefix := fmt.Sprintf( + // we deliberately use _ as the separator, since that is an illegal character in Kubernetes names. This avoids + // any potential naming collisions (e.g. namespace="abc" & name="def-ghi" vs. namespace="abc-def" & name="ghi"). + "dash0-operator_%s_%s_%s_%s", + r.pseudoClusterUid, + urlEncodePathSegment(preconditionCheckResult.dataset), + preconditionCheckResult.k8sNamespace, + preconditionCheckResult.k8sName, + ) + if !strings.HasSuffix(preconditionCheckResult.apiEndpoint, "/") { + preconditionCheckResult.apiEndpoint += "/" + } + return fmt.Sprintf( + "%sapi/alerting/check-rules/%s", + preconditionCheckResult.apiEndpoint, + ruleOriginPrefix, + ) +} + +func convertRuleToRequest( + checkRuleUrl string, + action apiAction, + rule prometheusv1.Rule, + preconditionCheckResult *preconditionValidationResult, + groupName string, + interval *prometheusv1.Duration, + logger *logr.Logger, +) (*http.Request, bool) { + checkRule, ok := convertRuleToCheckRule(rule, action, groupName, interval, logger) + if !ok { + return nil, false + } + + var req *http.Request + var err error + + //nolint:ineffassign + actionLabel := "?" + switch action { + case upsert: + actionLabel = "upsert" + serializedCheckRule, _ := json.Marshal(checkRule) + requestPayload := bytes.NewBuffer(serializedCheckRule) + req, err = http.NewRequest( + http.MethodPut, + checkRuleUrl, + requestPayload, + ) + case delete: + actionLabel = "delete" + req, err = http.NewRequest( + http.MethodDelete, + checkRuleUrl, + nil, + ) + default: + logger.Error(fmt.Errorf("unknown API action: %d", action), "unknown API action") + return nil, false + } + + if err != nil { + logger.Error( + err, + fmt.Sprintf( + "unable to create a new HTTP request to %s the rule at %s", + actionLabel, + checkRuleUrl, + )) + return nil, false + } + + req.Header.Set("Authorization", fmt.Sprintf("Bearer %s", preconditionCheckResult.authToken)) + if action == upsert { + req.Header.Set("Content-Type", "application/json") + } + + return req, true +} + +func convertRuleToCheckRule( + rule prometheusv1.Rule, + action apiAction, + groupName string, + interval *prometheusv1.Duration, + logger *logr.Logger, +) (*CheckRule, bool) { + if rule.Record != "" { + logger.Info("Skipping rule with record attribute", "record", rule.Record) + return nil, false + } + if rule.Alert == "" { + logger.Info("Skipping rule without alert attribute", "record", rule.Record) + return nil, false + } + + if action == delete { + // When deleting a rule, we do not need an actual payload, but do need to skip rules with rule.Record or without + // rule.Alert (that is why we still call convertRuleToCheckRule for deletions). + return &CheckRule{}, true + } + + // If action is not delete, it is upsert, and for that we need to create an actual payload, hence we need to convert + // the rule to a CheckRule. + checkRule := &CheckRule{ + Name: fmt.Sprintf("%s - %s", groupName, rule.Alert), + Interval: convertDuration(interval), + Annotations: rule.Annotations, + Labels: rule.Labels, + Expression: convertIntOrString(rule.Expr), + For: convertDuration(rule.For), + KeepFiringFor: convertNonEmptyDuration(rule.KeepFiringFor), + } + + return checkRule, true +} + +func convertDuration(duration *prometheusv1.Duration) string { + if duration == nil { + return "" + } + return string(*duration) +} + +func convertNonEmptyDuration(duration *prometheusv1.NonEmptyDuration) string { + if duration == nil { + return "" + } + return string(*duration) +} + +func convertIntOrString(intOrString intstr.IntOrString) string { + switch intOrString.Type { + case intstr.String: + return intOrString.StrVal + case intstr.Int: + return strconv.FormatInt(int64(intOrString.IntVal), 10) + } + return "" +} diff --git a/internal/dash0/controller/prometheus_rules_controller_test.go b/internal/dash0/controller/prometheus_rules_controller_test.go new file mode 100644 index 00000000..251b4fc5 --- /dev/null +++ b/internal/dash0/controller/prometheus_rules_controller_test.go @@ -0,0 +1,569 @@ +// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "encoding/json" + "fmt" + "time" + + prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/utils/ptr" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller/controllertest" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/h2non/gock" + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + . "github.com/dash0hq/dash0-operator/test/util" +) + +var ( + prometheusRuleCrdReconciler *PrometheusRuleCrdReconciler + prometheusRuleCrd *apiextensionsv1.CustomResourceDefinition + + checkRuleApiBasePath = "/api/alerting/check-rules/" + + defaultExpectedPathsCheckRules = []string{ + fmt.Sprintf("%s.*%s", checkRuleApiBasePath, "dash0-operator_.*_test-dataset_test-namespace_test-rule_group_1_0"), + fmt.Sprintf("%s.*%s", checkRuleApiBasePath, "dash0-operator_.*_test-dataset_test-namespace_test-rule_group_1_1"), + fmt.Sprintf("%s.*%s", checkRuleApiBasePath, "dash0-operator_.*_test-dataset_test-namespace_test-rule_group_2_0"), + fmt.Sprintf("%s.*%s", checkRuleApiBasePath, "dash0-operator_.*_test-dataset_test-namespace_test-rule_group_2_1"), + } +) + +var _ = Describe("The Prometheus rule controller", Ordered, func() { + ctx := context.Background() + logger := log.FromContext(ctx) + + BeforeAll(func() { + EnsureTestNamespaceExists(ctx, k8sClient) + EnsureOperatorNamespaceExists(ctx, k8sClient) + }) + + Describe("the Prometheus rule CRD reconciler", func() { + + AfterEach(func() { + ensurePrometheusRuleCrdDoesNotExist(ctx) + }) + + It("does not create a Prometheus rule resource reconciler if there is no auth token", func() { + createPrometheusRuleCrdReconcilerWithoutAuthToken() + ensurePrometheusRuleCrdExists(ctx) + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler).To(BeNil()) + prometheusRuleCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: ApiEndpointTest, + Dataset: DatasetTest, + }, &logger) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler).To(BeNil()) + }) + + It("does not start watching Prometheus rules if the CRD does not exist and the API endpoint has not been provided", func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeFalse()) + }) + + It("does not start watching Prometheus rules if the API endpoint has been provided but the CRD does not exist", func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + prometheusRuleCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: ApiEndpointTest, + Dataset: DatasetTest, + }, &logger) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeFalse()) + }) + + It("does not start watching Prometheus rules if the CRD exists but the API endpoint has not been provided", func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + ensurePrometheusRuleCrdExists(ctx) + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeFalse()) + }) + + It("starts watching Prometheus rules if the CRD exists and the API endpoint has been provided", func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + ensurePrometheusRuleCrdExists(ctx) + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeFalse()) + prometheusRuleCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: ApiEndpointTest, + Dataset: DatasetTest, + }, &logger) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeTrue()) + }) + + It("starts watching Prometheus rules if API endpoint is provided and the CRD is created later on", func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + + // provide the API endpoint first + prometheusRuleCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: ApiEndpointTest, + Dataset: DatasetTest, + }, &logger) + + // create the CRD a bit later + time.Sleep(100 * time.Millisecond) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeFalse()) + ensurePrometheusRuleCrdExists(ctx) + // watches are not triggered in unit tests + prometheusRuleCrdReconciler.Create( + ctx, + event.TypedCreateEvent[client.Object]{ + Object: prometheusRuleCrd, + }, + &controllertest.TypedQueue[reconcile.Request]{}, + ) + + // verify that the controller starts watching when it sees the CRD being created + Eventually(func(g Gomega) { + g.Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeTrue()) + }).Should(Succeed()) + }) + }) + + Describe("the Prometheus rule resource reconciler", func() { + var prometheusRuleReconciler *PrometheusRuleReconciler + + BeforeAll(func() { + createPrometheusRuleCrdReconcilerWithAuthToken() + ensurePrometheusRuleCrdExists(ctx) + + Expect(prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, k8sClient, &logger)).To(Succeed()) + }) + + BeforeEach(func() { + prometheusRuleCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{ + Endpoint: ApiEndpointTest, + Dataset: DatasetTest, + }, &logger) + Expect(prometheusRuleCrdReconciler.prometheusRuleReconciler.isWatching.Load()).To(BeTrue()) + prometheusRuleReconciler = prometheusRuleCrdReconciler.prometheusRuleReconciler + }) + + AfterAll(func() { + ensurePrometheusRuleCrdDoesNotExist(ctx) + }) + + It("creates check rules", func() { + expectRulePutRequests(defaultExpectedPathsCheckRules) + defer gock.Off() + + ruleResource := createRuleResource() + prometheusRuleReconciler.Create( + ctx, + event.TypedCreateEvent[client.Object]{ + Object: &ruleResource, + }, + &controllertest.TypedQueue[reconcile.Request]{}, + ) + + Expect(gock.IsDone()).To(BeTrue()) + }) + + It("updates check rules", func() { + expectRulePutRequests(defaultExpectedPathsCheckRules) + defer gock.Off() + + ruleResource := createRuleResource() + prometheusRuleReconciler.Update( + ctx, + event.TypedUpdateEvent[client.Object]{ + ObjectNew: &ruleResource, + }, + &controllertest.TypedQueue[reconcile.Request]{}, + ) + + Expect(gock.IsDone()).To(BeTrue()) + }) + + It("deletes check rules", func() { + expectRuleDeleteRequests(defaultExpectedPathsCheckRules) + defer gock.Off() + + ruleResource := createRuleResource() + prometheusRuleReconciler.Delete( + ctx, + event.TypedDeleteEvent[client.Object]{ + Object: &ruleResource, + }, + &controllertest.TypedQueue[reconcile.Request]{}, + ) + + Expect(gock.IsDone()).To(BeTrue()) + }) + + It("it ignores Prometheus rule resource changes if API endpoint is not configured", func() { + expectRulePutRequests(defaultExpectedPathsCheckRules) + defer gock.Off() + + prometheusRuleCrdReconciler.RemoveApiEndpointAndDataset() + + ruleResource := createRuleResource() + prometheusRuleReconciler.Create( + ctx, + event.TypedCreateEvent[client.Object]{ + Object: &ruleResource, + }, + &controllertest.TypedQueue[reconcile.Request]{}, + ) + + Expect(gock.IsPending()).To(BeTrue()) + }) + }) + + Describe("converting a single Prometheus rule to a check rule", func() { + It("should convert an empty rule to nil", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{}, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(rule).To(BeNil()) + }) + + It("should convert a record rule to nil", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{ + Record: "record", + Alert: "alert", + }, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(rule).To(BeNil()) + }) + + It("should convert a rule without alert to nil", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{ + Expr: intstr.FromString("expr"), + For: ptr.To(prometheusv1.Duration("10s")), + KeepFiringFor: ptr.To(prometheusv1.NonEmptyDuration("10s")), + }, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(rule).To(BeNil()) + }) + + It("should convert an almost empty rule", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{ + Alert: "alert", + }, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeTrue()) + Expect(rule.Name).To(Equal("group - alert")) + Expect(rule.Expression).To(Equal("0")) + Expect(rule.For).To(Equal("")) + Expect(rule.KeepFiringFor).To(Equal("")) + Expect(rule.Annotations).To(BeEmpty()) + Expect(rule.Labels).To(BeEmpty()) + }) + + It("should convert a rule with all attributes", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{ + Alert: "alert", + Expr: intstr.FromString("expr"), + For: ptr.To(prometheusv1.Duration("10s")), + KeepFiringFor: ptr.To(prometheusv1.NonEmptyDuration("20s")), + Annotations: map[string]string{ + "annotation1": "annotation value 1", + "annotation2": "annotation value 2", + }, + Labels: map[string]string{ + "label1": "label value 1", + "label2": "label value 2", + }, + }, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeTrue()) + Expect(rule.Name).To(Equal("group - alert")) + Expect(rule.Expression).To(Equal("expr")) + Expect(rule.For).To(Equal("10s")) + Expect(rule.KeepFiringFor).To(Equal("20s")) + Expect(rule.Annotations).To(HaveLen(2)) + Expect(rule.Annotations["annotation1"]).To(Equal("annotation value 1")) + Expect(rule.Annotations["annotation2"]).To(Equal("annotation value 2")) + Expect(rule.Labels).To(HaveLen(2)) + Expect(rule.Labels["label1"]).To(Equal("label value 1")) + Expect(rule.Labels["label2"]).To(Equal("label value 2")) + }) + + It("should convert a rule with an int expression", func() { + rule, ok := convertRuleToCheckRule( + prometheusv1.Rule{ + Alert: "alert", + Expr: intstr.FromInt32(123), + }, + upsert, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeTrue()) + Expect(rule.Name).To(Equal("group - alert")) + Expect(rule.Expression).To(Equal("123")) + }) + }) + + Describe("converting Prometheus rule resources to http requests", func() { + It("should convert an empty rule to nil", func() { + req, ok := convertRuleToRequest( + "https://api.dash0.com/alerting/check-rules/rule-id", + upsert, + prometheusv1.Rule{}, + &preconditionValidationResult{}, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(req).To(BeNil()) + }) + + It("should convert a record rule to nil", func() { + req, ok := convertRuleToRequest( + "https://api.dash0.com/alerting/check-rules/rule-id", + upsert, + prometheusv1.Rule{ + Record: "record", + Alert: "alert", + }, + &preconditionValidationResult{}, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(req).To(BeNil()) + }) + + It("should convert a rule without alert to nil", func() { + req, ok := convertRuleToRequest( + "https://api.dash0.com/alerting/check-rules/rule-id", + upsert, + prometheusv1.Rule{ + Expr: intstr.FromString("expr"), + For: ptr.To(prometheusv1.Duration("10s")), + KeepFiringFor: ptr.To(prometheusv1.NonEmptyDuration("10s")), + }, + &preconditionValidationResult{}, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeFalse()) + Expect(req).To(BeNil()) + }) + + It("should convert an almost empty rule", func() { + req, ok := convertRuleToRequest( + "https://api.dash0.com/alerting/check-rules/rule-id", + upsert, + prometheusv1.Rule{ + Alert: "alert", + }, + &preconditionValidationResult{}, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeTrue()) + Expect(req).ToNot(BeNil()) + Expect(req.URL.String()).To(Equal("https://api.dash0.com/alerting/check-rules/rule-id")) + }) + + It("should convert a rule with all attributes", func() { + req, ok := convertRuleToRequest( + "https://api.dash0.com/alerting/check-rules/rule-id", + upsert, + prometheusv1.Rule{ + Alert: "alert", + Expr: intstr.FromString("expr"), + For: ptr.To(prometheusv1.Duration("10s")), + KeepFiringFor: ptr.To(prometheusv1.NonEmptyDuration("20s")), + Annotations: map[string]string{ + "annotation1": "annotation value 1", + "annotation2": "annotation value 2", + }, + Labels: map[string]string{ + "label1": "label value 1", + "label2": "label value 2", + }, + }, + &preconditionValidationResult{}, + "group", + ptr.To(prometheusv1.Duration("10m")), + &logger, + ) + + Expect(ok).To(BeTrue()) + Expect(req).ToNot(BeNil()) + Expect(req.URL.String()).To(Equal("https://api.dash0.com/alerting/check-rules/rule-id")) + }) + }) +}) + +func createPrometheusRuleCrdReconcilerWithoutAuthToken() { + prometheusRuleCrdReconciler = &PrometheusRuleCrdReconciler{ + // We create the controller multiple times in tests, this option is required, otherwise the controller + // runtime will complain. + skipNameValidation: true, + } +} + +func createPrometheusRuleCrdReconcilerWithAuthToken() { + prometheusRuleCrdReconciler = &PrometheusRuleCrdReconciler{ + AuthToken: AuthorizationTokenTest, + + // We create the controller multiple times in tests, this option is required, otherwise the controller + // runtime will complain. + skipNameValidation: true, + } +} + +func expectRulePutRequests(expectedPaths []string) { + for _, expectedPath := range expectedPaths { + gock.New(ApiEndpointTest). + Put(expectedPath). + MatchParam("dataset", DatasetTest). + Times(1). + Reply(200). + JSON(map[string]string{}) + } +} + +func expectRuleDeleteRequests(expectedPaths []string) { + for _, expectedPath := range expectedPaths { + gock.New(ApiEndpointTest). + Delete(expectedPath). + MatchParam("dataset", DatasetTest). + Times(1). + Reply(200). + JSON(map[string]string{}) + } +} + +func createRuleResource() unstructured.Unstructured { + rule := prometheusv1.PrometheusRule{ + TypeMeta: metav1.TypeMeta{ + APIVersion: "monitoring.coreos.com/v1", + Kind: "PrometheusRule", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: "test-rule", + Namespace: TestNamespaceName, + }, + Spec: prometheusv1.PrometheusRuleSpec{ + Groups: []prometheusv1.RuleGroup{ + { + Name: "group_1", + Rules: []prometheusv1.Rule{ + { + Alert: "rule_1_1", + Expr: intstr.FromString("vector(1)"), + }, + { + Alert: "rule_1_2", + Expr: intstr.FromString("vector(1)"), + }, + { + Record: "rule_1_3", + Expr: intstr.FromString("vector(1)"), + }, + }, + }, + { + Name: "group_2", + Rules: []prometheusv1.Rule{ + { + Alert: "rule_2_1", + Expr: intstr.FromString("vector(1)"), + }, + { + Alert: "rule_2_2", + Expr: intstr.FromString("vector(1)"), + }, + }, + }, + }, + }, + } + marshalled, err := json.Marshal(rule) + Expect(err).NotTo(HaveOccurred()) + unstructuredObject := unstructured.Unstructured{} + err = json.Unmarshal(marshalled, &unstructuredObject) + Expect(err).NotTo(HaveOccurred()) + return unstructuredObject +} + +func ensurePrometheusRuleCrdExists(ctx context.Context) { + prometheusRuleCrd = EnsurePrometheusRuleCrdExists( + ctx, + k8sClient, + ) +} + +func ensurePrometheusRuleCrdDoesNotExist(ctx context.Context) { + if prometheusRuleCrd != nil { + err := k8sClient.Delete(ctx, prometheusRuleCrd, &client.DeleteOptions{ + GracePeriodSeconds: new(int64), + }) + if err != nil && apierrors.IsNotFound(err) { + return + } else if err != nil { + Expect(err).NotTo(HaveOccurred()) + } + + Eventually(func(g Gomega) { + err := k8sClient.Get(ctx, PrometheusRuleCrdQualifiedName, &apiextensionsv1.CustomResourceDefinition{}) + g.Expect(err).To(HaveOccurred()) + g.Expect(apierrors.IsNotFound(err)).To(BeTrue()) + }).Should(Succeed()) + + prometheusRuleCrd = nil + } +} diff --git a/internal/dash0/controller/third_party_crd_common.go b/internal/dash0/controller/third_party_crd_common.go new file mode 100644 index 00000000..fe95577e --- /dev/null +++ b/internal/dash0/controller/third_party_crd_common.go @@ -0,0 +1,533 @@ +// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc. +// SPDX-License-Identifier: Apache-2.0 + +package controller + +import ( + "context" + "fmt" + "io" + "net/http" + "net/url" + "strings" + "sync/atomic" + "time" + + "github.com/go-logr/logr" + corev1 "k8s.io/api/core/v1" + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/apimachinery/pkg/types" + "k8s.io/apimachinery/pkg/util/wait" + "k8s.io/utils/ptr" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/builder" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/controller" + "sigs.k8s.io/controller-runtime/pkg/event" + "sigs.k8s.io/controller-runtime/pkg/handler" + "sigs.k8s.io/controller-runtime/pkg/predicate" + "sigs.k8s.io/controller-runtime/pkg/reconcile" + + "github.com/dash0hq/dash0-operator/internal/dash0/util" +) + +type ApiClient interface { + SetApiEndpointAndDataset(*ApiConfig, *logr.Logger) + RemoveApiEndpointAndDataset() +} + +type ThirdPartyCrdReconciler interface { + handler.TypedEventHandler[client.Object, reconcile.Request] + reconcile.TypedReconciler[reconcile.Request] + + Manager() ctrl.Manager + GetAuthToken() string + KindDisplayName() string + Group() string + Kind() string + Version() string + QualifiedKind() string + ControllerName() string + DoesCrdExist() *atomic.Bool + SetCrdExists(bool) + SkipNameValidation() bool + CreateResourceReconciler(types.UID, string, *http.Client) + ResourceReconciler() ThirdPartyResourceReconciler +} + +type ThirdPartyResourceReconciler interface { + handler.TypedEventHandler[client.Object, reconcile.Request] + reconcile.TypedReconciler[reconcile.Request] + + KindDisplayName() string + ShortName() string + IsWatching() *atomic.Bool + SetIsWatching(bool) + GetAuthToken() string + GetApiConfig() *atomic.Pointer[ApiConfig] + ControllerName() string + HttpClient() *http.Client + MapResourceToHttpRequests(*preconditionValidationResult, apiAction, *logr.Logger) ([]*http.Request, error) +} + +type ApiConfig struct { + Endpoint string + Dataset string +} + +type apiAction int + +const ( + upsert apiAction = iota + delete +) + +type preconditionValidationResult struct { + executeRequest bool + authToken string + apiEndpoint string + dataset string + k8sNamespace string + k8sName string + spec map[string]interface{} +} + +var ( + retrySettings = wait.Backoff{ + Duration: 5 * time.Second, + Factor: 1.5, + Steps: 3, + } +) + +func SetupThirdPartyCrdReconcilerWithManager( + ctx context.Context, + k8sClient client.Client, + crdReconciler ThirdPartyCrdReconciler, + logger *logr.Logger, +) error { + authToken := crdReconciler.GetAuthToken() + if authToken == "" { + logger.Info(fmt.Sprintf("No Dash0 auth token has been provided via the operator configuration resource. "+ + "The operator will not watch for %s resources.", crdReconciler.KindDisplayName())) + return nil + } + + kubeSystemNamespace := &corev1.Namespace{} + if err := k8sClient.Get(ctx, client.ObjectKey{Name: "kube-system"}, kubeSystemNamespace); err != nil { + msg := "unable to get the kube-system namespace uid" + logger.Error(err, msg) + return fmt.Errorf("%s: %w", msg, err) + } + + crdReconciler.CreateResourceReconciler( + kubeSystemNamespace.UID, + authToken, + &http.Client{}, + ) + + if err := k8sClient.Get(ctx, client.ObjectKey{ + Name: crdReconciler.QualifiedKind(), + }, &apiextensionsv1.CustomResourceDefinition{}); err != nil { + if !apierrors.IsNotFound(err) { + logger.Error( + err, + fmt.Sprintf("unable to call client.Get(\"%s\") custom resource definition", + crdReconciler.QualifiedKind())) + return err + } + } else { + crdReconciler.SetCrdExists(true) + maybeStartWatchingThirdPartyResources(crdReconciler, true, logger) + } + + controllerBuilder := ctrl.NewControllerManagedBy(crdReconciler.Manager()). + Named(crdReconciler.ControllerName()). + Watches( + &apiextensionsv1.CustomResourceDefinition{}, + // Deliberately not using a convenience mechanism like &handler.EnqueueRequestForObject{} (which would + // feed all events into the Reconcile method) here, since using the lower-level TypedEventHandler interface + // directly allows us to distinguish between create and delete events more easily. + crdReconciler, + builder.WithPredicates( + makeFilterPredicate( + crdReconciler.Group(), + crdReconciler.Kind(), + ))) + if crdReconciler.SkipNameValidation() { + controllerBuilder = controllerBuilder.WithOptions(controller.TypedOptions[reconcile.Request]{ + SkipNameValidation: ptr.To(true), + }) + } + if err := controllerBuilder.Complete(crdReconciler); err != nil { + logger.Error(err, + fmt.Sprintf( + "unable to build the controller for the %s CRD reconciler", + crdReconciler.KindDisplayName(), + )) + return err + } + + return nil +} + +func makeFilterPredicate(group string, kind string) predicate.Funcs { + return predicate.Funcs{ + CreateFunc: func(e event.CreateEvent) bool { + return isMatchingCrd(group, kind, e.Object) + }, + UpdateFunc: func(e event.UpdateEvent) bool { + // We are not interested in updates, but we still need to define a filter predicate for it, otherwise _all_ + // update events for CRDs would be passed to our event handler. We always return false to ignore update + // events entirely. Same for generic events. + return false + }, + DeleteFunc: func(e event.DeleteEvent) bool { + return isMatchingCrd(group, kind, e.Object) + }, + GenericFunc: func(e event.GenericEvent) bool { + return false + }, + } +} + +func isMatchingCrd(group string, kind string, crd client.Object) bool { + if crdCasted, ok := crd.(*apiextensionsv1.CustomResourceDefinition); ok { + return crdCasted.Spec.Group == group && + crdCasted.Spec.Names.Kind == kind + } else { + return false + } +} + +func maybeStartWatchingThirdPartyResources( + crdReconciler ThirdPartyCrdReconciler, + isStartup bool, + logger *logr.Logger, +) { + if crdReconciler.ResourceReconciler().IsWatching().Load() { + // we are already watching, do not start a second watch + return + } + + if !crdReconciler.DoesCrdExist().Load() { + logger.Info( + fmt.Sprintf("The %s custom resource definition does not exist in this cluster, the operator will not "+ + "watch for %s resources.", + crdReconciler.QualifiedKind(), + crdReconciler.KindDisplayName(), + )) + return + } + + apiConfig := crdReconciler.ResourceReconciler().GetApiConfig().Load() + if !isValidApiConfig(apiConfig) { + if !isStartup { + // Silently ignore this missing precondition if it happens during the startup of the operator. It will + // be remedied automatically once the operator configuration resource is reconciled for the first time. + logger.Info( + fmt.Sprintf( + "The %s custom resource definition is present in this cluster, but no Dash0 API endpoint been "+ + "provided via the operator configuration resource, or the operator configuration resource "+ + "has not been reconciled yet. The operator will not watch for %s resources. "+ + "(If there is an operator configuration resource with an API endpoint present in the "+ + "cluster, it will be reconciled in a few seconds and this message can be safely ignored.)", + crdReconciler.QualifiedKind(), + crdReconciler.KindDisplayName(), + )) + } + return + } + + logger.Info( + fmt.Sprintf( + "The %s custom resource definition is present in this cluster, and a Dash0 API endpoint has been provided. "+ + "The operator will watch for %s resources.", + crdReconciler.QualifiedKind(), + crdReconciler.KindDisplayName(), + ), + ) + startWatchingThirdPartyResources(crdReconciler, logger) +} + +func startWatchingThirdPartyResources( + crdReconciler ThirdPartyCrdReconciler, + logger *logr.Logger, +) { + logger.Info(fmt.Sprintf("Setting up a watch for %s custom resources.", crdReconciler.KindDisplayName())) + + unstructuredGvkForPersesDashboards := &unstructured.Unstructured{} + unstructuredGvkForPersesDashboards.SetGroupVersionKind(schema.GroupVersionKind{ + Kind: crdReconciler.Kind(), + Group: crdReconciler.Group(), + Version: crdReconciler.Version(), + }) + + resourceReconciler := crdReconciler.ResourceReconciler() + controllerBuilder := ctrl.NewControllerManagedBy(crdReconciler.Manager()). + Named(resourceReconciler.ControllerName()). + Watches( + unstructuredGvkForPersesDashboards, + // Deliberately not using a convenience mechanism like &handler.EnqueueRequestForObject{} (which would + // feed all events into the Reconcile method) here, since using the lower-level TypedEventHandler interface + // directly allows us to distinguish between create and delete events more easily. + resourceReconciler, + ) + if crdReconciler.SkipNameValidation() { + controllerBuilder = controllerBuilder.WithOptions(controller.TypedOptions[reconcile.Request]{ + SkipNameValidation: ptr.To(true), + }) + } + if err := controllerBuilder.Complete(resourceReconciler); err != nil { + logger.Error(err, "unable to create a new controller for watching Perses dashboards") + return + } + resourceReconciler.SetIsWatching(true) +} + +func isValidApiConfig(apiConfig *ApiConfig) bool { + return apiConfig != nil && apiConfig.Endpoint != "" +} + +func urlEncodePathSegment(s string) string { + return url.PathEscape( + // For now the Dash0 backend treats %2F the same as "/", so we need to replace forward slashes with + // something other than %2F. + // See https://stackoverflow.com/questions/71581828/gin-problem-accessing-url-encoded-path-param-containing-forward-slash + strings.ReplaceAll(s, "/", "|"), + ) +} + +func upsertViaApi( + resourceReconciler ThirdPartyResourceReconciler, + resource *unstructured.Unstructured, + logger *logr.Logger, +) error { + preconditionChecksResult := validatePreconditions( + resourceReconciler, + resource, + logger, + ) + if !preconditionChecksResult.executeRequest { + return nil + } + + httpRequests, err := resourceReconciler.MapResourceToHttpRequests(preconditionChecksResult, upsert, logger) + if err != nil { + return err + } + + if len(httpRequests) == 0 { + logger.Info( + fmt.Sprintf( + "%s %s/%s did not contain any %s, skipping.", + resourceReconciler.KindDisplayName(), + resource.GetNamespace(), + resource.GetName(), + resourceReconciler.ShortName(), + )) + } + + return executeAllHttpRequests(resourceReconciler, httpRequests, "Creating/updating", logger) +} + +func deleteViaApi( + resourceReconciler ThirdPartyResourceReconciler, + resource *unstructured.Unstructured, + logger *logr.Logger, +) error { + preconditionChecksResult := validatePreconditions( + resourceReconciler, + resource, + logger, + ) + if !preconditionChecksResult.executeRequest { + return nil + } + + httpRequests, err := resourceReconciler.MapResourceToHttpRequests(preconditionChecksResult, delete, logger) + if err != nil { + return err + } + + if len(httpRequests) == 0 { + logger.Info( + fmt.Sprintf( + "%s %s/%s did not contain any %s, skipping.", + resourceReconciler.KindDisplayName(), + resource.GetNamespace(), + resource.GetName(), + resourceReconciler.ShortName(), + )) + } + + return executeAllHttpRequests(resourceReconciler, httpRequests, "Deleting", logger) +} + +func validatePreconditions( + resourceReconciler ThirdPartyResourceReconciler, + resource *unstructured.Unstructured, + logger *logr.Logger, +) *preconditionValidationResult { + namespace := resource.GetNamespace() + name := resource.GetName() + + apiConfig := resourceReconciler.GetApiConfig().Load() + if !isValidApiConfig(apiConfig) { + logger.Info( + fmt.Sprintf( + "No Dash0 API endpoint has been provided via the operator configuration resource, "+ + "the %s(s) from %s/%s will not be updated in Dash0.", + resourceReconciler.ShortName(), + namespace, + name, + )) + return &preconditionValidationResult{ + executeRequest: false, + } + } + + authToken := resourceReconciler.GetAuthToken() + if authToken == "" { + logger.Info( + fmt.Sprintf( + "No auth token is set on the controller deployment, the %s(s) from %s/%s not be updated in Dash0.", + resourceReconciler.ShortName(), + namespace, + name, + )) + return &preconditionValidationResult{ + executeRequest: false, + } + } + + dataset := apiConfig.Dataset + if dataset == "" { + dataset = util.DatasetDefault + } + + specRaw := resource.Object["spec"] + if specRaw == nil { + logger.Info( + fmt.Sprintf( + "%s %s/%s has no spec, the %s(s) from will not be updated in Dash0.", + resourceReconciler.KindDisplayName(), + namespace, + name, + resourceReconciler.ShortName(), + )) + return &preconditionValidationResult{ + executeRequest: false, + } + } + spec, ok := specRaw.(map[string]interface{}) + if !ok { + logger.Info( + fmt.Sprintf( + "The %s spec in %s/%s is not a map, the %s(s) will not be updated in Dash0.", + resourceReconciler.KindDisplayName(), + namespace, + name, + resourceReconciler.ShortName(), + )) + return &preconditionValidationResult{ + executeRequest: false, + } + } + + return &preconditionValidationResult{ + executeRequest: true, + authToken: authToken, + apiEndpoint: apiConfig.Endpoint, + dataset: dataset, + k8sNamespace: namespace, + k8sName: name, + spec: spec, + } +} + +func executeAllHttpRequests( + resourceReconciler ThirdPartyResourceReconciler, + allRequests []*http.Request, + actionLabel string, + logger *logr.Logger, +) error { + for _, req := range allRequests { + if err := executeSingleHttpRequest(resourceReconciler, req, actionLabel, logger); err != nil { + return err + } + } + return nil +} + +func executeSingleHttpRequest( + resourceReconciler ThirdPartyResourceReconciler, + req *http.Request, + actionLabel string, + logger *logr.Logger, +) error { + logger.Info( + fmt.Sprintf( + "%s %s at %s in Dash0", + actionLabel, + resourceReconciler.ShortName(), + req.URL.String(), + )) + res, err := resourceReconciler.HttpClient().Do(req) + if err != nil { + logger.Error(err, + fmt.Sprintf( + "unable to execute the HTTP request to create/update/delete the %s at %s", + resourceReconciler.ShortName(), + req.URL.String(), + )) + return err + } + + if res.StatusCode < http.StatusOK || res.StatusCode >= http.StatusMultipleChoices { + return handleNon2xxStatusCode(resourceReconciler, req, res, logger) + } + + // http status code was 2xx, discard the response body and close it + defer func() { + _, _ = io.Copy(io.Discard, res.Body) + _ = res.Body.Close() + }() + + return nil +} + +func handleNon2xxStatusCode( + resourceReconciler ThirdPartyResourceReconciler, + req *http.Request, + res *http.Response, + logger *logr.Logger, +) error { + defer func() { + _ = res.Body.Close() + }() + responseBody, readErr := io.ReadAll(res.Body) + if readErr != nil { + readBodyErr := fmt.Errorf("unable to read the API response payload after receiving status code %d when "+ + "trying to udpate/create/delete the %s at %s", + res.StatusCode, + resourceReconciler.ShortName(), + req.URL.String(), + ) + logger.Error(readBodyErr, "unable to read the API response payload") + return readBodyErr + } + + statusCodeErr := fmt.Errorf( + "unexpected status code %d when updating/creating/deleting the %s at %s, response body is %s", + res.StatusCode, + resourceReconciler.ShortName(), + req.URL.String(), + string(responseBody), + ) + logger.Error(statusCodeErr, "unexpected status code") + return statusCodeErr +} diff --git a/internal/dash0/selfmonitoringapiaccess/self_monitoring_and_api_access.go b/internal/dash0/selfmonitoringapiaccess/self_monitoring_and_api_access.go index 4b726f4e..fe0c445e 100644 --- a/internal/dash0/selfmonitoringapiaccess/self_monitoring_and_api_access.go +++ b/internal/dash0/selfmonitoringapiaccess/self_monitoring_and_api_access.go @@ -633,7 +633,7 @@ func ConvertExportConfigurationToEnvVarSettings(selfMonitoringExport dash0v1alph Name: util.AuthorizationHeaderName, Value: authHeaderValue, }} - if dash0Export.Dataset != "" && dash0Export.Dataset != "default" { + if dash0Export.Dataset != "" && dash0Export.Dataset != util.DatasetDefault { headers = append(headers, dash0v1alpha1.Header{ Name: util.Dash0DatasetHeaderName, Value: dash0Export.Dataset, diff --git a/internal/dash0/util/constants.go b/internal/dash0/util/constants.go index d6ecbf8e..de454c7f 100644 --- a/internal/dash0/util/constants.go +++ b/internal/dash0/util/constants.go @@ -6,6 +6,7 @@ package util const ( AuthorizationHeaderName = "Authorization" Dash0DatasetHeaderName = "Dash0-Dataset" + DatasetDefault = "default" DatasetInsights = "dash0-internal" SelfMonitoringAndApiAuthTokenEnvVarName = "SELF_MONITORING_AND_API_AUTH_TOKEN" diff --git a/test-resources/bin/test-cleanup.sh b/test-resources/bin/test-cleanup.sh index c0036e2f..22690a48 100755 --- a/test-resources/bin/test-cleanup.sh +++ b/test-resources/bin/test-cleanup.sh @@ -38,15 +38,16 @@ kubectl delete secret \ kubectl delete ns dash0-system --ignore-not-found -# deliberately deleting the dashboard after undeploying the operator to avoid deleting the dashboard in Dash0 every time. +# deliberately deleting dashboards & check rules after undeploying the operator to avoid deleting these items in +# Dash0 every time. kubectl delete -n ${target_namespace} -f test-resources/customresources/persesdashboard/persesdashboard.yaml || true +kubectl delete -n ${target_namespace} -f test-resources/customresources/prometheusrule/prometheusrule.yaml || true kubectl delete --ignore-not-found=true customresourcedefinition dash0monitorings.operator.dash0.com kubectl delete --ignore-not-found=true customresourcedefinition dash0operatorconfigurations.operator.dash0.com kubectl delete --ignore-not-found=true customresourcedefinition dash0operatorconfigurations.operator.dash0.com -kubectl delete --ignore-not-found=true customresourcedefinition perses.perses.dev kubectl delete --ignore-not-found=true customresourcedefinition persesdashboards.perses.dev -kubectl delete --ignore-not-found=true customresourcedefinition persesdatasources.perses.dev +kubectl delete --ignore-not-found=true customresourcedefinition prometheusrules.monitoring.coreos.com # The following resources are deleted automatically with helm uninstall, unless for example when the operator manager # crashes and the helm pre-delete helm hook cannot run, then they might be left behind. diff --git a/test-resources/bin/test-scenario-01-aum-operator-cr.sh b/test-resources/bin/test-scenario-01-aum-operator-cr.sh index 4297e05f..25fefdb7 100755 --- a/test-resources/bin/test-scenario-01-aum-operator-cr.sh +++ b/test-resources/bin/test-scenario-01-aum-operator-cr.sh @@ -34,16 +34,12 @@ kubectl create secret \ --from-literal=token="${DASH0_AUTHORIZATION_TOKEN}" finish_step -echo "STEP $step_counter: install foreign custom resource definitions" -install_foreign_crds +echo "STEP $step_counter: install third-party custom resource definitions" +install_third_party_crds finish_step -if [[ "${DEPLOY_PERSES_DASHBOARD:-}" == true ]]; then - echo "STEP $step_counter: deploy a Perses dashboard resource to namespace ${target_namespace}" - kubectl apply -n ${target_namespace} -f test-resources/customresources/persesdashboard/persesdashboard.yaml - finish_step -fi +install_third_party_resources echo "STEP $step_counter: rebuild images" build_all_images diff --git a/test-resources/bin/test-scenario-02-operator-cr-aum.sh b/test-resources/bin/test-scenario-02-operator-cr-aum.sh index 007cc3f5..57afcc21 100755 --- a/test-resources/bin/test-scenario-02-operator-cr-aum.sh +++ b/test-resources/bin/test-scenario-02-operator-cr-aum.sh @@ -34,8 +34,8 @@ kubectl create secret \ --from-literal=token="${DASH0_AUTHORIZATION_TOKEN}" finish_step -echo "STEP $step_counter: install foreign custom resource definitions" -install_foreign_crds +echo "STEP $step_counter: install third-party custom resource definitions" +install_third_party_crds finish_step echo "STEP $step_counter: rebuild images" @@ -71,8 +71,4 @@ if [[ "${DEPLOY_APPLICATION_UNDER_MONITORING:-}" != false ]]; then finish_step fi -if [[ "${DEPLOY_PERSES_DASHBOARD:-}" == true ]]; then - echo "STEP $step_counter: deploy a Perses dashboard resource to namespace ${target_namespace}" - kubectl apply -n ${target_namespace} -f test-resources/customresources/persesdashboard/persesdashboard.yaml - finish_step -fi +install_third_party_resources diff --git a/test-resources/bin/util b/test-resources/bin/util index 18afd709..1869e1ee 100644 --- a/test-resources/bin/util +++ b/test-resources/bin/util @@ -193,7 +193,20 @@ install_monitoring_resource() { kubectl wait --namespace ${target_namespace} dash0monitorings.operator.dash0.com/dash0-monitoring-resource --for condition=Available } -install_foreign_crds() { +install_third_party_crds() { kubectl apply --server-side -f https://raw.githubusercontent.com/perses/perses-operator/main/config/crd/bases/perses.dev_persesdashboards.yaml + kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.77.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml } +install_third_party_resources() { + if [[ "${DEPLOY_PERSES_DASHBOARD:-}" == true ]]; then + echo "STEP $step_counter: deploy a Perses dashboard resource to namespace ${target_namespace}" + kubectl apply -n ${target_namespace} -f test-resources/customresources/persesdashboard/persesdashboard.yaml + finish_step + fi + if [[ "${DEPLOY_PROMETHEUS_RULE:-}" == true ]]; then + echo "STEP $step_counter: deploy a Prometheus rule resource to namespace ${target_namespace}" + kubectl apply -n ${target_namespace} -f test-resources/customresources/prometheusrule/prometheusrule.yaml + finish_step + fi +} \ No newline at end of file diff --git a/test-resources/customresources/prometheusrule/prometheusrule.yaml b/test-resources/customresources/prometheusrule/prometheusrule.yaml new file mode 100644 index 00000000..e13a1aa6 --- /dev/null +++ b/test-resources/customresources/prometheusrule/prometheusrule.yaml @@ -0,0 +1,44 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + labels: + prometheus: example + role: alert-rules + name: prometheus-example-rules +spec: + groups: + - name: dash0/k8s + interval: 5m + limit: 10 + partial_response_strategy: warn + rules: + - alert: K8s Deployment replicas mismatch + expr: "kube_deployment_spec_replicas != kube_deployment_status_replicas_available" + for: 10s + keep_firing_for: 10s + annotations: + description: "K8s Deployment replicas mismatch" + summary: "K8s Deployment replicas mismatch {{ $labels.k8s_node_name }}" + some-annotation: "another annotation" + labels: + label-1: label value 1 + label-2: label value 2 + label-3: label value 3 + - alert: K8s pod crash looping + expr: "increase(kube_pod_container_status_restarts_total[1m]) > $__threshold" + annotations: + description: "Pod labels.namespace/labels.pod is crash looping. VALUE = value, LABELS = labels" + summary: "K8s pod crash looping K8s pod crash looping {{ $label. k8s_node_name }}" + - record: will be skipped + expr: "vector(1)" + - name: dash0/collector + interval: 10m + limit: 15 + partial_response_strategy: abort + rules: + - alert: exporter send failed spans + expr: "sum by (cloud_region) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > $__threshold" + for: 5m + annotations: + description: "dash0-collector - exporter send failed spans" + summary: "dash0-collector - exporter send failed spans {{ $labels.cloud_region }}" diff --git a/test/e2e/application_under_test.go b/test/e2e/application_under_test.go index 9e850556..9532040d 100644 --- a/test/e2e/application_under_test.go +++ b/test/e2e/application_under_test.go @@ -11,7 +11,7 @@ import ( "gopkg.in/yaml.v3" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/collector.go b/test/e2e/collector.go index 604025fc..c0cd4635 100644 --- a/test/e2e/collector.go +++ b/test/e2e/collector.go @@ -9,7 +9,7 @@ import ( "strings" "time" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/container_images.go b/test/e2e/container_images.go index bbb190b3..21f3bf10 100644 --- a/test/e2e/container_images.go +++ b/test/e2e/container_images.go @@ -8,7 +8,7 @@ import ( "os/exec" "strings" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/dash0_monitoring_resource.go b/test/e2e/dash0_monitoring_resource.go index 08b315b0..50e81a3f 100644 --- a/test/e2e/dash0_monitoring_resource.go +++ b/test/e2e/dash0_monitoring_resource.go @@ -13,7 +13,7 @@ import ( dash0v1alpha1 "github.com/dash0hq/dash0-operator/api/dash0monitoring/v1alpha1" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/dash0_operator_configuration_resource.go b/test/e2e/dash0_operator_configuration_resource.go index 93a65ca5..b559e19e 100644 --- a/test/e2e/dash0_operator_configuration_resource.go +++ b/test/e2e/dash0_operator_configuration_resource.go @@ -12,7 +12,7 @@ import ( "text/template" "time" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/e2e_test.go b/test/e2e/e2e_test.go index b96305db..e70211c1 100644 --- a/test/e2e/e2e_test.go +++ b/test/e2e/e2e_test.go @@ -40,7 +40,7 @@ var _ = Describe("Dash0 Kubernetes Operator", Ordered, func() { pwdOutput, err := run(exec.Command("pwd"), false) Expect(err).NotTo(HaveOccurred()) workingDir = strings.TrimSpace(pwdOutput) - _, _ = fmt.Fprintf(GinkgoWriter, "workingDir: %s\n", workingDir) + e2ePrint("workingDir: %s\n", workingDir) env, err := readDotEnvFile(dotEnvFile) Expect(err).NotTo(HaveOccurred()) @@ -914,9 +914,9 @@ func runInParallelForAllWorkloadTypes[C workloadConfig]( go func(cfg C) { defer GinkgoRecover() defer wg.Done() - _, _ = fmt.Fprintf(GinkgoWriter, "(before test step: %s)\n", workloadTypeString) + e2ePrint("(before test step: %s)\n", workloadTypeString) testStep(cfg) - _, _ = fmt.Fprintf(GinkgoWriter, "(after test step: %s)\n", workloadTypeString) + e2ePrint("(after test step: %s)\n", workloadTypeString) passed[workloadTypeString] = true }(config) } diff --git a/test/e2e/events.go b/test/e2e/events.go index d5722c7c..afa97ff0 100644 --- a/test/e2e/events.go +++ b/test/e2e/events.go @@ -12,7 +12,6 @@ import ( "github.com/dash0hq/dash0-operator/internal/dash0/util" - //nolint:golint,revive . "github.com/onsi/gomega" testUtil "github.com/dash0hq/dash0-operator/test/util" diff --git a/test/e2e/labels.go b/test/e2e/labels.go index 0f5daea2..a53a8462 100644 --- a/test/e2e/labels.go +++ b/test/e2e/labels.go @@ -11,7 +11,7 @@ import ( "github.com/dash0hq/dash0-operator/internal/dash0/util" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/operator.go b/test/e2e/operator.go index 3ff5af4b..8a6a10a7 100644 --- a/test/e2e/operator.go +++ b/test/e2e/operator.go @@ -12,7 +12,7 @@ import ( "github.com/dash0hq/dash0-operator/internal/dash0/startup" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) @@ -106,7 +106,7 @@ func deployOperatorWithAutoOperationConfiguration( return err } - _, _ = fmt.Fprintf(GinkgoWriter, "output of helm install:\n%s", output) + e2ePrint("output of helm install:\n%s", output) waitForManagerPodAndWebhookToStart(operatorNamespace) if operatorConfigurationValues != nil { @@ -190,8 +190,7 @@ func ensureDash0OperatorHelmRepoIsInstalled( Expect(err).NotTo(HaveOccurred()) if !regexp.MustCompile( fmt.Sprintf("%s\\s+%s", repositoryName, operatorHelmChartUrl)).MatchString(repoList) { - _, _ = fmt.Fprintf( - GinkgoWriter, + e2ePrint( "The helm repo %s (%s) has not been found, adding it now.\n", repositoryName, operatorHelmChartUrl, @@ -207,8 +206,7 @@ func ensureDash0OperatorHelmRepoIsInstalled( ))).To(Succeed()) Expect(runAndIgnoreOutput(exec.Command("helm", "repo", "update"))).To(Succeed()) } else { - _, _ = fmt.Fprintf( - GinkgoWriter, + e2ePrint( "The helm repo %s (%s) is already installed, updating it now.\n", repositoryName, operatorHelmChartUrl, @@ -333,7 +331,7 @@ func upgradeOperator( output, err := run(exec.Command("helm", arguments...)) Expect(err).NotTo(HaveOccurred()) - _, _ = fmt.Fprintf(GinkgoWriter, "output of helm upgrade:\n%s", output) + e2ePrint("output of helm upgrade:\n%s", output) By("waiting shortly, to give the operator time to restart after helm upgrade") time.Sleep(5 * time.Second) diff --git a/test/e2e/run_command.go b/test/e2e/run_command.go index ddaa9ccb..70f54fad 100644 --- a/test/e2e/run_command.go +++ b/test/e2e/run_command.go @@ -10,7 +10,6 @@ import ( "strings" "time" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive . "github.com/onsi/gomega" ) @@ -38,17 +37,17 @@ func run(cmd *exec.Cmd, logCommandArgs ...bool) (string, error) { cmd.Dir = dir if err := os.Chdir(cmd.Dir); err != nil { - _, _ = fmt.Fprintf(GinkgoWriter, "chdir dir: %s\n", err) + e2ePrint("chdir dir: %s\n", err) } cmd.Env = append(os.Environ(), "GO111MODULE=on") command := strings.Join(cmd.Args, " ") if logCommand { - _, _ = fmt.Fprintf(GinkgoWriter, "running: %s\n", command) + e2ePrint("running: %s\n", command) } output, err := cmd.CombinedOutput() if alwaysLogOutput { - _, _ = fmt.Fprintf(GinkgoWriter, "output: %s\n", string(output)) + e2ePrint("output: %s\n", string(output)) } if err != nil { return string(output), fmt.Errorf("%s failed with error: (%v) %s", command, err, string(output)) diff --git a/test/e2e/setup_teardown.go b/test/e2e/setup_teardown.go index a99a4a5c..48ea5a22 100644 --- a/test/e2e/setup_teardown.go +++ b/test/e2e/setup_teardown.go @@ -9,7 +9,7 @@ import ( "strconv" "strings" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/e2e/spans.go b/test/e2e/spans.go index 82d3c571..43cffa98 100644 --- a/test/e2e/spans.go +++ b/test/e2e/spans.go @@ -14,7 +14,6 @@ import ( "go.opentelemetry.io/collector/pdata/ptrace" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive . "github.com/onsi/gomega" ) @@ -80,7 +79,7 @@ func sendRequest(g Gomega, port int, httpPathWithQuery string) { }() responseBody, err := io.ReadAll(response.Body) if err != nil { - _, _ = fmt.Fprintf(GinkgoWriter, "could not read http response from %s: %s\n", url, err.Error()) + e2ePrint("could not read http response from %s: %s\n", url, err.Error()) } g.Expect(err).NotTo(HaveOccurred()) g.Expect(responseBody).To(ContainSubstring("We make Observability easy for every developer.")) diff --git a/test/e2e/util.go b/test/e2e/util.go new file mode 100644 index 00000000..defc5da5 --- /dev/null +++ b/test/e2e/util.go @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc. +// SPDX-License-Identifier: Apache-2.0 + +package e2e + +import ( + "fmt" + + . "github.com/onsi/ginkgo/v2" +) + +func e2ePrint(format string, a ...any) { + fmt.Fprintf(GinkgoWriter, format, a...) +} diff --git a/test/e2e/verify_instrumentation.go b/test/e2e/verify_instrumentation.go index 6663bbe0..70f4d58a 100644 --- a/test/e2e/verify_instrumentation.go +++ b/test/e2e/verify_instrumentation.go @@ -8,7 +8,7 @@ import ( "os/exec" "time" - . "github.com/onsi/ginkgo/v2" //nolint:golint,revive + . "github.com/onsi/ginkgo/v2" . "github.com/onsi/gomega" ) diff --git a/test/util/crds/monitoring.coreos.com_prometheusrules.yaml b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml new file mode 100644 index 00000000..ab08bf10 --- /dev/null +++ b/test/util/crds/monitoring.coreos.com_prometheusrules.yaml @@ -0,0 +1,141 @@ +# https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.77.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.16.1 + operator.prometheus.io/version: 0.77.1 + name: prometheusrules.monitoring.coreos.com +spec: + group: monitoring.coreos.com + names: + categories: + - prometheus-operator + kind: PrometheusRule + listKind: PrometheusRuleList + plural: prometheusrules + shortNames: + - promrule + singular: prometheusrule + scope: Namespaced + versions: + - name: v1 + schema: + openAPIV3Schema: + description: |- + The `PrometheusRule` custom resource definition (CRD) defines [alerting](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) and [recording](https://prometheus.io/docs/prometheus/latest/configuration/recording_rules/) rules to be evaluated by `Prometheus` or `ThanosRuler` objects. + + `Prometheus` and `ThanosRuler` objects select `PrometheusRule` objects using label and namespace selectors. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: Specification of desired alerting rule definitions for Prometheus. + properties: + groups: + description: Content of Prometheus rule file + items: + description: RuleGroup is a list of sequentially evaluated recording + and alerting rules. + properties: + interval: + description: Interval determines how often rules in the group + are evaluated. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + limit: + description: |- + Limit the number of alerts an alerting rule and series a recording + rule can produce. + Limit is supported starting with Prometheus >= 2.31 and Thanos Ruler >= 0.24. + type: integer + name: + description: Name of the rule group. + minLength: 1 + type: string + partial_response_strategy: + description: |- + PartialResponseStrategy is only used by ThanosRuler and will + be ignored by Prometheus instances. + More info: https://github.com/thanos-io/thanos/blob/main/docs/components/rule.md#partial-response + pattern: ^(?i)(abort|warn)?$ + type: string + rules: + description: List of alerting and recording rules. + items: + description: |- + Rule describes an alerting or recording rule + See Prometheus documentation: [alerting](https://www.prometheus.io/docs/prometheus/latest/configuration/alerting_rules/) or [recording](https://www.prometheus.io/docs/prometheus/latest/configuration/recording_rules/#recording-rules) rule + properties: + alert: + description: |- + Name of the alert. Must be a valid label value. + Only one of `record` and `alert` must be set. + type: string + annotations: + additionalProperties: + type: string + description: |- + Annotations to add to each alert. + Only valid for alerting rules. + type: object + expr: + anyOf: + - type: integer + - type: string + description: PromQL expression to evaluate. + x-kubernetes-int-or-string: true + for: + description: Alerts are considered firing once they have + been returned for this long. + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + keep_firing_for: + description: KeepFiringFor defines how long an alert will + continue firing after the condition that triggered it + has cleared. + minLength: 1 + pattern: ^(0|(([0-9]+)y)?(([0-9]+)w)?(([0-9]+)d)?(([0-9]+)h)?(([0-9]+)m)?(([0-9]+)s)?(([0-9]+)ms)?)$ + type: string + labels: + additionalProperties: + type: string + description: Labels to add or overwrite. + type: object + record: + description: |- + Name of the time series to output to. Must be a valid metric name. + Only one of `record` and `alert` must be set. + type: string + required: + - expr + type: object + type: array + required: + - name + type: object + type: array + x-kubernetes-list-map-keys: + - name + x-kubernetes-list-type: map + type: object + required: + - spec + type: object + served: true + storage: true diff --git a/test/util/crds/perses.dev_persesdashboards.yaml b/test/util/crds/perses.dev_persesdashboards.yaml new file mode 100644 index 00000000..a9390c37 --- /dev/null +++ b/test/util/crds/perses.dev_persesdashboards.yaml @@ -0,0 +1,258 @@ +# https://raw.githubusercontent.com/perses/perses-operator/b4c59f10020fd77d2eab601e333970b8ea208661/config/crd/bases/perses.dev_persesdashboards.yaml +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.14.0 + name: persesdashboards.perses.dev +spec: + group: perses.dev + names: + kind: PersesDashboard + listKind: PersesDashboardList + plural: persesdashboards + singular: persesdashboard + scope: Namespaced + versions: + - name: v1alpha1 + schema: + openAPIV3Schema: + description: PersesDashboard is the Schema for the persesdashboards API + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + properties: + datasources: + additionalProperties: + properties: + default: + type: boolean + display: + properties: + description: + type: string + name: + type: string + type: object + plugin: + description: |- + Plugin will contain the datasource configuration. + The data typed is available in Cue. + properties: + kind: + type: string + spec: + x-kubernetes-preserve-unknown-fields: true + required: + - kind + - spec + type: object + required: + - default + - plugin + type: object + description: Datasources is an optional list of datasource definition. + type: object + display: + properties: + description: + type: string + name: + type: string + type: object + duration: + description: Duration is the default time range to use when getting + data to fill the dashboard + format: duration + type: string + layouts: + items: + properties: + kind: + type: string + spec: + x-kubernetes-preserve-unknown-fields: true + required: + - kind + - spec + type: object + type: array + panels: + additionalProperties: + properties: + kind: + type: string + spec: + properties: + display: + properties: + description: + type: string + name: + type: string + required: + - name + type: object + plugin: + properties: + kind: + type: string + spec: + x-kubernetes-preserve-unknown-fields: true + required: + - kind + - spec + type: object + queries: + items: + properties: + kind: + type: string + spec: + properties: + plugin: + properties: + kind: + type: string + spec: + x-kubernetes-preserve-unknown-fields: true + required: + - kind + - spec + type: object + required: + - plugin + type: object + required: + - kind + - spec + type: object + type: array + required: + - display + - plugin + type: object + required: + - kind + - spec + type: object + type: object + refreshInterval: + description: RefreshInterval is the default refresh interval to use + when landing on the dashboard + format: duration + type: string + variables: + items: + properties: + kind: + description: Kind is the type of the variable. Depending on + the value of Kind, it will change the content of Spec. + type: string + spec: + x-kubernetes-preserve-unknown-fields: true + required: + - kind + - spec + type: object + type: array + required: + - duration + - layouts + - panels + type: object + status: + description: PersesDashboardStatus defines the observed state of PersesDashboard + properties: + conditions: + items: + description: "Condition contains details for one aspect of the current + state of this API Resource.\n---\nThis struct is intended for + direct use as an array at the field path .status.conditions. For + example,\n\n\n\ttype FooStatus struct{\n\t // Represents the + observations of a foo's current state.\n\t // Known .status.conditions.type + are: \"Available\", \"Progressing\", and \"Degraded\"\n\t // + +patchMergeKey=type\n\t // +patchStrategy=merge\n\t // +listType=map\n\t + \ // +listMapKey=type\n\t Conditions []metav1.Condition `json:\"conditions,omitempty\" + patchStrategy:\"merge\" patchMergeKey:\"type\" protobuf:\"bytes,1,rep,name=conditions\"`\n\n\n\t + \ // other fields\n\t}" + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: |- + type of condition in CamelCase or in foo.example.com/CamelCase. + --- + Many .condition.type values are consistent across resources like Available, but because arbitrary conditions can be + useful (see .node.status.conditions), the ability to deconflict is important. + The regex it matches is (dns1123SubdomainFmt/)?(qualifiedNameFmt) + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + type: object + type: object + served: true + storage: true + subresources: + status: {} diff --git a/test/util/operator_resource.go b/test/util/operator_resource.go index 9b79446d..d66bae8c 100644 --- a/test/util/operator_resource.go +++ b/test/util/operator_resource.go @@ -35,6 +35,66 @@ var ( }, } + OperatorConfigurationResourceWithoutExport = dash0v1alpha1.Dash0OperatorConfigurationSpec{ + SelfMonitoring: dash0v1alpha1.SelfMonitoring{ + Enabled: false, + }, + } + + OperatorConfigurationResourceDash0ExportWithoutApiEndpointWithToken = dash0v1alpha1.Dash0OperatorConfigurationSpec{ + SelfMonitoring: dash0v1alpha1.SelfMonitoring{ + Enabled: false, + }, + Export: &dash0v1alpha1.Export{ + Dash0: &dash0v1alpha1.Dash0Configuration{ + Authorization: dash0v1alpha1.Authorization{ + Token: &AuthorizationTokenTest, + }, + }, + }, + } + + OperatorConfigurationResourceDash0ExportWithoutApiEndpointWithSecretRef = dash0v1alpha1.Dash0OperatorConfigurationSpec{ + SelfMonitoring: dash0v1alpha1.SelfMonitoring{ + Enabled: false, + }, + Export: &dash0v1alpha1.Export{ + Dash0: &dash0v1alpha1.Dash0Configuration{ + Authorization: dash0v1alpha1.Authorization{ + SecretRef: &SecretRefTest, + }, + }, + }, + } + + OperatorConfigurationResourceDash0ExportWithApiEndpointWithToken = dash0v1alpha1.Dash0OperatorConfigurationSpec{ + SelfMonitoring: dash0v1alpha1.SelfMonitoring{ + Enabled: false, + }, + Export: &dash0v1alpha1.Export{ + Dash0: &dash0v1alpha1.Dash0Configuration{ + ApiEndpoint: ApiEndpointTest, + Authorization: dash0v1alpha1.Authorization{ + Token: &AuthorizationTokenTest, + }, + }, + }, + } + + OperatorConfigurationResourceDash0ExportWithApiEndpointWithSecretRef = dash0v1alpha1.Dash0OperatorConfigurationSpec{ + SelfMonitoring: dash0v1alpha1.SelfMonitoring{ + Enabled: false, + }, + Export: &dash0v1alpha1.Export{ + Dash0: &dash0v1alpha1.Dash0Configuration{ + ApiEndpoint: ApiEndpointTest, + Authorization: dash0v1alpha1.Authorization{ + SecretRef: &SecretRefTest, + }, + }, + }, + } + OperatorConfigurationResourceWithoutSelfMonitoringWithToken = dash0v1alpha1.Dash0OperatorConfigurationSpec{ SelfMonitoring: dash0v1alpha1.SelfMonitoring{ Enabled: false, diff --git a/test/util/third_party_resource.go b/test/util/third_party_resource.go new file mode 100644 index 00000000..f995707a --- /dev/null +++ b/test/util/third_party_resource.go @@ -0,0 +1,73 @@ +// SPDX-FileCopyrightText: Copyright 2024 Dash0 Inc. +// SPDX-License-Identifier: Apache-2.0 + +package util + +import ( + "context" + _ "embed" + + apiextensionsv1 "k8s.io/apiextensions-apiserver/pkg/apis/apiextensions/v1" + "k8s.io/apimachinery/pkg/types" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/yaml" + + . "github.com/onsi/gomega" +) + +var ( + //go:embed crds/perses.dev_persesdashboards.yaml + persesDashboardsCrdYaml []byte + PersesDashboardCrdQualifiedName = types.NamespacedName{ + Name: "persesdashboards.perses.dev", + } + + //go:embed crds/monitoring.coreos.com_prometheusrules.yaml + prometheusRulesCrdYaml []byte + PrometheusRuleCrdQualifiedName = types.NamespacedName{ + Name: "prometheusrules.monitoring.coreos.com", + } +) + +func EnsurePersesDashboardCrdExists( + ctx context.Context, + k8sClient client.Client, +) *apiextensionsv1.CustomResourceDefinition { + return ensureThirdPartyResourceExists( + ctx, + k8sClient, + persesDashboardsCrdYaml, + PersesDashboardCrdQualifiedName, + ) +} + +func EnsurePrometheusRuleCrdExists( + ctx context.Context, + k8sClient client.Client, +) *apiextensionsv1.CustomResourceDefinition { + return ensureThirdPartyResourceExists( + ctx, + k8sClient, + prometheusRulesCrdYaml, + PrometheusRuleCrdQualifiedName, + ) +} + +func ensureThirdPartyResourceExists( + ctx context.Context, + k8sClient client.Client, + crdYaml []byte, + crdQualifiedName types.NamespacedName, +) *apiextensionsv1.CustomResourceDefinition { + crdFromYaml := &apiextensionsv1.CustomResourceDefinition{} + Expect(yaml.Unmarshal(crdYaml, crdFromYaml)).To(Succeed()) + crdObject := EnsureKubernetesObjectExists( + ctx, + k8sClient, + crdQualifiedName, + &apiextensionsv1.CustomResourceDefinition{}, + crdFromYaml, + ) + + return crdObject.(*apiextensionsv1.CustomResourceDefinition) +}