Skip to content

Commit

Permalink
feat(checkrules): manage check rules via the operator
Browse files Browse the repository at this point in the history
  • Loading branch information
basti1302 committed Oct 12, 2024
1 parent a496335 commit 7caa29a
Show file tree
Hide file tree
Showing 41 changed files with 2,876 additions and 659 deletions.
4 changes: 4 additions & 0 deletions .golangci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,7 @@ linters:
- unconvert
- unparam
- unused
linters-settings:
errcheck:
exclude-functions:
- fmt.Fprintf
34 changes: 25 additions & 9 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/go-logr/logr"
persesv1alpha1 "github.com/perses/perses-operator/api/v1alpha1"
prometheusv1 "github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring/v1"
semconv "go.opentelemetry.io/collector/semconv/v1.27.0"
otelmetric "go.opentelemetry.io/otel/metric"
appsv1 "k8s.io/api/apps/v1"
Expand Down Expand Up @@ -105,9 +106,10 @@ func init() {
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
utilruntime.Must(dash0v1alpha1.AddToScheme(scheme))

// for perses dashboard controller, prometheus scrape config controller etc.
// required for Perses dashboard controller and Prometheus rules controller.
utilruntime.Must(apiextensionsv1.AddToScheme(scheme))
utilruntime.Must(persesv1alpha1.AddToScheme(scheme))
utilruntime.Must(prometheusv1.AddToScheme(scheme))
}

func main() {
Expand Down Expand Up @@ -529,16 +531,30 @@ func startDash0Controllers(
metricNamePrefix,
&setupLog,
)
prometheusRuleCrdReconciler := &controller.PrometheusRuleCrdReconciler{
AuthToken: envVars.selfMonitoringAndApiAuthToken,
}
if err := prometheusRuleCrdReconciler.SetupWithManager(ctx, mgr, startupTasksK8sClient, &setupLog); err != nil {
return fmt.Errorf("unable to set up the Prometheus rule reconciler: %w", err)
}
prometheusRuleCrdReconciler.InitializeSelfMonitoringMetrics(
meter,
metricNamePrefix,
&setupLog,
)

operatorConfigurationReconciler := &controller.OperatorConfigurationReconciler{
Client: k8sClient,
Clientset: clientset,
PersesDashboardCrdReconciler: persesDashboardCrdReconciler,
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("dash0-operator-configuration-controller"),
DeploymentSelfReference: deploymentSelfReference,
Images: images,
DevelopmentMode: developmentMode,
Client: k8sClient,
Clientset: clientset,
ApiClients: []controller.ApiClient{
persesDashboardCrdReconciler,
prometheusRuleCrdReconciler,
},
Scheme: mgr.GetScheme(),
Recorder: mgr.GetEventRecorderFor("dash0-operator-configuration-controller"),
DeploymentSelfReference: deploymentSelfReference,
Images: images,
DevelopmentMode: developmentMode,
}
if err := operatorConfigurationReconciler.SetupWithManager(mgr); err != nil {
return fmt.Errorf("unable to set up the operator configuration reconciler: %w", err)
Expand Down
8 changes: 8 additions & 0 deletions config/rbac/role.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,14 @@ rules:
- delete
- get
- list
- apiGroups:
- monitoring.coreos.com
resources:
- prometheusrules
verbs:
- get
- list
- watch
- apiGroups:
- operator.dash0.com
resources:
Expand Down
1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ require (
github.com/onsi/ginkgo/v2 v2.20.2
github.com/onsi/gomega v1.34.2
github.com/perses/perses-operator v0.0.0-20240402153734-4ccf03f6c8e6
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1
github.com/wI2L/jsondiff v0.6.0
go.opentelemetry.io/collector/pdata v1.17.0
go.opentelemetry.io/collector/semconv v0.111.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,8 @@ github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINE
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1 h1:XGoEXT6WTTihO+MD8MAao+YaQIH905HbK0WK2lyo28k=
github.com/prometheus-operator/prometheus-operator/pkg/apis/monitoring v0.77.1/go.mod h1:D0KY8md81DQKdaR/cXwnhoWB3MYYyc/UjvqE8GFkIvA=
github.com/prometheus/client_golang v1.20.0 h1:jBzTZ7B099Rg24tny+qngoynol8LtVYlA2bqx3vEloI=
github.com/prometheus/client_golang v1.20.0/go.mod h1:PIEt8X02hGcP8JWbeHyeZ53Y/jReSnHgO035n//V5WE=
github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E=
Expand Down
41 changes: 34 additions & 7 deletions helm-chart/dash0-operator/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -561,13 +561,8 @@ ways to achieve this:
```console
kubectl apply --server-side -f https://raw.githubusercontent.com/perses/perses-operator/main/config/crd/bases/perses.dev_persesdashboards.yaml
```
* Alternatively, install the full Perses operator: Go to <https://github.com/perses/perses-operator> and follow the installation
instructions there.

Note that the custom resource definition needs to be installed before the Dash0 operator is started.
If you have installed the Dash0 operator before installing the Perses dashboard custom resource definition, you need to
restart the Dash0 operator once, for example by deleting the operator's controller pod:
`kubectl --namespace dash0-system delete pod -l app.kubernetes.io/component=controller`
* Alternatively, install the full Perses operator: Go to <https://github.com/perses/perses-operator> and follow the
installation instructions there.

With the prerequisites in place, you can manage Dash0 dashboards via the operator.
The Dash0 operator will watch for Perses dashboard resources in the cluster and synchronize them with the Dash0 backend:
Expand All @@ -579,3 +574,35 @@ The dashboards created by the operator will be in read-only mode in the Dash0 UI

If the Dash0 operator configuration resource has the `dataset` property set, the operator will create the dashboards
in that specified dataset, otherwise they will be created in the `default` dataset.

## Managing Dash0 Check Rules with the Operator

You can manage your Dash0 check rules via the Dash0 Kubernetes operator.

Prerequisites for this feature:
* A Dash0 operator configuration resource has to be installed in the cluster.
* The operator configuration resource must have the `apiEndpoint` property set.
* The operator configuration resource must have a Dash0 export configured with authorization
(either `token` or `secret-ref`).

Furthermore, the custom resource definition for Prometheus rules needs to be installed in the cluster. There are two
ways to achieve this:
* Install the Prometheus rules custom resource definition with the following command:
```console
kubectl apply --server-side -f https://raw.githubusercontent.com/prometheus-operator/prometheus-operator/v0.77.1/example/prometheus-operator-crd/monitoring.coreos.com_prometheusrules.yaml
```
* Alternatively, install the full kube-prometheus stack Helm chart: Go to
<https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack> and follow the
installation instructions there.

With the prerequisites in place, you can manage Dash0 check rules via the operator.
The Dash0 operator will watch for Prometheus rule resources in the cluster and synchronize them with the Dash0 backend:
* When a new Prometheus rule resource is created, the operator will create corresponding check rules via Dash0's API.
* When a Prometheus rule resource is changed, the operator will update the corresponding check rules via Dash0's API.
* When a Prometheus rule resource is deleted, the operator will delete the corresponding check rules via Dash0's API.

Note that a Prometheus rule resource can contain multiple groups, and each of those groups can have multiple rules.
The Dash0 operator will create one individual check rule for each rule in each group.

If the Dash0 operator configuration resource has the `dataset` property set, the operator will create the rules
in that specified dataset, otherwise they will be created in the `default` dataset.
38 changes: 24 additions & 14 deletions helm-chart/dash0-operator/templates/operator/cluster-roles.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ metadata:

rules:

# Permissions required to watch for the foreign CRD (Perses dashboards, Prometheus scrape configs).
# Permissions required to watch for the third-party CRDs (Perses dashboards, Prometheus check rules):
- apiGroups:
- apiextensions.k8s.io
resources:
Expand All @@ -20,7 +20,7 @@ rules:
- list
- watch

# Permissions required to instrument workloads in the apps API group.
# Permissions required to instrument workloads in the apps API group:
- apiGroups:
- apps
resources:
Expand All @@ -35,7 +35,7 @@ rules:
- update
- watch

# Permissions required to instrument workloads in the batch API group.
# Permissions required to instrument workloads in the batch API group:
- apiGroups:
- batch
resources:
Expand All @@ -48,7 +48,7 @@ rules:
- update
- watch

# Pmrmissions required top create a Dash0 operator configuration resources
# Permissions required to create a Dash0 operator configuration resource:
- apiGroups:
- ""
resources:
Expand All @@ -57,7 +57,7 @@ rules:
- get

# Permissions required to queue events to report about the operator's actions, and to attach dangling events to their
# respective involved objects.
# respective involved objects:
- apiGroups:
- ""
resources:
Expand All @@ -75,7 +75,7 @@ rules:
- get

# Permissions required to automatically restart (i.e. delete) pods when instrumenting replicasets that are not part of a
# higher order workload (e.g. a deployment, daemonset).
# higher order workload (e.g. a deployment, daemonset):
- apiGroups:
- ""
resources:
Expand All @@ -85,7 +85,7 @@ rules:
- get
- list

# Permissions required to watch for the Perses dashboard resources.
# Permissions required to watch Perses dashboard resources:
- apiGroups:
- perses.dev
resources:
Expand All @@ -95,7 +95,17 @@ rules:
- list
- watch

# Permissions required to manage the Dash0 monitoring resource, its finalizers and status.
# Permissions required to watch Prometheus rule resources:
- apiGroups:
- monitoring.coreos.com
resources:
- prometheusrules
verbs:
- get
- list
- watch

# Permissions required to manage the Dash0 monitoring resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
Expand All @@ -110,15 +120,15 @@ rules:
- update
- watch

# Permissions required to manage the Dash0 monitoring resource, its finalizers and status.
# Permissions required to manage the Dash0 monitoring resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
- dash0monitorings/finalizers
verbs:
- update

# Permissions required to manage the Dash0 monitoring resource, its finalizers and status.
# Permissions required to manage the Dash0 monitoring resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
Expand All @@ -128,7 +138,7 @@ rules:
- patch
- update

# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status.
# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
Expand All @@ -143,15 +153,15 @@ rules:
- update
- watch

# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status.
# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
- dash0operatorconfigurations/finalizers
verbs:
- update

# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status.
# Permissions required to manage the Dash0 operator configuration resource, its finalizers and status:
- apiGroups:
- operator.dash0.com
resources:
Expand All @@ -161,7 +171,7 @@ rules:
- patch
- update

# Permissions required to manage OTel collector resources.
# Permissions required to manage OTel collector resources:
- apiGroups:
- ""
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,14 @@ cluster roles should match snapshot:
- get
- list
- watch
- apiGroups:
- monitoring.coreos.com
resources:
- prometheusrules
verbs:
- get
- list
- watch
- apiGroups:
- operator.dash0.com
resources:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ func ConvertExportSettingsToExporterList(export dash0v1alpha1.Export) ([]OtlpExp
Name: util.AuthorizationHeaderName,
Value: authHeaderValue,
}}
if d0.Dataset != "" && d0.Dataset != "default" {
if d0.Dataset != "" && d0.Dataset != util.DatasetDefault {
headers = append(headers, dash0v1alpha1.Header{
Name: util.Dash0DatasetHeaderName,
Value: d0.Dataset,
Expand Down
39 changes: 23 additions & 16 deletions internal/dash0/controller/operator_configuration_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ import (

type OperatorConfigurationReconciler struct {
client.Client
Clientset *kubernetes.Clientset
PersesDashboardCrdReconciler *PersesDashboardCrdReconciler
Scheme *runtime.Scheme
Recorder record.EventRecorder
DeploymentSelfReference *appsv1.Deployment
DanglingEventsTimeouts *util.DanglingEventsTimeouts
Images util.Images
DevelopmentMode bool
Clientset *kubernetes.Clientset
ApiClients []ApiClient
Scheme *runtime.Scheme
Recorder record.EventRecorder
DeploymentSelfReference *appsv1.Deployment
DanglingEventsTimeouts *util.DanglingEventsTimeouts
Images util.Images
DevelopmentMode bool
}

const (
Expand Down Expand Up @@ -138,6 +138,9 @@ func (r *OperatorConfigurationReconciler) Reconcile(ctx context.Context, req ctr

if resourceDeleted {
logger.Info("Reconciling the deletion of the operator configuration resource", "name", req.Name)
for _, apiClient := range r.ApiClients {
apiClient.RemoveApiEndpointAndDataset()
}
if err = r.removeSelfMonitoringAndApiAccessAndUpdate(ctx); err != nil {
logger.Error(err, "cannot disable self-monitoring/API access of the controller deployment, requeuing reconcile request.")
return ctrl.Result{
Expand All @@ -163,16 +166,20 @@ func (r *OperatorConfigurationReconciler) Reconcile(ctx context.Context, req ctr
if resource.HasDash0ApiAccessConfigured() {
dataset := resource.Spec.Export.Dash0.Dataset
if dataset == "" {
dataset = "default"
dataset = util.DatasetDefault
}
for _, apiClient := range r.ApiClients {
apiClient.SetApiEndpointAndDataset(&ApiConfig{
Endpoint: resource.Spec.Export.Dash0.ApiEndpoint,
Dataset: dataset,
}, &logger)
}
r.PersesDashboardCrdReconciler.SetApiEndpointAndDataset(&ApiConfig{
Endpoint: resource.Spec.Export.Dash0.ApiEndpoint,
Dataset: dataset,
}, &logger)
} else {
logger.Info("Settings required for managing dashboards via the operator are missing, the operator will not " +
"update dashboards in Dash0.")
r.PersesDashboardCrdReconciler.RemoveApiEndpointAndDataset()
logger.Info("Settings required for managing dashboards or check rules via the operator are missing, the " +
"operator will not update dashboards nor check rules in Dash0.")
for _, apiClient := range r.ApiClients {
apiClient.RemoveApiEndpointAndDataset()
}
}

currentSelfMonitoringAndApiAccessConfiguration, err :=
Expand Down
Loading

0 comments on commit 7caa29a

Please sign in to comment.