From 7a01fe9b8530ac5994b07e8d92508f3b39e0c90e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 16 Apr 2024 17:55:46 +0200
Subject: [PATCH 01/16] (fleet/prometheus/alerts) set up gitops for alert
 deployment

This allows for prometheus rules to be automagically applied through
rancher fleet. Simply adding some rules to the fleet directory will
deploy them into the cluster, ready for the prometheus operator to pick
them up and deploy.

Documentation to be found in the `docs/alerts` directory.

(fleet/lib/prometheusrules) add values file

(fleet/prometheus rules) depends on prometheus-crds
---
 docs/alerts/README.md                          | 33 +++++++++++++++++
 fleet/lib/prometheus-alertrules/Chart.yaml     |  5 +++
 fleet/lib/prometheus-alertrules/README.md      | 32 +++++++++++++++++
 fleet/lib/prometheus-alertrules/fleet.yaml     | 16 +++++++++
 .../prometheus-alertrules/rules/nodes.yaml     | 35 +++++++++++++++++++
 .../templates/prometheusrule.yaml              | 34 ++++++++++++++++++
 fleet/lib/prometheus-alertrules/values.yaml    |  7 ++++
 7 files changed, 162 insertions(+)
 create mode 100644 docs/alerts/README.md
 create mode 100644 fleet/lib/prometheus-alertrules/Chart.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/README.md
 create mode 100644 fleet/lib/prometheus-alertrules/fleet.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/rules/nodes.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/values.yaml

diff --git a/docs/alerts/README.md b/docs/alerts/README.md
new file mode 100644
index 000000000..c6ed513e6
--- /dev/null
+++ b/docs/alerts/README.md
@@ -0,0 +1,33 @@
+# Prometheus rules GitOps
+
+Any Prometheus rules file defined in the
+[fleet/lib/prometheus-alertrules/rules](../../fleet/lib/prometheus-alertrules/rules)
+directory will be deployed to the cluster. It's possible to define a default
+namespace in the `values.yaml` file with the `rules.namespace` key.
+
+## Adding Prometheus rules
+
+1. Write the Prometheus rules in a yaml file according to the [prometheus
+   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+1. Add the YAML file to the `/rules` directory
+1. Commit
+
+## Prometheus rule AURA standards
+
+* `summary` annotation: The `summary` annotation is used to describe a group
+  of incoming alerts. This annotation DOES NOT contain any templated
+  variables and provides a simple single sentence summary of what the alert is
+  about. For example "Disk space full in 24h". When a cluster triggers several
+  alerts, it can be handy to group these alerts into a single notification;
+  this is when the `summary` is used.
+* `description` annotation: This provides a detailed overview of the alert,
+  specific to this instance of the alert. It MAY contain templated variables
+  to enrich the message.
+* `receivers` label: The `receivers` label is used by alertmanager to decide on the
+  routing of the notification for the alert. It consists of a `,` separated list
+  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
+  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
+  in the alertmanager configuration.
+  Currently (20240503) the following receivers are configured:
+  * `slack-test`
+  * `squadcast-test`
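A note on the `receivers` convention documented above: because every entry in the
label is wrapped by commas, a route can match a receiver anywhere in the list with
one anchored regex and never partially match another receiver name (`slack` vs a
hypothetical `slack-ops`). A minimal sketch of the Alertmanager side, using the
receiver names listed above (the exact route layout is illustrative):

```yaml
route:
  routes:
    # ",slack," appears in the label wherever "slack" sits in the list,
    # so the same pattern matches ",slack," and ",squadcast,slack," alike.
    - receiver: slack-test
      matchers:
        - receivers =~ ".*,slack,.*"
    - receiver: squadcast-test
      matchers:
        - receivers =~ ".*,squadcast,.*"
```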
diff --git a/fleet/lib/prometheus-alertrules/Chart.yaml b/fleet/lib/prometheus-alertrules/Chart.yaml
new file mode 100644
index 000000000..fb14c879f
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: 0.1.0
+description: "Prometheus LSST rules GitOps"
+name: lsst-prometheus-alerts
+version: 0.1.1
diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
new file mode 100644
index 000000000..ae985d000
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/README.md
@@ -0,0 +1,32 @@
+# Prometheus rules GitOps
+
+Any Prometheus rules file defined in the `/rules` directory will be deployed to
+the cluster. It's possible to define a default namespace in the `values.yaml`
+file with the `rules.namespace` key.
+
+## Adding Prometheus rules
+
+1. Write the Prometheus rules in a yaml file according to the [prometheus
+   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+1. Add the YAML file to the `/rules` directory
+1. Commit
+
+## Prometheus rule AURA standards
+
+* `summary` annotation: The `summary` annotation is used to describe a group
+  of incoming alerts. This annotation DOES NOT contain any templated
+  variables and provides a simple single sentence summary of what the alert is
+  about. For example "Disk space full in 24h". When a cluster triggers several
+  alerts, it can be handy to group these alerts into a single notification;
+  this is when the `summary` is used.
+* `description` annotation: This provides a detailed overview of the alert,
+  specific to this instance of the alert. It MAY contain templated variables
+  to enrich the message.
+* `receivers` label: The `receivers` label is used by alertmanager to decide on the
+  routing of the notification for the alert. It consists of a `,` separated list
+  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
+  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
+  in the alertmanager configuration.
+  Currently (20240503) the following receivers are configured:
+  * `slack-test`
+  * `squadcast-test`
diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
new file mode 100644
index 000000000..fc86bac14
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -0,0 +1,16 @@
+---
+defaultNamespace: &name lsst-prometheus-alerts
+labels:
+  bundle: *name
+namespaceLabels:
+  lsst.io/discover: "true"
+helm:
+  releaseName: *name
+  takeOwnership: true
+  timeoutSeconds: 300
+  waitForJobs: false
+  atomic: false
+dependsOn:
+  - selector:
+      matchLabels:
+        bundle: prometheus-operator-crds
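Before the rule files added below, a minimal sketch of what a conforming file
under `rules/` looks like, following the AURA conventions from the README; the
alert name, job selector, and threshold are invented purely for illustration:

```yaml
# rules/example.yaml (hypothetical file)
groups:
  - name: example.rules
    rules:
      - alert: ExampleTargetDown
        expr: up{job="example"} == 0
        for: 5m
        labels:
          severity: warning
          receivers: ",slack,"  # comma-wrapped receiver list, per the standard
        annotations:
          summary: An example scrape target is down
          description: |
            Target {{ $labels.instance }} of job {{ $labels.job }} has been
            unreachable for more than 5 minutes.
```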
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
new file mode 100644
index 000000000..402de0533
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -0,0 +1,35 @@
+groups:
+  - name: "NodeFilesystem"
+    rules:
+      - alert: "NodeFilesystemFillingUp"
+        annotaitons:
+          summary: "The node's disk will fill up within 6h"
+          description: |
+            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
+        expr: |
+          (
+            node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          /
+            node_filesystem_size_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          *
+            100
+          <
+            10
+          and
+            predict_linear(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h],
+              12 * 60 * 60
+            )
+          <
+            0
+          and
+            node_filesystem_readonly{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          ==
+            0
+          and
+            delta(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h]
+            )
+          <
+            0
+          )
diff --git a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
new file mode 100644
index 000000000..af693a18e
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
@@ -0,0 +1,34 @@
+# yamllint disable-file
+{{- if .Values.rules.enabled }}
+{{- $files := .Files.Glob "rules/**.yaml" }}
+{{- range $rawpath, $content := $files }}
+{{- $path := ($rawpath | lower | replace " " "-") }}
+{{- $ruleDir := dir $path }}
+{{- $ruleFile := base $path }}
+{{- $namespaceSplit := regexSplit "\\/+" $ruleDir -1 }}
+{{- $namespace := $.Values.rules.namespace | default $.Release.Namespace }}
+{{- if (eq (len $namespaceSplit) 2) }}
+{{- $namespace = (index $namespaceSplit 1) }}
+{{- end }}
+{{- $alertName := lower (index (regexSplit "\\.yaml" $ruleFile -1) 0) }}
+---
+apiVersion: monitoring.coreos.com/v1 │
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s" "alert" $alertName | trunc 63 | trimSuffix "-" }}
+  namespace: {{ $namespace }}
+  labels:
+    lsst.io/component: "prometheus-rules"
+    lsst.io/dir: {{ $ruleDir | quote }}
+    lsst.io/file: {{ $ruleFile | quote }}
+    {{- with $.Values.rules.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with $.Values.rules.additionalAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{ $content | toString }}
+{{ end }}
+{{ end }}
diff --git a/fleet/lib/prometheus-alertrules/values.yaml b/fleet/lib/prometheus-alertrules/values.yaml
new file mode 100644
index 000000000..52d53e4a4
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/values.yaml
@@ -0,0 +1,7 @@
+---
+rules:
+  enabled: true
+  namespace: ~
+  additionalAnnotations: {}
+  additionalLabels:
+    lsst.io/rule: "true"
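Reading the Helm template in this patch: every file matched by `rules/**.yaml`
becomes its own PrometheusRule object named `alert-<file>`, and a file nested one
directory deep (`rules/<namespace>/foo.yaml`) is deployed into that namespace
instead of the default. A rough sketch of what `rules/nodes.yaml` should render
to, assuming the chart defaults above and the bundle's `lsst-prometheus-alerts`
release namespace (not a verbatim render):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alert-nodes
  namespace: lsst-prometheus-alerts  # $.Release.Namespace, since rules.namespace is ~
  labels:
    lsst.io/component: "prometheus-rules"
    lsst.io/dir: "rules"
    lsst.io/file: "nodes.yaml"
    lsst.io/rule: "true"
spec:
  # the verbatim content of rules/nodes.yaml is inlined here
  groups:
    - name: "NodeFilesystem"
      # ...
```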
From 0d90c790f893f5bd111029e7cbe996e8e7cb596d Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Fri, 3 May 2024 15:58:49 +0200
Subject: [PATCH 02/16] (fleet/prometheus) modify alerting stack triggers

---
 .../aggregator/values.yaml                     | 24 ++++++++++++++++++-
 .../prometheus-alertrules/rules/nodes.yaml     |  2 +-
 .../templates/prometheusrule.yaml              |  2 +-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 24a87fdef..53a8669b9 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -45,7 +45,29 @@ alertmanager:
     tls:
       - secretName: tls-alertmanager-ingress
        hosts:
-          - alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
+          - "alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org"
+  config:
+    global:
+      resolve_timeout: 5m
+      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
+    inhibit_rules:
+      - source_matchers:
+          - alertname = "InfoInhibitor"
+        target_matchers:
+          - severity = "info"
+        equal: ["namespace"]
+      - source_matchers:
+          - severity = "critical"
+        target_matchers:
+          - severity =~ "info|warning"
+        equal: ["alertname"]
+      - source_matchers:
+          - severity = "warning"
+        target_matchers:
+          - severity = "info"
+        equal: ["alertname"]
+    templates:
+      - "/etc/alertmanager/configmaps/alertmanager-templates/*.tmpl"

 grafana:
   enabled: true
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index 402de0533..b0a8bc863 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -2,7 +2,7 @@ groups:
   - name: "NodeFilesystem"
     rules:
       - alert: "NodeFilesystemFillingUp"
-        annotaitons:
+        annotations:
          summary: "The node's disk will fill up within 6h"
          description: |
            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
diff --git a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
index af693a18e..3f54756a1 100644
--- a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
+++ b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
@@ -12,7 +12,7 @@
 {{- end }}
 {{- $alertName := lower (index (regexSplit "\\.yaml" $ruleFile -1) 0) }}
 ---
-apiVersion: monitoring.coreos.com/v1 │
+apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   name: {{ printf "%s-%s" "alert" $alertName | trunc 63 | trimSuffix "-" }}
From 0885a7c78876dec166d4394e295fad3169389fb2 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 7 May 2024 19:47:05 +0200
Subject: [PATCH 03/16] (fleet/alertrules) remove default receivers and update
 docs

---
 docs/alerts/README.md                     | 3 ---
 fleet/lib/prometheus-alertrules/README.md | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/docs/alerts/README.md b/docs/alerts/README.md
index c6ed513e6..ac1ab8cc3 100644
--- a/docs/alerts/README.md
+++ b/docs/alerts/README.md
@@ -28,6 +28,3 @@ namespace in the `values.yaml` file with the `rules.namespace` key.
   of receivers, prefixed and suffixed with `,` to make regex matching easier in the
   alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
   in the alertmanager configuration.
-  Currently (20240503) the following receivers are configured:
-  * `slack-test`
-  * `squadcast-test`
diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
index ae985d000..de5e48845 100644
--- a/fleet/lib/prometheus-alertrules/README.md
+++ b/fleet/lib/prometheus-alertrules/README.md
@@ -27,6 +27,3 @@ file with the `rules.namespace` key.
   of receivers, prefixed and suffixed with `,` to make regex matching easier in the
   alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
   in the alertmanager configuration.
-  Currently (20240503) the following receivers are configured:
-  * `slack-test`
-  * `squadcast-test`
From 882d4adc4a81cce81ce2bc4601ea22dc2918240e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 14 May 2024 18:22:08 +0200
Subject: [PATCH 04/16] (fleet/alerts) add ceph alerts

---
 .../lib/prometheus-alertrules/rules/ceph.yaml | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 fleet/lib/prometheus-alertrules/rules/ceph.yaml

diff --git a/fleet/lib/prometheus-alertrules/rules/ceph.yaml b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
new file mode 100644
index 000000000..cbe295b63
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -0,0 +1,164 @@
+groups:
+  - name: "ceph.rules"
+    rules:
+      - alert: "CephQuotaFillingUp"
+        annotations:
+          summary: "The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full"
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: "warning"
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
+      - alert: "CephQuotaFillingUp"
+        annotations:
+          summary: "The Ceph pool quota is almost full"
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: "critical"
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
+      - alert: CephTargetDown
+        expr: up{job=~".*ceph.*"} == 0
+        for: 10m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            CEPH target on {{ $labels.prom_cluster }} down for more than 10m,
+            please check - it could be either an exporter crash or a whole
+            cluster crash
+          summary: CEPH exporter down on {{ $labels.prom_cluster }}
+      - alert: CephErrorState
+        expr: ceph_health_status > 1
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Error state on {{ $labels.prom_cluster }} for longer than
+            5m, please check status of pools and OSDs
+          summary: CEPH in ERROR
+      - alert: CephWarnState
+        expr: ceph_health_status == 1
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Warn state on {{ $labels.prom_cluster }} for longer than
+            30m, please check status of pools and OSDs
+          summary: CEPH in WARN
+      - alert: OsdDown
+        expr: ceph_osd_up == 0
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD is down longer than 30 min on {{ $labels.prom_cluster }}, please
+            check what its status is
+          summary: OSD down
+      - alert: OsdApplyLatencyTooHigh
+        expr: ceph_osd_apply_latency_ms > 5000
+        for: 90s
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD latency for {{ $labels.osd }} is too high on {{
+            $labels.prom_cluster }}. Please check whether it is stuck in a weird
+            state
+          summary: OSD latency too high {{ $labels.osd }}
+      - alert: CephPgDown
+        expr: ceph_pg_down > 0
+        for: 3m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are down (unavailable) for too long on {{
+            $labels.prom_cluster }}. Please ensure that all the data are
+            available
+          summary: PG DOWN [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgIncomplete
+        expr: ceph_pg_incomplete > 0
+        for: 2m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are incomplete (unavailable) for too long on {{
+            $labels.prom_cluster }}. Please ensure that all the data are
+            available
+          summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgInconsistent
+        expr: ceph_pg_inconsistent > 0
+        for: 1m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are inconsistent for too long on {{ $labels.prom_cluster
+            }}. Data is available but inconsistent across nodes
+          summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgActivating
+        expr: ceph_pg_activating > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are activating for too long on {{ $labels.prom_cluster
+            }}. Those PGs are unavailable for too long!
+          summary: PG ACTIVATING [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgBackfillTooFull
+        expr: ceph_pg_backfill_toofull > 0
+        for: 5m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are located on full OSD on cluster {{
+            $labels.prom_cluster }}. Those PGs can be unavailable shortly. Please
+            check OSDs, change weight or reconfigure CRUSH rules.
+          summary: PG TOO FULL [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgUnavailable
+        expr: ceph_pg_total - ceph_pg_active > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are unavailable on {{ $labels.prom_cluster }}. Please
+            check their detailed status and current configuration.
+          summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephOsdReweighted
+        expr: ceph_osd_weight < 1
+        for: 1h
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD on cluster {{ $labels.prom_cluster }} was reweighted for too
+            long. Please either create a silence or fix the issue
+          summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.prom_cluster }} reweighted - {{ $value }}
From 75ded139578a516356eb254eadfd7c4858e9b549 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 16 May 2024 15:11:25 +0200
Subject: [PATCH 05/16] (fleet/alerting) remove ceph override

---
 .../overlays/ayekan/values.yaml | 35 ++++++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index b81041535..f931158af 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -201,15 +201,32 @@ alertmanager:
           continue: true
         - receiver: slack-kube-test
           matchers:
-            - alertname =~ "Kube.*"
-        - receiver: slack-node-test
-          group_by: [instance]
-          matchers:
-            - alertname =~ "Node.*"
-        - receiver: slack-network-test
-          group_by: [instance]
-          matchers:
-            - alertname =~ "Network.*"
+            - receivers =~ ".*,slack,.*"
+          continue: true
+          routes:
+            - receiver: slack-kube-test
+              matchers:
+                - alertname =~ "Kube.*"
+            - receiver: slack-node-test
+              group_by: ["instance"]
+              matchers:
+                - alertname =~ "Node.*"
+            - receiver: slack-network-test
+              group_by: ["instance"]
+              matchers:
+                - alertname =~ "Network.*"
+          # Below is an example for the namespace based alert routing.
+          # This will send alerts from a namespace to the namespace specific team
+          # on slack
+          # - receiver: slack-rook-ceph-team
+          #   matchers:
+          #     - namespace = "rook-ceph"
+          # Below is an example for the group based alert routing.
+          # This will send alerts with a specific group in the receiver list to the
+          # alert channel.
+          # - receiver: email-group
+          #   matchers:
+          #     - receivers =~ ".*,group,.*"
 receivers:
   - name: "null"
   - name: watchdog
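A routing tree like the one above can be exercised without deploying anything.
Assuming the rendered Alertmanager configuration is saved locally as
`alertmanager.yaml` (the file name is a placeholder), `amtool` from the
Alertmanager project can report which receiver a given label set would reach; a
sketch:

```sh
# Sketch: check that a rule carrying the ",slack," receivers label and a
# Kube* alert name lands on the slack-kube-test receiver.
amtool config routes test \
  --config.file=alertmanager.yaml \
  receivers=",slack," alertname="KubePodCrashLooping"
```

The command prints the matched receiver name(s), which makes routing changes
like this one easy to review before they ship.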
From e8d6b867dc12f661a45d515954bf86665c17d6cb Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Fri, 17 May 2024 12:18:28 +0200
Subject: [PATCH 06/16] (fleet/alerts) include cluster in slack alerts

---
 .../configmap-alertmanager-templates.yaml | 25 +++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
index 94c4d49df..6b0a18c35 100644
--- a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
@@ -5,25 +5,25 @@ metadata:
 data:
   slack-generic-alert.tmpl: |
     {{ define "slack.o11y.generic.text" }}
-    *Site:* {{ .CommonLabels.site }}
+    *Site:* {{ .CommonLabels.prom_site }}
     *Alert:* {{ .GroupLabels.alertname }}
     *Summary:* {{ .CommonAnnotations.summary }}
-    {{ template "__o11y_alert_list" . }}
+    {{ template "__o11y_alert_short_list" . }}
     {{ end }}
     {{ define "slack.o11y.generic.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-kube-alert.tmpl: |
     {{ define "slack.o11y.kube.text" }}
     *Alert:* {{ .GroupLabels.alertname }}
-    *Site:* {{ .CommonLabels.site }}
-    *Kube cluster:* {{ .CommonLabels.prom }}
+    *Site:* {{ .CommonLabels.prom_site }}
+    *Kube cluster:* {{ .CommonLabels.prom_cluster }}
     *Namespace:* {{ .GroupLabels.namespace }}
     *Summary:* {{ .CommonAnnotations.summary }}
     {{ template "__o11y_alert_list" . }}
     {{ end }}
     {{ define "slack.o11y.kube.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-network-alert.tmpl: |
     {{ define "slack.o11y.network.text" }}
     *Alert:* {{ .GroupLabels.alertname }}
     *Site:* {{ .CommonLabels.site }}
     *Summary:* {{ .CommonAnnotations.summary }}
     {{ template "__o11y_alert_list" . }}
     {{ end }}
   template-helpers.tmpl: |
+    {{ define "__o11y_alert_title" }}
+    {{ end }}
     {{ define "__o11y_alert_list" }}
     *Alerts:*
     =========
     {{ range .Alerts -}}
     - *Alert:* {{ .Labels.alertname }}
-      *Summary:* {{ .Annotations.summary }}
       *Description:* {{ .Annotations.description }}
       *Severity:* {{ .Labels.severity }}
       *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
     {{ end }}
     {{ end }}
     {{ end }}
+    {{ define "__o11y_alert_short_list" }}
+    *Alerts:*
+    =========
+    {{ range .Alerts -}}
+    - *Alert:* {{ .Labels.alertname }}
+      *Description:* {{ .Annotations.description }}
+      *Severity:* {{ .Labels.severity }}
+      *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+    {{ end }}
+    {{ end }}
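These templates only take effect once a receiver references them, and that wiring
is not shown in this series. A sketch of what it could look like in the
Alertmanager receiver configuration, under that assumption (the channel name is a
placeholder):

```yaml
receivers:
  - name: slack-test
    slack_configs:
      - channel: "#alerts-test"  # placeholder channel
        send_resolved: true
        # Render the Slack notification with the templates from the configmap
        title: '{{ template "slack.o11y.generic.title" . }}'
        text: '{{ template "slack.o11y.generic.text" . }}'
```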
From b44d94c86ba7588c1202643c26bd62bfd978df96 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:29:23 +0200
Subject: [PATCH 07/16] (fleet/alerts) move slack credentials to cluster
 overlay

---
 fleet/lib/kube-prometheus-stack/aggregator/values.yaml      | 1 -
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 53a8669b9..216f6d6dd 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -49,7 +49,6 @@ alertmanager:
   config:
     global:
       resolve_timeout: 5m
-      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
     inhibit_rules:
       - source_matchers:
          - alertname = "InfoInhibitor"
diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index f931158af..741c54fe6 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -180,7 +180,6 @@ alertmanager:
       - lsst-webhooks
   config:
     global:
-      resolve_timeout: 5m
      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
    route:
      group_by: [alertname, namespace, site]
From 78afed48206dda6bcd2cc6cb7f981dd647c9c03e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:29:46 +0200
Subject: [PATCH 08/16] (fleet/alerts) remove `atomic` default from fleet.yaml

---
 fleet/lib/prometheus-alertrules/fleet.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
index fc86bac14..8f6a3260b 100644
--- a/fleet/lib/prometheus-alertrules/fleet.yaml
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -9,7 +9,6 @@ helm:
   releaseName: *name
   takeOwnership: true
   timeoutSeconds: 300
   waitForJobs: false
-  atomic: false
 dependsOn:
   - selector:
       matchLabels:
From 9354afaa6a060a236d47cc41c4d1b8ab6ae530c8 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:33:26 +0200
Subject: [PATCH 09/16] (fleet/alerts) insert templating values for node disk
 alerts

---
 fleet/lib/prometheus-alertrules/rules/nodes.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index b0a8bc863..5fe214c09 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -5,30 +5,30 @@
         annotations:
           summary: "The node's disk will fill up within 6h"
           description: |
-            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
+            Node {{ $labels.instance }} disk is currently almost full at {{ $value }}%. It will fill up within 6 hours.
         expr: |
           (
-            node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
           /
-            node_filesystem_size_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_size_bytes{fstype!="",job="node-exporter"}
           *
             100
           <
             10
           and
             predict_linear(
-              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h],
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
               12 * 60 * 60
             )
           <
             0
           and
-            node_filesystem_readonly{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_readonly{fstype!="",job="node-exporter"}
           ==
             0
           and
             delta(
-              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h]
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h]
             )
           <
             0
From 58cd0b1c64288ac5eee09aa51e11011fad5356b2 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 14:52:22 +0200
Subject: [PATCH 10/16] (fleet/alerts) make yamllint happy

---
 fleet/lib/prometheus-alertrules/Chart.yaml       |  2 +-
 fleet/lib/prometheus-alertrules/rules/ceph.yaml  | 14 +++++------
 fleet/lib/prometheus-alertrules/rules/nodes.yaml |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fleet/lib/prometheus-alertrules/Chart.yaml b/fleet/lib/prometheus-alertrules/Chart.yaml
index fb14c879f..a9013689a 100644
--- a/fleet/lib/prometheus-alertrules/Chart.yaml
+++ b/fleet/lib/prometheus-alertrules/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
 appVersion: 0.1.0
-description: "Prometheus LSST rules GitOps"
+description: Prometheus LSST rules GitOps
 name: lsst-prometheus-alerts
 version: 0.1.1
diff --git a/fleet/lib/prometheus-alertrules/rules/ceph.yaml b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
index cbe295b63..001ed2031 100644
--- a/fleet/lib/prometheus-alertrules/rules/ceph.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -1,27 +1,27 @@
 groups:
-  - name: "ceph.rules"
+  - name: ceph.rules
     rules:
-      - alert: "CephQuotaFillingUp"
+      - alert: CephQuotaFillingUp
         annotations:
-          summary: "The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full"
+          summary: The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full
           description: |
             Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
             $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
             keep in mind that ceph pools reaching 100% is dangerous.
         labels:
-          severity: "warning"
+          severity: warning
           receivers: ",slack,"
         expr: |
           (ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
-      - alert: "CephQuotaFillingUp"
+      - alert: CephQuotaFillingUp
         annotations:
-          summary: "The Ceph pool quota is almost full"
+          summary: The Ceph pool quota is almost full
           description: |
             Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
             $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
             keep in mind that ceph pools reaching 100% is dangerous.
         labels:
-          severity: "critical"
+          severity: critical
           receivers: ",slack,"
         expr: |
           (ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index 5fe214c09..2644b3119 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -1,9 +1,9 @@
 groups:
-  - name: "NodeFilesystem"
+  - name: NodeFilesystem
     rules:
-      - alert: "NodeFilesystemFillingUp"
+      - alert: NodeFilesystemFillingUp
         annotations:
-          summary: "The node's disk will fill up within 6h"
+          summary: The node's disk will fill up within 6h
           description: |
             Node {{ $labels.instance }} disk is currently almost full at {{ $value }}%. It will fill up within 6 hours.
         expr: |
From b7a82b4df8f617dcf034ccda79fbb4c29129c485 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:23:44 +0200
Subject: [PATCH 11/16] (fleet/prometheusrule) remove explicit timeout

---
 fleet/lib/prometheus-alertrules/fleet.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
index 8f6a3260b..e9b083ecc 100644
--- a/fleet/lib/prometheus-alertrules/fleet.yaml
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -7,7 +7,6 @@
 helm:
   releaseName: *name
   takeOwnership: true
-  timeoutSeconds: 300
   waitForJobs: false
 dependsOn:
   - selector:
From 62d8d608879c8fdc7a98cab427fdb87a1d875588 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:29:08 +0200
Subject: [PATCH 12/16] (fleet/alerts) remove duplicate docs in favor of top
 level

---
 fleet/lib/prometheus-alertrules/README.md | 29 -----------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 fleet/lib/prometheus-alertrules/README.md

diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
deleted file mode 100644
index de5e48845..000000000
--- a/fleet/lib/prometheus-alertrules/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Prometheus rules GitOps
-
-Any Prometheus rules file defined in the `/rules` directory will be deployed to
-the cluster. It's possible to define a default namespace in the `values.yaml`
-file with the `rules.namespace` key.
-
-## Adding Prometheus rules
-
-1. Write the Prometheus rules in a yaml file according to the [prometheus
-   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
-1. Add the YAML file to the `/rules` directory
-1. Commit
-
-## Prometheus rule AURA standards
-
-* `summary` annotation: The `summary` annotation is used to describe a group
-  of incoming alerts. This annotation DOES NOT contain any templated
-  variables and provides a simple single sentence summary of what the alert is
-  about. For example "Disk space full in 24h". When a cluster triggers several
-  alerts, it can be handy to group these alerts into a single notification;
-  this is when the `summary` is used.
-* `description` annotation: This provides a detailed overview of the alert,
-  specific to this instance of the alert. It MAY contain templated variables
-  to enrich the message.
-* `receivers` label: The `receivers` label is used by alertmanager to decide on the
-  routing of the notification for the alert. It consists of a `,` separated list
-  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
-  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
-  in the alertmanager configuration.
From 22f5863d4957a319a2806b7288a25c009f82e8a6 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:31:37 +0200
Subject: [PATCH 13/16] (fleet/prom-stack) make yamllint happy

---
 fleet/lib/kube-prometheus-stack/aggregator/values.yaml | 10 +++++-----
 .../kube-prometheus-stack/overlays/ayekan/values.yaml  |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 216f6d6dd..439b49ae2 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -45,7 +45,7 @@ alertmanager:
     tls:
       - secretName: tls-alertmanager-ingress
        hosts:
-          - "alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org"
+          - alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
   config:
     global:
       resolve_timeout: 5m
@@ -54,19 +54,19 @@ alertmanager:
        - alertname = "InfoInhibitor"
          target_matchers:
            - severity = "info"
-        equal: ["namespace"]
+        equal: [namespace]
       - source_matchers:
           - severity = "critical"
         target_matchers:
           - severity =~ "info|warning"
-        equal: ["alertname"]
+        equal: [alertname]
       - source_matchers:
           - severity = "warning"
         target_matchers:
           - severity = "info"
-        equal: ["alertname"]
+        equal: [alertname]
     templates:
-      - "/etc/alertmanager/configmaps/alertmanager-templates/*.tmpl"
+      - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 741c54fe6..56dfbcaf0 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -207,11 +207,11 @@ alertmanager:
           matchers:
             - alertname =~ "Kube.*"
         - receiver: slack-node-test
-          group_by: ["instance"]
+          group_by: [instance]
           matchers:
             - alertname =~ "Node.*"
         - receiver: slack-network-test
-          group_by: ["instance"]
+          group_by: [instance]
           matchers:
             - alertname =~ "Network.*"
           # Below is an example for the namespace based alert routing.
From 5735726e091fe90d595154cf564d79bc1381f19f Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 4 Jul 2024 12:14:51 +0200
Subject: [PATCH 14/16] Update externalsecret-grafana-keycloak-credentials.yaml

Use the correct key for the secret instead of the older one.
---
 .../externalsecret-grafana-keycloak-credentials.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
index 008591475..8401e6fa5 100644
--- a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
@@ -18,4 +18,4 @@ spec:
     - secretKey: keycloak_url
       remoteRef:
         key: *item
-        property: hostname
+        property: url
From a9e34dcd3928b1915f9eaa7ac83b76e1677eec58 Mon Sep 17 00:00:00 2001
From: Kris Buytaert
Date: Fri, 5 Jul 2024 08:57:26 +0200
Subject: [PATCH 15/16] Disabling default deployment of K8s dashboards on
 grafana, in order to deploy our own

---
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 56dfbcaf0..8b9ef16ca 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -1,5 +1,6 @@
 ---
 prometheus:
+
   prometheusSpec:
     configMaps:
       - sd-snmp-network
@@ -276,3 +277,5 @@ alertmanager:
         equal: [alertname]
     templates:
       - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
+grafana:
+    defaultDashboardsEnabled: false
From a024f6c41fc4eee8eb4063e651d206700b610430 Mon Sep 17 00:00:00 2001
From: Kris Buytaert
Date: Fri, 5 Jul 2024 10:23:36 +0200
Subject: [PATCH 16/16] Fixing Yaml

---
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 8b9ef16ca..95a671dc0 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -278,4 +278,4 @@ alertmanager:
       - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
 grafana:
-    defaultDashboardsEnabled: false
+  defaultDashboardsEnabled: false