From 7a01fe9b8530ac5994b07e8d92508f3b39e0c90e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 16 Apr 2024 17:55:46 +0200
Subject: [PATCH 01/16] (fleet/prometheus/alerts) set up gitops for alert
 deployment

This allows for prometheus rules to be automagically applied through
rancher fleet. Simply adding some rules to the fleet directory will
deploy them into the cluster, ready for the prometheus operator to pick
them up and deploy.

Documentation to be found in the `docs/alerts` directory.

(fleet/lib/prometheusrules) add values file

(fleet/prometheus rules) depends on prometheus-crds
---
 docs/alerts/README.md                          | 33 +++++++++++++++++
 fleet/lib/prometheus-alertrules/Chart.yaml     |  5 +++
 fleet/lib/prometheus-alertrules/README.md      | 32 +++++++++++++++++
 fleet/lib/prometheus-alertrules/fleet.yaml     | 16 +++++++++
 .../prometheus-alertrules/rules/nodes.yaml     | 35 +++++++++++++++++++
 .../templates/prometheusrule.yaml              | 34 ++++++++++++++++++
 fleet/lib/prometheus-alertrules/values.yaml    |  7 ++++
 7 files changed, 162 insertions(+)
 create mode 100644 docs/alerts/README.md
 create mode 100644 fleet/lib/prometheus-alertrules/Chart.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/README.md
 create mode 100644 fleet/lib/prometheus-alertrules/fleet.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/rules/nodes.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
 create mode 100644 fleet/lib/prometheus-alertrules/values.yaml

diff --git a/docs/alerts/README.md b/docs/alerts/README.md
new file mode 100644
index 000000000..c6ed513e6
--- /dev/null
+++ b/docs/alerts/README.md
@@ -0,0 +1,33 @@
+# Prometheus rules GitOps
+
+Any Prometheus rules file defined in the
+[fleet/lib/prometheus-alertrules/rules](../../fleet/lib/prometheus-alertrules/rules)
+directory will be deployed to the cluster. It's possible to define a default
+namespace in the `values.yaml` file with the `rules.namespace` key.
+
+## Adding Prometheus rules
+
+1. Write the Prometheus rules in a yaml file according to the [prometheus
+   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+1. Add the YAML file to the `/rules` directory
+1. Commit
+
+## Prometheus rule AURA standards
+
+* `summary` annotation: The `summary` annotation is used to describe a group
+  of incoming alerts. This annotation DOES NOT contain any templated
+  variables and provides a simple single sentence summary of what the alert is
+  about. For example "Disk space full in 24h". When a cluster triggers several
+  alerts, it can be handy to group these alerts into a single notification;
+  this is when the `summary` is used.
+* `description` annotation: This provides a detailed overview of the alert,
+  specific to this instance of the alert. It MAY contain templated variables
+  to enrich the message.
+* `receivers` label: The `receivers` label is used by alertmanager to decide on the
+  routing of the notification for the alert. It consists of a `,` separated list
+  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
+  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
+  in the alertmanager configuration.
+  Currently (20240503) the following receivers are configured:
+  * `slack-test`
+  * `squadcast-test`
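A note on the `receivers` convention documented above: because every entry in the
label is wrapped by commas, a route can match a receiver anywhere in the list with
one anchored regex and never partially match another receiver name (`slack` vs a
hypothetical `slack-ops`). A minimal sketch of the Alertmanager side, using the
receiver names listed above (the exact route layout is illustrative):

```yaml
route:
  routes:
    # ",slack," appears in the label wherever "slack" sits in the list,
    # so the same pattern matches ",slack," and ",squadcast,slack," alike.
    - receiver: slack-test
      matchers:
        - receivers =~ ".*,slack,.*"
    - receiver: squadcast-test
      matchers:
        - receivers =~ ".*,squadcast,.*"
```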
diff --git a/fleet/lib/prometheus-alertrules/Chart.yaml b/fleet/lib/prometheus-alertrules/Chart.yaml
new file mode 100644
index 000000000..fb14c879f
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: 0.1.0
+description: "Prometheus LSST rules GitOps"
+name: lsst-prometheus-alerts
+version: 0.1.1
diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
new file mode 100644
index 000000000..ae985d000
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/README.md
@@ -0,0 +1,32 @@
+# Prometheus rules GitOps
+
+Any Prometheus rules file defined in the `/rules` directory will be deployed to
+the cluster. It's possible to define a default namespace in the `values.yaml`
+file with the `rules.namespace` key.
+
+## Adding Prometheus rules
+
+1. Write the Prometheus rules in a yaml file according to the [prometheus
+   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+1. Add the YAML file to the `/rules` directory
+1. Commit
+
+## Prometheus rule AURA standards
+
+* `summary` annotation: The `summary` annotation is used to describe a group
+  of incoming alerts. This annotation DOES NOT contain any templated
+  variables and provides a simple single sentence summary of what the alert is
+  about. For example "Disk space full in 24h". When a cluster triggers several
+  alerts, it can be handy to group these alerts into a single notification;
+  this is when the `summary` is used.
+* `description` annotation: This provides a detailed overview of the alert,
+  specific to this instance of the alert. It MAY contain templated variables
+  to enrich the message.
+* `receivers` label: The `receivers` label is used by alertmanager to decide on the
+  routing of the notification for the alert. It consists of a `,` separated list
+  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
+  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
+  in the alertmanager configuration.
+  Currently (20240503) the following receivers are configured:
+  * `slack-test`
+  * `squadcast-test`
diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
new file mode 100644
index 000000000..fc86bac14
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -0,0 +1,16 @@
+---
+defaultNamespace: &name lsst-prometheus-alerts
+labels:
+  bundle: *name
+namespaceLabels:
+  lsst.io/discover: "true"
+helm:
+  releaseName: *name
+  takeOwnership: true
+  timeoutSeconds: 300
+  waitForJobs: false
+  atomic: false
+dependsOn:
+  - selector:
+      matchLabels:
+        bundle: prometheus-operator-crds
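Before the rule files added below, a minimal sketch of what a conforming file
under `rules/` looks like, following the AURA conventions from the README; the
alert name, job selector, and threshold are invented purely for illustration:

```yaml
# rules/example.yaml (hypothetical file)
groups:
  - name: example.rules
    rules:
      - alert: ExampleTargetDown
        expr: up{job="example"} == 0
        for: 5m
        labels:
          severity: warning
          receivers: ",slack,"  # comma-wrapped receiver list, per the standard
        annotations:
          summary: An example scrape target is down
          description: |
            Target {{ $labels.instance }} of job {{ $labels.job }} has been
            unreachable for more than 5 minutes.
```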
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
new file mode 100644
index 000000000..402de0533
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -0,0 +1,35 @@
+groups:
+  - name: "NodeFilesystem"
+    rules:
+      - alert: "NodeFilesystemFillingUp"
+        annotaitons:
+          summary: "The node's disk will fill up within 6h"
+          description: |
+            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
+        expr: |
+          (
+            node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          /
+            node_filesystem_size_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          *
+            100
+          <
+            10
+          and
+            predict_linear(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h],
+              12 * 60 * 60
+            )
+          <
+            0
+          and
+            node_filesystem_readonly{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+          ==
+            0
+          and
+            delta(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h]
+            )
+          <
+            0
+          )
diff --git a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
new file mode 100644
index 000000000..af693a18e
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
@@ -0,0 +1,34 @@
+# yamllint disable-file
+{{- if .Values.rules.enabled }}
+{{- $files := .Files.Glob "rules/**.yaml" }}
+{{- range $rawpath, $content := $files }}
+{{- $path := ($rawpath | lower | replace " " "-") }}
+{{- $ruleDir := dir $path }}
+{{- $ruleFile := base $path }}
+{{- $namespaceSplit := regexSplit "\\/+" $ruleDir -1 }}
+{{- $namespace := $.Values.rules.namespace | default $.Release.Namespace }}
+{{- if (eq (len $namespaceSplit) 2) }}
+{{- $namespace = (index $namespaceSplit 1) }}
+{{- end }}
+{{- $alertName := lower (index (regexSplit "\\.yaml" $ruleFile -1) 0) }}
+---
+apiVersion: monitoring.coreos.com/v1 │
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s" "alert" $alertName | trunc 63 | trimSuffix "-" }}
+  namespace: {{ $namespace }}
+  labels:
+    lsst.io/component: "prometheus-rules"
+    lsst.io/dir: {{ $ruleDir | quote }}
+    lsst.io/file: {{ $ruleFile | quote }}
+    {{- with $.Values.rules.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with $.Values.rules.additionalAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{ $content | toString }}
+{{ end }}
+{{ end }}
diff --git a/fleet/lib/prometheus-alertrules/values.yaml b/fleet/lib/prometheus-alertrules/values.yaml
new file mode 100644
index 000000000..52d53e4a4
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/values.yaml
@@ -0,0 +1,7 @@
+---
+rules:
+  enabled: true
+  namespace: ~
+  additionalAnnotations: {}
+  additionalLabels:
+    lsst.io/rule: "true"
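Reading the Helm template in this patch: every file matched by `rules/**.yaml`
becomes its own PrometheusRule object named `alert-<file>`, and a file nested one
directory deep (`rules/<namespace>/foo.yaml`) is deployed into that namespace
instead of the default. A rough sketch of what `rules/nodes.yaml` should render
to, assuming the chart defaults above and the bundle's `lsst-prometheus-alerts`
release namespace (not a verbatim render):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alert-nodes
  namespace: lsst-prometheus-alerts  # $.Release.Namespace, since rules.namespace is ~
  labels:
    lsst.io/component: "prometheus-rules"
    lsst.io/dir: "rules"
    lsst.io/file: "nodes.yaml"
    lsst.io/rule: "true"
spec:
  # the verbatim content of rules/nodes.yaml is inlined here
  groups:
    - name: "NodeFilesystem"
      # ...
```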
From 0d90c790f893f5bd111029e7cbe996e8e7cb596d Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Fri, 3 May 2024 15:58:49 +0200
Subject: [PATCH 02/16] (fleet/prometheus) modify alerting stack triggers

---
 .../aggregator/values.yaml                     | 24 ++++++++++++++++++-
 .../prometheus-alertrules/rules/nodes.yaml     |  2 +-
 .../templates/prometheusrule.yaml              |  2 +-
 3 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 24a87fdef..53a8669b9 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -45,7 +45,29 @@ alertmanager:
     tls:
       - secretName: tls-alertmanager-ingress
        hosts:
-          - alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
+          - "alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org"
+  config:
+    global:
+      resolve_timeout: 5m
+      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
+    inhibit_rules:
+      - source_matchers:
+          - alertname = "InfoInhibitor"
+        target_matchers:
+          - severity = "info"
+        equal: ["namespace"]
+      - source_matchers:
+          - severity = "critical"
+        target_matchers:
+          - severity =~ "info|warning"
+        equal: ["alertname"]
+      - source_matchers:
+          - severity = "warning"
+        target_matchers:
+          - severity = "info"
+        equal: ["alertname"]
+    templates:
+      - "/etc/alertmanager/configmaps/alertmanager-templates/*.tmpl"

 grafana:
   enabled: true
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index 402de0533..b0a8bc863 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -2,7 +2,7 @@ groups:
   - name: "NodeFilesystem"
     rules:
       - alert: "NodeFilesystemFillingUp"
-        annotaitons:
+        annotations:
          summary: "The node's disk will fill up within 6h"
          description: |
            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
diff --git a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
index af693a18e..3f54756a1 100644
--- a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
+++ b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
@@ -12,7 +12,7 @@
 {{- end }}
 {{- $alertName := lower (index (regexSplit "\\.yaml" $ruleFile -1) 0) }}
 ---
-apiVersion: monitoring.coreos.com/v1 │
+apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   name: {{ printf "%s-%s" "alert" $alertName | trunc 63 | trimSuffix "-" }}
From 0885a7c78876dec166d4394e295fad3169389fb2 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 7 May 2024 19:47:05 +0200
Subject: [PATCH 03/16] (fleet/alertrules) remove default receivers and update
 docs

---
 docs/alerts/README.md                     | 3 ---
 fleet/lib/prometheus-alertrules/README.md | 3 ---
 2 files changed, 6 deletions(-)

diff --git a/docs/alerts/README.md b/docs/alerts/README.md
index c6ed513e6..ac1ab8cc3 100644
--- a/docs/alerts/README.md
+++ b/docs/alerts/README.md
@@ -28,6 +28,3 @@ namespace in the `values.yaml` file with the `rules.namespace` key.
   of receivers, prefixed and suffixed with `,` to make regex matching easier in the
   alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
   in the alertmanager configuration.
-  Currently (20240503) the following receivers are configured:
-  * `slack-test`
-  * `squadcast-test`
diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
index ae985d000..de5e48845 100644
--- a/fleet/lib/prometheus-alertrules/README.md
+++ b/fleet/lib/prometheus-alertrules/README.md
@@ -27,6 +27,3 @@ file with the `rules.namespace` key.
   of receivers, prefixed and suffixed with `,` to make regex matching easier in the
   alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
   in the alertmanager configuration.
-  Currently (20240503) the following receivers are configured:
-  * `slack-test`
-  * `squadcast-test`
From 882d4adc4a81cce81ce2bc4601ea22dc2918240e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Tue, 14 May 2024 18:22:08 +0200
Subject: [PATCH 04/16] (fleet/alerts) add ceph alerts

---
 .../lib/prometheus-alertrules/rules/ceph.yaml | 164 ++++++++++++++++++
 1 file changed, 164 insertions(+)
 create mode 100644 fleet/lib/prometheus-alertrules/rules/ceph.yaml

diff --git a/fleet/lib/prometheus-alertrules/rules/ceph.yaml b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
new file mode 100644
index 000000000..cbe295b63
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -0,0 +1,164 @@
+groups:
+  - name: "ceph.rules"
+    rules:
+      - alert: "CephQuotaFillingUp"
+        annotations:
+          summary: "The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full"
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: "warning"
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
+      - alert: "CephQuotaFillingUp"
+        annotations:
+          summary: "The Ceph pool quota is almost full"
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: "critical"
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
+      - alert: CephTargetDown
+        expr: up{job=~".*ceph.*"} == 0
+        for: 10m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            CEPH target on {{ $labels.prom_cluster }} down for more than 10m,
+            please check - it could be either an exporter crash or a whole
+            cluster crash
+          summary: CEPH exporter down on {{ $labels.prom_cluster }}
+      - alert: CephErrorState
+        expr: ceph_health_status > 1
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Error state on {{ $labels.prom_cluster }} for longer than
+            5m, please check status of pools and OSDs
+          summary: CEPH in ERROR
+      - alert: CephWarnState
+        expr: ceph_health_status == 1
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Warn state on {{ $labels.prom_cluster }} for longer than
+            30m, please check status of pools and OSDs
+          summary: CEPH in WARN
+      - alert: OsdDown
+        expr: ceph_osd_up == 0
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD is down longer than 30 min on {{ $labels.prom_cluster }}, please
+            check what its status is
+          summary: OSD down
+      - alert: OsdApplyLatencyTooHigh
+        expr: ceph_osd_apply_latency_ms > 5000
+        for: 90s
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD latency for {{ $labels.osd }} is too high on {{
+            $labels.prom_cluster }}. Please check whether it is stuck in a weird
+            state
+          summary: OSD latency too high {{ $labels.osd }}
+      - alert: CephPgDown
+        expr: ceph_pg_down > 0
+        for: 3m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are down (unavailable) for too long on {{
+            $labels.prom_cluster }}. Please ensure that all the data are
+            available
+          summary: PG DOWN [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgIncomplete
+        expr: ceph_pg_incomplete > 0
+        for: 2m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are incomplete (unavailable) for too long on {{
+            $labels.prom_cluster }}. Please ensure that all the data are
+            available
+          summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgInconsistent
+        expr: ceph_pg_inconsistent > 0
+        for: 1m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are inconsistent for too long on {{ $labels.prom_cluster
+            }}. Data is available but inconsistent across nodes
+          summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgActivating
+        expr: ceph_pg_activating > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are activating for too long on {{ $labels.prom_cluster
+            }}. Those PGs are unavailable for too long!
+          summary: PG ACTIVATING [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgBackfillTooFull
+        expr: ceph_pg_backfill_toofull > 0
+        for: 5m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are located on full OSD on cluster {{
+            $labels.prom_cluster }}. Those PGs can be unavailable shortly. Please
+            check OSDs, change weight or reconfigure CRUSH rules.
+          summary: PG TOO FULL [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgUnavailable
+        expr: ceph_pg_total - ceph_pg_active > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some groups are unavailable on {{ $labels.prom_cluster }}. Please
+            check their detailed status and current configuration.
+          summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephOsdReweighted
+        expr: ceph_osd_weight < 1
+        for: 1h
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD on cluster {{ $labels.prom_cluster }} was reweighted for too
+            long. Please either create a silence or fix the issue
+          summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.prom_cluster }} reweighted - {{ $value }}
From 75ded139578a516356eb254eadfd7c4858e9b549 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 16 May 2024 15:11:25 +0200
Subject: [PATCH 05/16] (fleet/alerting) remove ceph override

---
 .../overlays/ayekan/values.yaml | 35 ++++++++++++++++++-----
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index b81041535..f931158af 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -201,15 +201,32 @@ alertmanager:
           continue: true
         - receiver: slack-kube-test
           matchers:
-            - alertname =~ "Kube.*"
-        - receiver: slack-node-test
-          group_by: [instance]
-          matchers:
-            - alertname =~ "Node.*"
-        - receiver: slack-network-test
-          group_by: [instance]
-          matchers:
-            - alertname =~ "Network.*"
+            - receivers =~ ".*,slack,.*"
+          continue: true
+          routes:
+            - receiver: slack-kube-test
+              matchers:
+                - alertname =~ "Kube.*"
+            - receiver: slack-node-test
+              group_by: ["instance"]
+              matchers:
+                - alertname =~ "Node.*"
+            - receiver: slack-network-test
+              group_by: ["instance"]
+              matchers:
+                - alertname =~ "Network.*"
+          # Below is an example for the namespace based alert routing.
+          # This will send alerts from a namespace to the namespace specific team
+          # on slack
+          # - receiver: slack-rook-ceph-team
+          #   matchers:
+          #     - namespace = "rook-ceph"
+          # Below is an example for the group based alert routing.
+          # This will send alerts with a specific group in the receiver list to the
+          # alert channel.
+          # - receiver: email-group
+          #   matchers:
+          #     - receivers =~ ".*,group,.*"
 receivers:
   - name: "null"
   - name: watchdog
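A routing tree like the one above can be exercised without deploying anything.
Assuming the rendered Alertmanager configuration is saved locally as
`alertmanager.yaml` (the file name is a placeholder), `amtool` from the
Alertmanager project can report which receiver a given label set would reach; a
sketch:

```sh
# Sketch: check that a rule carrying the ",slack," receivers label and a
# Kube* alert name lands on the slack-kube-test receiver.
amtool config routes test \
  --config.file=alertmanager.yaml \
  receivers=",slack," alertname="KubePodCrashLooping"
```

The command prints the matched receiver name(s), which makes routing changes
like this one easy to review before they ship.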
From e8d6b867dc12f661a45d515954bf86665c17d6cb Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Fri, 17 May 2024 12:18:28 +0200
Subject: [PATCH 06/16] (fleet/alerts) include cluster in slack alerts

---
 .../configmap-alertmanager-templates.yaml | 25 +++++++++++++-------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
index 94c4d49df..6b0a18c35 100644
--- a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
@@ -5,25 +5,25 @@ metadata:
 data:
   slack-generic-alert.tmpl: |
     {{ define "slack.o11y.generic.text" }}
-    *Site:* {{ .CommonLabels.site }}
+    *Site:* {{ .CommonLabels.prom_site }}
     *Alert:* {{ .GroupLabels.alertname }}
     *Summary:* {{ .CommonAnnotations.summary }}
-    {{ template "__o11y_alert_list" . }}
+    {{ template "__o11y_alert_short_list" . }}
     {{ end }}
     {{ define "slack.o11y.generic.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-kube-alert.tmpl: |
     {{ define "slack.o11y.kube.text" }}
     *Alert:* {{ .GroupLabels.alertname }}
-    *Site:* {{ .CommonLabels.site }}
-    *Kube cluster:* {{ .CommonLabels.prom }}
+    *Site:* {{ .CommonLabels.prom_site }}
+    *Kube cluster:* {{ .CommonLabels.prom_cluster }}
     *Namespace:* {{ .GroupLabels.namespace }}
     *Summary:* {{ .CommonAnnotations.summary }}
     {{ template "__o11y_alert_list" . }}
     {{ end }}
     {{ define "slack.o11y.kube.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-network-alert.tmpl: |
     {{ define "slack.o11y.network.text" }}
     *Alert:* {{ .GroupLabels.alertname }}
     *Site:* {{ .CommonLabels.site }}
     *Summary:* {{ .CommonAnnotations.summary }}
     {{ template "__o11y_alert_list" . }}
     {{ end }}
   template-helpers.tmpl: |
+    {{ define "__o11y_alert_title" }}
+    {{ end }}
     {{ define "__o11y_alert_list" }}
     *Alerts:*
     =========
     {{ range .Alerts -}}
     - *Alert:* {{ .Labels.alertname }}
-      *Summary:* {{ .Annotations.summary }}
       *Description:* {{ .Annotations.description }}
       *Severity:* {{ .Labels.severity }}
       *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
     {{ end }}
     {{ end }}
     {{ end }}
+    {{ define "__o11y_alert_short_list" }}
+    *Alerts:*
+    =========
+    {{ range .Alerts -}}
+    - *Alert:* {{ .Labels.alertname }}
+      *Description:* {{ .Annotations.description }}
+      *Severity:* {{ .Labels.severity }}
+      *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+    {{ end }}
+    {{ end }}
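These templates only take effect once a receiver references them, and that wiring
is not shown in this series. A sketch of what it could look like in the
Alertmanager receiver configuration, under that assumption (the channel name is a
placeholder):

```yaml
receivers:
  - name: slack-test
    slack_configs:
      - channel: "#alerts-test"  # placeholder channel
        send_resolved: true
        # Render the Slack notification with the templates from the configmap
        title: '{{ template "slack.o11y.generic.title" . }}'
        text: '{{ template "slack.o11y.generic.text" . }}'
```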
From b44d94c86ba7588c1202643c26bd62bfd978df96 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:29:23 +0200
Subject: [PATCH 07/16] (fleet/alerts) move slack credentials to cluster
 overlay

---
 fleet/lib/kube-prometheus-stack/aggregator/values.yaml      | 1 -
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 1 -
 2 files changed, 2 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 53a8669b9..216f6d6dd 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -49,7 +49,6 @@ alertmanager:
   config:
     global:
       resolve_timeout: 5m
-      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
     inhibit_rules:
       - source_matchers:
          - alertname = "InfoInhibitor"
diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index f931158af..741c54fe6 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -180,7 +180,6 @@ alertmanager:
       - lsst-webhooks
   config:
     global:
-      resolve_timeout: 5m
      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
    route:
      group_by: [alertname, namespace, site]
From 78afed48206dda6bcd2cc6cb7f981dd647c9c03e Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:29:46 +0200
Subject: [PATCH 08/16] (fleet/alerts) remove `atomic` default from fleet.yaml

---
 fleet/lib/prometheus-alertrules/fleet.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
index fc86bac14..8f6a3260b 100644
--- a/fleet/lib/prometheus-alertrules/fleet.yaml
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -9,7 +9,6 @@ helm:
   releaseName: *name
   takeOwnership: true
   timeoutSeconds: 300
   waitForJobs: false
-  atomic: false
 dependsOn:
   - selector:
       matchLabels:
From 9354afaa6a060a236d47cc41c4d1b8ab6ae530c8 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 10:33:26 +0200
Subject: [PATCH 09/16] (fleet/alerts) insert templating values for node disk
 alerts

---
 fleet/lib/prometheus-alertrules/rules/nodes.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index b0a8bc863..5fe214c09 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -5,30 +5,30 @@
         annotations:
           summary: "The node's disk will fill up within 6h"
           description: |
-            Nodes TODO:node_label disk is currently almost full at TODO%. It will fill up within 6 hours.
+            Node {{ $labels.instance }} disk is currently almost full at {{ $value }}%. It will fill up within 6 hours.
         expr: |
           (
-            node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
           /
-            node_filesystem_size_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_size_bytes{fstype!="",job="node-exporter"}
           *
             100
           <
             10
           and
             predict_linear(
-              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h],
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
               12 * 60 * 60
             )
           <
             0
           and
-            node_filesystem_readonly{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}
+            node_filesystem_readonly{fstype!="",job="node-exporter"}
           ==
             0
           and
             delta(
-              node_filesystem_avail_bytes{fstype!="",job="node-exporter",mountpoint=~"/var/lib/(mysql.*|prometheus.*|mongod.*|pgsql.*|es-data.*|rabbitmq.*)"}[6h]
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h]
             )
           <
             0
From 58cd0b1c64288ac5eee09aa51e11011fad5356b2 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Mon, 10 Jun 2024 14:52:22 +0200
Subject: [PATCH 10/16] (fleet/alerts) make yamllint happy

---
 fleet/lib/prometheus-alertrules/Chart.yaml       |  2 +-
 fleet/lib/prometheus-alertrules/rules/ceph.yaml  | 14 +++++------
 fleet/lib/prometheus-alertrules/rules/nodes.yaml |  6 +++---
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/fleet/lib/prometheus-alertrules/Chart.yaml b/fleet/lib/prometheus-alertrules/Chart.yaml
index fb14c879f..a9013689a 100644
--- a/fleet/lib/prometheus-alertrules/Chart.yaml
+++ b/fleet/lib/prometheus-alertrules/Chart.yaml
@@ -1,5 +1,5 @@
 apiVersion: v2
 appVersion: 0.1.0
-description: "Prometheus LSST rules GitOps"
+description: Prometheus LSST rules GitOps
 name: lsst-prometheus-alerts
 version: 0.1.1
diff --git a/fleet/lib/prometheus-alertrules/rules/ceph.yaml b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
index cbe295b63..001ed2031 100644
--- a/fleet/lib/prometheus-alertrules/rules/ceph.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -1,27 +1,27 @@
 groups:
-  - name: "ceph.rules"
+  - name: ceph.rules
     rules:
-      - alert: "CephQuotaFillingUp"
+      - alert: CephQuotaFillingUp
         annotations:
-          summary: "The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full"
+          summary: The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full
           description: |
             Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
             $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
             keep in mind that ceph pools reaching 100% is dangerous.
         labels:
-          severity: "warning"
+          severity: warning
           receivers: ",slack,"
         expr: |
           (ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
-      - alert: "CephQuotaFillingUp"
+      - alert: CephQuotaFillingUp
         annotations:
-          summary: "The Ceph pool quota is almost full"
+          summary: The Ceph pool quota is almost full
           description: |
             Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
             $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
             keep in mind that ceph pools reaching 100% is dangerous.
         labels:
-          severity: "critical"
+          severity: critical
           receivers: ",slack,"
         expr: |
           (ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
index 5fe214c09..2644b3119 100644
--- a/fleet/lib/prometheus-alertrules/rules/nodes.yaml
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -1,9 +1,9 @@
 groups:
-  - name: "NodeFilesystem"
+  - name: NodeFilesystem
     rules:
-      - alert: "NodeFilesystemFillingUp"
+      - alert: NodeFilesystemFillingUp
         annotations:
-          summary: "The node's disk will fill up within 6h"
+          summary: The node's disk will fill up within 6h
           description: |
             Node {{ $labels.instance }} disk is currently almost full at {{ $value }}%. It will fill up within 6 hours.
         expr: |
From b7a82b4df8f617dcf034ccda79fbb4c29129c485 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:23:44 +0200
Subject: [PATCH 11/16] (fleet/prometheusrule) remove explicit timeout

---
 fleet/lib/prometheus-alertrules/fleet.yaml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
index 8f6a3260b..e9b083ecc 100644
--- a/fleet/lib/prometheus-alertrules/fleet.yaml
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -7,7 +7,6 @@
 helm:
   releaseName: *name
   takeOwnership: true
-  timeoutSeconds: 300
   waitForJobs: false
 dependsOn:
   - selector:
From 62d8d608879c8fdc7a98cab427fdb87a1d875588 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:29:08 +0200
Subject: [PATCH 12/16] (fleet/alerts) remove duplicate docs in favor of top
 level

---
 fleet/lib/prometheus-alertrules/README.md | 29 -----------------------
 1 file changed, 29 deletions(-)
 delete mode 100644 fleet/lib/prometheus-alertrules/README.md

diff --git a/fleet/lib/prometheus-alertrules/README.md b/fleet/lib/prometheus-alertrules/README.md
deleted file mode 100644
index de5e48845..000000000
--- a/fleet/lib/prometheus-alertrules/README.md
+++ /dev/null
@@ -1,29 +0,0 @@
-# Prometheus rules GitOps
-
-Any Prometheus rules file defined in the `/rules` directory will be deployed to
-the cluster. It's possible to define a default namespace in the `values.yaml`
-file with the `rules.namespace` key.
-
-## Adding Prometheus rules
-
-1. Write the Prometheus rules in a yaml file according to the [prometheus
-   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
-1. Add the YAML file to the `/rules` directory
-1. Commit
-
-## Prometheus rule AURA standards
-
-* `summary` annotation: The `summary` annotation is used to describe a group
-  of incoming alerts. This annotation DOES NOT contain any templated
-  variables and provides a simple single sentence summary of what the alert is
-  about. For example "Disk space full in 24h". When a cluster triggers several
-  alerts, it can be handy to group these alerts into a single notification;
-  this is when the `summary` is used.
-* `description` annotation: This provides a detailed overview of the alert,
-  specific to this instance of the alert. It MAY contain templated variables
-  to enrich the message.
-* `receivers` label: The `receivers` label is used by alertmanager to decide on the
-  routing of the notification for the alert. It consists of a `,` separated list
-  of receivers, prefixed and suffixed with `,` to make regex matching easier in the
-  alertmanager. For example: `,slack,squadcast,email,` The receivers are defined
-  in the alertmanager configuration.
From 22f5863d4957a319a2806b7288a25c009f82e8a6 Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 13 Jun 2024 17:31:37 +0200
Subject: [PATCH 13/16] (fleet/prom-stack) make yamllint happy

---
 fleet/lib/kube-prometheus-stack/aggregator/values.yaml | 10 +++++-----
 .../kube-prometheus-stack/overlays/ayekan/values.yaml  |  4 ++--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 216f6d6dd..439b49ae2 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -45,7 +45,7 @@ alertmanager:
     tls:
       - secretName: tls-alertmanager-ingress
        hosts:
-          - "alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org"
+          - alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
   config:
     global:
       resolve_timeout: 5m
@@ -54,19 +54,19 @@ alertmanager:
        - alertname = "InfoInhibitor"
          target_matchers:
            - severity = "info"
-        equal: ["namespace"]
+        equal: [namespace]
       - source_matchers:
           - severity = "critical"
         target_matchers:
           - severity =~ "info|warning"
-        equal: ["alertname"]
+        equal: [alertname]
       - source_matchers:
           - severity = "warning"
         target_matchers:
           - severity = "info"
-        equal: ["alertname"]
+        equal: [alertname]
     templates:
-      - "/etc/alertmanager/configmaps/alertmanager-templates/*.tmpl"
+      - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 741c54fe6..56dfbcaf0 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -207,11 +207,11 @@ alertmanager:
           matchers:
             - alertname =~ "Kube.*"
         - receiver: slack-node-test
-          group_by: ["instance"]
+          group_by: [instance]
           matchers:
             - alertname =~ "Node.*"
         - receiver: slack-network-test
-          group_by: ["instance"]
+          group_by: [instance]
           matchers:
             - alertname =~ "Network.*"
           # Below is an example for the namespace based alert routing.
From 5735726e091fe90d595154cf564d79bc1381f19f Mon Sep 17 00:00:00 2001
From: Francis Begyn
Date: Thu, 4 Jul 2024 12:14:51 +0200
Subject: [PATCH 14/16] Update externalsecret-grafana-keycloak-credentials.yaml

Use the correct key for the secret instead of the older one.
---
 .../externalsecret-grafana-keycloak-credentials.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
index 008591475..8401e6fa5 100644
--- a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
@@ -18,4 +18,4 @@ spec:
     - secretKey: keycloak_url
       remoteRef:
         key: *item
-        property: hostname
+        property: url
From a9e34dcd3928b1915f9eaa7ac83b76e1677eec58 Mon Sep 17 00:00:00 2001
From: Kris Buytaert
Date: Fri, 5 Jul 2024 08:57:26 +0200
Subject: [PATCH 15/16] Disabling default deployment of K8s dashboards on
 grafana, in order to deploy our own

---
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 56dfbcaf0..8b9ef16ca 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -1,5 +1,6 @@
 ---
 prometheus:
+
   prometheusSpec:
     configMaps:
       - sd-snmp-network
@@ -276,3 +277,5 @@ alertmanager:
         equal: [alertname]
     templates:
       - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
+grafana:
+    defaultDashboardsEnabled: false
From a024f6c41fc4eee8eb4063e651d206700b610430 Mon Sep 17 00:00:00 2001
From: Kris Buytaert
Date: Fri, 5 Jul 2024 10:23:36 +0200
Subject: [PATCH 16/16] Fixing Yaml

---
 fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index 8b9ef16ca..95a671dc0 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -278,4 +278,4 @@ alertmanager:
       - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
 grafana:
-    defaultDashboardsEnabled: false
+  defaultDashboardsEnabled: false