(fleet/prometheus rules) GitOps prom rules #346

Open · wants to merge 16 commits into base: master

30 changes: 30 additions & 0 deletions docs/alerts/README.md
@@ -0,0 +1,30 @@
# Prometheus rules GitOps

Any Prometheus rules file defined in the
[fleet/lib/prometheus-alertrules/rules](../../prometheus-alertrules/rules)
directory will be deployed to the cluster. It's possible to define a default
namespace in the `values.yaml` file with the `rules.namespace` key.
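
For reference, a minimal sketch of what that `values.yaml` entry could look
like (the namespace value here is illustrative, not the chart's actual
default):

```yaml
rules:
  # default namespace used for rules files that do not specify their own
  namespace: lsst-prometheus-alerts
```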

## Adding Prometheus rules

1. Write the Prometheus rules in a YAML file according to the [Prometheus
specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/),
as in the sketch below.
1. Add the YAML file to the `/rules` directory.
1. Commit.
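
As a sketch of the steps above, a hypothetical `rules/example.yaml` could look
like this (the alert name, expression, and threshold are placeholders, not
rules shipped by this chart):

```yaml
groups:
  - name: example.rules
    rules:
      - alert: NodeDiskFillingUp
        # fires when the filesystem is predicted to run out of space within 24h
        expr: predict_linear(node_filesystem_avail_bytes[6h], 24 * 3600) < 0
        for: 30m
        labels:
          severity: warning
          receivers: ",slack,"
        annotations:
          summary: Disk space full in 24h
          description: |
            Filesystem {{ $labels.mountpoint }} on {{ $labels.instance }} is
            predicted to run out of space within 24 hours.
```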

## Prometheus rule AURA standards

* `summary` annotation: The `summary` annotation is used to describe a group
of incoming alerts. This annotation DOES NOT contain any templated
Member

Why shouldn't templated values be in the summary?

Contributor Author

I've written this quite broadly, for all levels of Prometheus alerting understanding. The tricky part is that you want the summary to be more or less the same for all the alerts in the same grouping. The risk of simply allowing templated variables in the summary is that each summary will be unique, making alert grouping less useful IMHO.

Not really a technical limitation, more dependent on the "alert standard".

Member

I am having trouble coming up with a scenario where the summary would need to be used for grouping instead of the alertname. I think that having interpolated values in the summary is useful information. Alertmanager's grouping is rather primitive, and in most cases we don't want any grouping unless there is a high alert volume, which is something alertmanager isn't capable of doing. I suspect we will end up shipping everything to squadcast ungrouped anyway.

Contributor Author

I'll adjust the config and standard to that use case then (no grouping unless specified). In that use case the summary can be templated as desired.

In what I described, the summary wouldn't really be used to group. Say you group on alertname,cluster. The summary could then be used to provide short bursts of information about the alert, without overloading the alert message with specific details if 25 pods are crashing in a single cluster. The only "useful" template value then used in the summary would really be cluster, so it's sometimes easier to avoid them completely.

However, if grouping is the exception rather than the norm (as would be the case if alerts are sent to downstream systems), the summary can be formatted however you like. Although in that case I'd argue that the difference between a summary and a description is negligible and you might as well just use the description.

variables and provides a simple, single-sentence summary of what the alert is
about, for example "Disk space full in 24h". When a cluster triggers several
alerts, it can be handy to group them into a single notification; this is
when the `summary` is used.
* `description` annotation: This provides a detailed overview of the alert,
specific to this instance of the alert. It MAY contain templated variables
to enrich the message.
* `receiver` label: The receiver label is used by alertmanager to decide on the
routing of the notification for the alert. It consists of a `,`-separated list
of receivers, prefixed and suffixed with `,` to make regex matching easier in
the alertmanager, for example: `,slack,squadcast,email,`. The receivers are
defined in the alertmanager configuration. A routing sketch follows below.
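
As a sketch of how the `receiver` label is consumed on the alertmanager side
(receiver names are illustrative; the actual routes live in the
kube-prometheus-stack values files):

```yaml
route:
  routes:
    - receiver: slack-test
      matchers:
        # the leading and trailing commas make exact-word matching easy:
        # ",slack,squadcast,email," matches, while a list containing only
        # ",slack-other," does not
        - receivers =~ ".*,slack,.*"
      continue: true
```
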
@@ -5,25 +5,25 @@ metadata:
data:
slack-generic-alert.tmpl: |
{{ define "slack.o11y.generic.text" }}
*Site:* {{ .CommonLabels.site }}
*Site:* {{ .CommonLabels.prom_site }}
*Alert:* {{ .GroupLabels.alertname }}
*Summary:* {{ .CommonAnnotations.summary }}
{{ template "__o11y_alert_list" . }}
{{ template "__o11y_alert_short_list" . }}
{{ end }}
{{ define "slack.o11y.generic.title"}}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.alertname }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.alertname }}
{{ end }}
slack-kube-alert.tmpl: |
{{ define "slack.o11y.kube.text" }}
*Alert:* {{ .GroupLabels.alertname }}
*Site:* {{ .CommonLabels.site }}
*Kube cluster:* {{ .CommonLabels.prom }}
*Site:* {{ .CommonLabels.prom_site }}
*Kube cluster:* {{ .CommonLabels.prom_cluster }}
*Namespace:* {{ .GroupLabels.namespace }}
*Summary:* {{ .CommonAnnotations.summary }}
{{ template "__o11y_alert_list" . }}
{{ end }}
{{ define "slack.o11y.kube.title"}}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
{{ end }}
slack-network-alert.tmpl: |
{{ define "slack.o11y.network.text" }}
@@ -36,12 +36,13 @@ data:
{{ template "__o11y_alert_list" . }}
{{ end }}
template-helpers.tmpl: |
{{ define "__o11y_alert_title" }}
{{ end }}
{{ define "__o11y_alert_list" }}
*Alerts:*
=========
{{ range .Alerts -}}
- *Alert:* {{ .Labels.alertname }}
*Summary:* {{ .Annotations.summary }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
*Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
@@ -51,3 +52,13 @@ data:
{{ end }}
{{ end }}
{{ end }}
{{ define "__o11y_alert_short_list" }}
*Alerts:*
=========
{{ range .Alerts -}}
- *Alert:* {{ .Labels.alertname }}
*Description:* {{ .Annotations.description }}
*Severity:* {{ .Labels.severity }}
*Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
{{ end }}
{{ end }}
@@ -18,4 +18,4 @@ spec:
- secretKey: keycloak_url
remoteRef:
key: *item
property: hostname
property: url
21 changes: 21 additions & 0 deletions fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -46,6 +46,27 @@ alertmanager:
- secretName: tls-alertmanager-ingress
hosts:
- alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
config:
global:
resolve_timeout: 5m
inhibit_rules:
- source_matchers:
- alertname = "InfoInhibitor"
target_matchers:
- severity = "info"
equal: [namespace]
- source_matchers:
- severity = "critical"
target_matchers:
- severity =~ "info|warning"
equal: [alertname]
- source_matchers:
- severity = "warning"
target_matchers:
- severity = "info"
equal: [alertname]
templates:
- /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl

grafana:
enabled: true
39 changes: 29 additions & 10 deletions fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -1,5 +1,6 @@
---
prometheus:

prometheusSpec:
configMaps:
- sd-snmp-network
@@ -180,7 +181,6 @@ alertmanager:
- lsst-webhooks
config:
global:
resolve_timeout: 5m
slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
route:
group_by: [alertname, namespace, site]
@@ -201,15 +201,32 @@ alertmanager:
continue: true
- receiver: slack-kube-test
matchers:
- alertname =~ "Kube.*"
- receiver: slack-node-test
group_by: [instance]
matchers:
- alertname =~ "Node.*"
- receiver: slack-network-test
group_by: [instance]
matchers:
- alertname =~ "Network.*"
- receivers =~ ".*,slack,.*"
continue: true
routes:
- receiver: slack-kube-test
matchers:
- alertname =~ "Kube.*"
- receiver: slack-node-test
group_by: [instance]
matchers:
- alertname =~ "Node.*"
- receiver: slack-network-test
group_by: [instance]
matchers:
- alertname =~ "Network.*"
# Below is an example for the namespace based alert routing.
# This will send alerts from a namespace to the namespace specific team
# on slack
# - receiver: slack-rook-ceph-team
# matchers:
# - namespace = "rook-ceph"
# Below is an example for the group based alert routing.
# This will send alerts with a specific group in the receiver list to the
# alert channel.
# - receiver: email-group
# matchers:
# - receivers =~ ".*,group,.*"
receivers:
- name: "null"
- name: watchdog
@@ -260,3 +277,5 @@ alertmanager:
equal: [alertname]
templates:
- /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
grafana:
defaultDashboardsEnabled: false
5 changes: 5 additions & 0 deletions fleet/lib/prometheus-alertrules/Chart.yaml
@@ -0,0 +1,5 @@
apiVersion: v2
appVersion: 0.1.0
description: Prometheus LSST rules GitOps
name: lsst-prometheus-alerts
version: 0.1.1
14 changes: 14 additions & 0 deletions fleet/lib/prometheus-alertrules/fleet.yaml
@@ -0,0 +1,14 @@
---
defaultNamespace: &name lsst-prometheus-alerts
labels:
bundle: *name
namespaceLabels:
lsst.io/discover: "true"
helm:
releaseName: *name
takeOwnership: true
waitForJobs: false
dependsOn:
- selector:
matchLabels:
bundle: prometheus-operator-crds
164 changes: 164 additions & 0 deletions fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -0,0 +1,164 @@
groups:
- name: ceph.rules
rules:
- alert: CephQuotaFillingUp
annotations:
summary: The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full
description: |
Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
$labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
keep in mind that ceph pools reaching 100% is dangerous.
labels:
severity: warning
receivers: ",slack,"
expr: |
(ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
- alert: CephQuotaFillingUp
annotations:
summary: The Ceph pool quota is almost full
description: |
Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
$labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
keep in mind that ceph pools reaching 100% is dangerous.
labels:
severity: critical
receivers: ",slack,"
expr: |
(ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
- alert: CephTargetDown
expr: up{job=".*ceph.*"} == 0
Member

The wildcard matching does not seem to be working with prometheus on ayekan.

Demonstration that there are up metrics with a label that includes ceph:
[screenshot]

However, the expr in this rule doesn't match anything:
[screenshot]
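
For reference, plain PromQL label matchers are exact string matches; wildcard-style matching needs the regex operator, so the expression above presumably wants something like the following (a suggestion, not a change included in this diff):

```yaml
expr: up{job=~".*ceph.*"} == 0
```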

for: 10m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
CEPH target on {{ $labels.prom_cluster }} down for more than 10m,
please check - it could be either an exporter crash or a whole cluster
crash
summary: CEPH exporter down on {{ $labels.prom_cluster }}
- alert: CephErrorState
expr: ceph_health_status > 1
for: 5m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
Ceph is in Error state on {{ $labels.prom_cluster }} for longer than
5m, please check status of pools and OSDs
summary: CEPH in ERROR
- alert: CephWarnState
expr: ceph_health_status == 1
for: 30m
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
Ceph is in Warn state on {{ $labels.prom_cluster }} for longer than
30m, please check status of pools and OSDs
summary: CEPH in WARN
- alert: OsdDown
expr: ceph_osd_up == 0
for: 30m
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
OSD has been down for longer than 30 min on {{ $labels.prom_cluster }},
please check its status
summary: OSD down
- alert: OsdApplyLatencyTooHigh
expr: ceph_osd_apply_latency_ms > 5000
for: 90s
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
OSD latency for {{ $labels.osd }} is too high on {{
$labels.prom_cluster }}. Please check whether it is stuck in a weird
state
summary: OSD latency too high {{ $labels.osd }}
- alert: CephPgDown
expr: ceph_pg_down > 0
for: 3m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
Some groups are down (unavailable) for too long on {{
$labels.prom_cluster }}. Please ensure that all the data are
available
summary: PG DOWN [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephPgIncomplete
expr: ceph_pg_incomplete > 0
for: 2m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
Some groups are incomplete (unavailable) for too long on {{
$labels.prom_cluster }}. Please ensure that all the data are
available
summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephPgInconsistent
expr: ceph_pg_inconsistent > 0
for: 1m
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
Some groups are inconsistent for too long on {{ $labels.prom_cluster
}}. Data is available but inconsistent across nodes
summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephPgActivating
expr: ceph_pg_activating > 0
for: 5m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
Some groups are activating for too long on {{ $labels.prom_cluster
}}. Those PGs are unavailable for too long!
summary: PG ACTIVATING [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephPgBackfillTooFull
expr: ceph_pg_backfill_toofull > 0
for: 5m
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
Some groups are located on a full OSD on cluster {{
$labels.prom_cluster }}. Those PGs can be unavailable shortly. Please
check OSDs, change weight or reconfigure CRUSH rules.
summary: PG TOO FULL [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephPgUnavailable
expr: ceph_pg_total - ceph_pg_active > 0
for: 5m
labels:
severity: critical
receivers: ",slack,"
annotations:
description: |
Some groups are unavailable on {{ $labels.prom_cluster }}. Please
check their detailed status and current configuration.
summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.prom_cluster }}
- alert: CephOsdReweighted
expr: ceph_osd_weight < 1
for: 1h
labels:
severity: warning
receivers: ",slack,"
annotations:
description: |
OSD on cluster {{ $labels.prom_cluster }} has been reweighted for too long.
Please either create a silence or fix the issue
summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.prom_cluster }} reweighted - {{ $value }}