diff --git a/docs/alerts/README.md b/docs/alerts/README.md
new file mode 100644
index 000000000..ac1ab8cc3
--- /dev/null
+++ b/docs/alerts/README.md
@@ -0,0 +1,30 @@
+# Prometheus rules GitOps
+
+Any Prometheus rules file defined in the
+[fleet/lib/prometheus-alertrules/rules](../../fleet/lib/prometheus-alertrules/rules)
+directory will be deployed to the cluster. A default namespace can be defined
+in the `values.yaml` file with the `rules.namespace` key.
+
+## Adding Prometheus rules
+
+1. Write the Prometheus rules in a YAML file according to the [Prometheus
+   specification](https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/).
+1. Add the YAML file to the `/rules` directory.
+1. Commit.
+
+## Prometheus rule AURA standards
+
+* `summary` annotation: The `summary` annotation describes a group of incoming
+  alerts. It DOES NOT contain any templated variables and provides a single
+  sentence summarising what the alert is about, for example "Disk space full
+  in 24h". When a cluster triggers several alerts, it can be handy to group
+  them into a single notification; the `summary` is what that grouped
+  notification shows.
+* `description` annotation: This provides a detailed overview of this specific
+  instance of the alert. It MAY contain templated variables to enrich the
+  message.
+* `receivers` label: The `receivers` label is used by Alertmanager to decide
+  how to route the notification for the alert. It consists of a `,`-separated
+  list of receivers, prefixed and suffixed with `,` to make regex matching
+  easier in the Alertmanager configuration, for example `,slack,squadcast,email,`.
+  The receivers themselves are defined in the Alertmanager configuration.
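For illustration (not part of this change), a rules file that follows these AURA standards might look like the sketch below; the alert name, threshold, and receiver list are hypothetical:

```yaml
groups:
  - name: example.rules
    rules:
      - alert: NodeDiskSpaceFillingUp        # hypothetical alert
        expr: |
          (node_filesystem_avail_bytes / node_filesystem_size_bytes) * 100 < 15
        for: 30m
        labels:
          severity: warning
          # Comma-wrapped so Alertmanager routes can match e.g. ",slack,"
          receivers: ",slack,email,"
        annotations:
          # One plain sentence, no templated variables
          summary: Disk space is filling up
          # Per-instance detail, templating allowed
          description: |
            Filesystem on {{ $labels.instance }} has only {{ $value }}% space left.
```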
diff --git a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
index 94c4d49df..6b0a18c35 100644
--- a/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/configmap-alertmanager-templates.yaml
@@ -5,25 +5,25 @@ metadata:
 data:
   slack-generic-alert.tmpl: |
     {{ define "slack.o11y.generic.text" }}
-    *Site:* {{ .CommonLabels.site }}
+    *Site:* {{ .CommonLabels.prom_site }}
     *Alert:* {{ .GroupLabels.alertname }}
     *Summary:* {{ .CommonAnnotations.summary }}
-    {{ template "__o11y_alert_list" . }}
+    {{ template "__o11y_alert_short_list" . }}
     {{ end }}
     {{ define "slack.o11y.generic.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-kube-alert.tmpl: |
     {{ define "slack.o11y.kube.text" }}
     *Alert:* {{ .GroupLabels.alertname }}
-    *Site:* {{ .CommonLabels.site }}
-    *Kube cluster:* {{ .CommonLabels.prom }}
+    *Site:* {{ .CommonLabels.prom_site }}
+    *Kube cluster:* {{ .CommonLabels.prom_cluster }}
     *Namespace:* {{ .GroupLabels.namespace }}
     *Summary:* {{ .CommonAnnotations.summary }}
     {{ template "__o11y_alert_list" . }}
     {{ end }}
     {{ define "slack.o11y.kube.title"}}
-    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
+    [{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .CommonLabels.prom_cluster }}/{{ .GroupLabels.namespace }}/{{ .GroupLabels.alertname }}
     {{ end }}
   slack-network-alert.tmpl: |
     {{ define "slack.o11y.network.text" }}
@@ -36,12 +36,13 @@ data:
     {{ template "__o11y_alert_list" . }}
     {{ end }}
   template-helpers.tmpl: |
+    {{ define "__o11y_alert_title" }}
+    {{ end }}
     {{ define "__o11y_alert_list" }}
     *Alerts:*
     =========
     {{ range .Alerts -}}
     - *Alert:* {{ .Labels.alertname }}
-      *Summary:* {{ .Annotations.summary }}
       *Description:* {{ .Annotations.description }}
       *Severity:* {{ .Labels.severity }}
       *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
@@ -51,3 +52,13 @@ data:
     {{ end }}
     {{ end }}
     {{ end }}
+    {{ define "__o11y_alert_short_list" }}
+    *Alerts:*
+    =========
+    {{ range .Alerts -}}
+    - *Alert:* {{ .Labels.alertname }}
+      *Description:* {{ .Annotations.description }}
+      *Severity:* {{ .Labels.severity }}
+      *Time:* {{ .StartsAt.Format "2006-01-02 15:04:05 MST" }}
+    {{ end }}
+    {{ end }}
diff --git a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
index 008591475..8401e6fa5 100644
--- a/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
+++ b/fleet/lib/kube-prometheus-stack-pre/externalsecret-grafana-keycloak-credentials.yaml
@@ -18,4 +18,4 @@ spec:
     - secretKey: keycloak_url
       remoteRef:
         key: *item
-        property: hostname
+        property: url
diff --git a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
index 24a87fdef..439b49ae2 100644
--- a/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/aggregator/values.yaml
@@ -46,6 +46,27 @@ alertmanager:
       - secretName: tls-alertmanager-ingress
         hosts:
           - alertmanager.${ .ClusterName }.${ .ClusterLabels.site }.lsst.org
+  config:
+    global:
+      resolve_timeout: 5m
+    inhibit_rules:
+      - source_matchers:
+          - alertname = "InfoInhibitor"
+        target_matchers:
+          - severity = "info"
+        equal: [namespace]
+      - source_matchers:
+          - severity = "critical"
+        target_matchers:
+          - severity =~ "info|warning"
+        equal: [alertname]
+      - source_matchers:
+          - severity = "warning"
+        target_matchers:
+          - severity = "info"
+        equal: [alertname]
+    templates:
+      - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
 
 grafana:
   enabled: true
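These templates only take effect where an Alertmanager receiver references them. As a sketch (the receiver name and Slack channel are hypothetical; the real receivers are defined in the overlay values), a Slack receiver could render them like this:

```yaml
receivers:
  - name: slack-generic              # hypothetical receiver
    slack_configs:
      - channel: "#observability"    # hypothetical channel
        send_resolved: true
        # Render the shared templates from the ConfigMap mounted above
        title: '{{ template "slack.o11y.generic.title" . }}'
        text: '{{ template "slack.o11y.generic.text" . }}'
```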
diff --git a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
index b81041535..95a671dc0 100644
--- a/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
+++ b/fleet/lib/kube-prometheus-stack/overlays/ayekan/values.yaml
@@ -1,5 +1,6 @@
 ---
 prometheus:
+  prometheusSpec:
     configMaps:
       - sd-snmp-network
 
@@ -180,7 +181,6 @@ alertmanager:
       - lsst-webhooks
   config:
     global:
-      resolve_timeout: 5m
      slack_api_url_file: /etc/alertmanager/secrets/lsst-webhooks/slack-test
    route:
      group_by: [alertname, namespace, site]
@@ -201,15 +201,32 @@ alertmanager:
          continue: true
        - receiver: slack-kube-test
          matchers:
-           - alertname =~ "Kube.*"
-       - receiver: slack-node-test
-         group_by: [instance]
-         matchers:
-           - alertname =~ "Node.*"
-       - receiver: slack-network-test
-         group_by: [instance]
-         matchers:
-           - alertname =~ "Network.*"
+           - receivers =~ ".*,slack,.*"
+         continue: true
+         routes:
+           - receiver: slack-kube-test
+             matchers:
+               - alertname =~ "Kube.*"
+           - receiver: slack-node-test
+             group_by: [instance]
+             matchers:
+               - alertname =~ "Node.*"
+           - receiver: slack-network-test
+             group_by: [instance]
+             matchers:
+               - alertname =~ "Network.*"
+       # Below is an example of namespace-based alert routing.
+       # It sends alerts from a given namespace to that namespace's team
+       # channel on Slack.
+       # - receiver: slack-rook-ceph-team
+       #   matchers:
+       #     - namespace = "rook-ceph"
+       # Below is an example of group-based alert routing.
+       # It sends alerts that carry a specific group in their receivers label
+       # to that group's alert channel.
+       # - receiver: email-group
+       #   matchers:
+       #     - receivers =~ ".*,group,.*"
     receivers:
       - name: "null"
      - name: watchdog
@@ -260,3 +277,5 @@ alertmanager:
        equal: [alertname]
    templates:
      - /etc/alertmanager/configmaps/alertmanager-templates/*.tmpl
+grafana:
+  defaultDashboardsEnabled: false
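To make the `receivers`-label routing introduced above concrete, here is a sketch of how a single alert would travel through the tree; the alert and its label values are hypothetical:

```yaml
# Labels carried by a hypothetical firing alert
labels:
  alertname: KubePodCrashLooping
  namespace: kube-system
  severity: warning
  receivers: ",slack,email,"
# The alert matches the parent route's `receivers =~ ".*,slack,.*"` matcher,
# then the nested `alertname =~ "Kube.*"` route, so it is delivered to the
# slack-kube-test receiver; because the parent route sets `continue: true`,
# evaluation also carries on to that route's siblings.
```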
diff --git a/fleet/lib/prometheus-alertrules/Chart.yaml b/fleet/lib/prometheus-alertrules/Chart.yaml
new file mode 100644
index 000000000..a9013689a
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/Chart.yaml
@@ -0,0 +1,5 @@
+apiVersion: v2
+appVersion: 0.1.0
+description: Prometheus LSST rules GitOps
+name: lsst-prometheus-alerts
+version: 0.1.1
diff --git a/fleet/lib/prometheus-alertrules/fleet.yaml b/fleet/lib/prometheus-alertrules/fleet.yaml
new file mode 100644
index 000000000..e9b083ecc
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/fleet.yaml
@@ -0,0 +1,14 @@
+---
+defaultNamespace: &name lsst-prometheus-alerts
+labels:
+  bundle: *name
+namespaceLabels:
+  lsst.io/discover: "true"
+helm:
+  releaseName: *name
+  takeOwnership: true
+  waitForJobs: false
+dependsOn:
+  - selector:
+      matchLabels:
+        bundle: prometheus-operator-crds
diff --git a/fleet/lib/prometheus-alertrules/rules/ceph.yaml b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
new file mode 100644
index 000000000..001ed2031
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/ceph.yaml
@@ -0,0 +1,164 @@
+groups:
+  - name: ceph.rules
+    rules:
+      - alert: CephQuotaFillingUp
+        annotations:
+          summary: The Ceph pool quota in cluster {{ $labels.prom_cluster }} is almost full
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.75 and ceph_pool_quota_bytes != 0)*100
+      - alert: CephQuotaFillingUp
+        annotations:
+          summary: The Ceph pool quota is almost full
+          description: |
+            Ceph pool id {{ $labels.pool_id }} on {{ $labels.prom_cluster }}/ {{
+            $labels.namespace }}/{{ $labels.pod }} is at {{ $value }}%. Please
+            keep in mind that ceph pools reaching 100% is dangerous.
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        expr: |
+          (ceph_pool_stored/ceph_pool_quota_bytes > 0.9 and ceph_pool_quota_bytes != 0)*100
+      - alert: CephTargetDown
+        expr: up{job=~".*ceph.*"} == 0
+        for: 10m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            CEPH target on {{ $labels.prom_cluster }} has been down for more
+            than 10m, please check - it could be either an exporter crash or a
+            whole cluster crash
+          summary: CEPH exporter down on {{ $labels.prom_cluster }}
+      - alert: CephErrorState
+        expr: ceph_health_status > 1
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Error state on {{ $labels.prom_cluster }} for longer than
+            5m, please check the status of pools and OSDs
+          summary: CEPH in ERROR
+      - alert: CephWarnState
+        expr: ceph_health_status == 1
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Ceph is in Warn state on {{ $labels.prom_cluster }} for longer than
+            30m, please check the status of pools and OSDs
+          summary: CEPH in WARN
+      - alert: OsdDown
+        expr: ceph_osd_up == 0
+        for: 30m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD has been down for longer than 30 minutes on {{
+            $labels.prom_cluster }}, please check its status
+          summary: OSD down
+      - alert: OsdApplyLatencyTooHigh
+        expr: ceph_osd_apply_latency_ms > 5000
+        for: 90s
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD latency for {{ $labels.osd }} is too high on {{
+            $labels.prom_cluster }}. Please check that it is not stuck in a
+            weird state
+          summary: OSD latency too high {{ $labels.osd }}
+      - alert: CephPgDown
+        expr: ceph_pg_down > 0
+        for: 3m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups are down (unavailable) for too long on {{
+            $labels.prom_cluster }}. Please ensure that all the data is
+            available
+          summary: PG DOWN [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgIncomplete
+        expr: ceph_pg_incomplete > 0
+        for: 2m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups are incomplete (unavailable) for too long on
+            {{ $labels.prom_cluster }}. Please ensure that all the data is
+            available
+          summary: PG INCOMPLETE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgInconsistent
+        expr: ceph_pg_inconsistent > 0
+        for: 1m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups are inconsistent for too long on {{
+            $labels.prom_cluster }}. Data is available but inconsistent across
+            nodes
+          summary: PG INCONSISTENT [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgActivating
+        expr: ceph_pg_activating > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups have been activating for too long on {{
+            $labels.prom_cluster }}. Those PGs are unavailable for too long!
+          summary: PG ACTIVATING [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgBackfillTooFull
+        expr: ceph_pg_backfill_toofull > 0
+        for: 5m
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups are located on a full OSD on cluster {{
+            $labels.prom_cluster }}. Those PGs can become unavailable shortly.
+            Please check the OSDs, change their weight, or reconfigure CRUSH rules.
+          summary: PG TOO FULL [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephPgUnavailable
+        expr: ceph_pg_total - ceph_pg_active > 0
+        for: 5m
+        labels:
+          severity: critical
+          receivers: ",slack,"
+        annotations:
+          description: |
+            Some placement groups are unavailable on {{ $labels.prom_cluster }}.
+            Please check their detailed status and current configuration.
+          summary: PG UNAVAILABLE [{{ $value }}] on {{ $labels.prom_cluster }}
+      - alert: CephOsdReweighted
+        expr: ceph_osd_weight < 1
+        for: 1h
+        labels:
+          severity: warning
+          receivers: ",slack,"
+        annotations:
+          description: |
+            OSD on cluster {{ $labels.prom_cluster }} has been reweighted for
+            too long. Please either create a silence or fix the issue
+          summary: OSD {{ $labels.ceph_daemon }} on {{ $labels.prom_cluster }} reweighted - {{ $value }}
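Rules files like `ceph.yaml` can be unit-tested with `promtool test rules` before they are committed. Below is a minimal sketch for `CephTargetDown`; the test file name, job and cluster label values are hypothetical, and it relies on the regex `job` matcher used in the expression above:

```yaml
# promtool test rules ceph-test.yaml
rule_files:
  - ceph.yaml
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      # A Ceph exporter target that stays down for 15 minutes
      - series: 'up{job="rook-ceph-exporter", prom_cluster="example"}'
        values: "0x15"
    alert_rule_test:
      - eval_time: 12m
        alertname: CephTargetDown
        exp_alerts:
          - exp_labels:
              job: rook-ceph-exporter
              prom_cluster: example
              severity: critical
              receivers: ",slack,"
```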
diff --git a/fleet/lib/prometheus-alertrules/rules/nodes.yaml b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
new file mode 100644
index 000000000..2644b3119
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/rules/nodes.yaml
@@ -0,0 +1,35 @@
+groups:
+  - name: NodeFilesystem
+    rules:
+      - alert: NodeFilesystemFillingUp
+        annotations:
+          summary: The node's disk will fill up within the next 12 hours
+          description: |
+            Node {{ $labels.instance }} disk is almost full ({{ $value }}% available) and is predicted to fill up within the next 12 hours.
+        expr: |
+          (
+            node_filesystem_avail_bytes{fstype!="",job="node-exporter"}
+          /
+            node_filesystem_size_bytes{fstype!="",job="node-exporter"}
+          *
+            100
+          <
+            10
+          and
+            predict_linear(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h],
+              12 * 60 * 60
+            )
+          <
+            0
+          and
+            node_filesystem_readonly{fstype!="",job="node-exporter"}
+          ==
+            0
+          and
+            delta(
+              node_filesystem_avail_bytes{fstype!="",job="node-exporter"}[6h]
+            )
+          <
+            0
+          )
diff --git a/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
new file mode 100644
index 000000000..3f54756a1
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/templates/prometheusrule.yaml
@@ -0,0 +1,34 @@
+# yamllint disable-file
+{{- if .Values.rules.enabled }}
+{{- $files := .Files.Glob "rules/**.yaml" }}
+{{- range $rawpath, $content := $files }}
+{{- $path := ($rawpath | lower | replace " " "-") }}
+{{- $ruleDir := dir $path }}
+{{- $ruleFile := base $path }}
+{{- $namespaceSplit := regexSplit "\\/+" $ruleDir -1 }}
+{{- $namespace := $.Values.rules.namespace | default $.Release.Namespace }}
+{{- if (eq (len $namespaceSplit) 2) }}
+{{- $namespace = (index $namespaceSplit 1) }}
+{{- end }}
+{{- $alertName := lower (index (regexSplit "\\.yaml" $ruleFile -1) 0) }}
+---
+apiVersion: monitoring.coreos.com/v1
+kind: PrometheusRule
+metadata:
+  name: {{ printf "%s-%s" "alert" $alertName | trunc 63 | trimSuffix "-" }}
+  namespace: {{ $namespace }}
+  labels:
+    lsst.io/component: "prometheus-rules"
+    lsst.io/dir: {{ $ruleDir | quote }}
+    lsst.io/file: {{ $ruleFile | quote }}
+    {{- with $.Values.rules.additionalLabels }}
+    {{- toYaml . | nindent 4 }}
+    {{- end }}
+  {{- with $.Values.rules.additionalAnnotations }}
+  annotations:
+    {{- toYaml . | nindent 4 }}
+  {{- end }}
+spec:
+  {{ $content | toString }}
+{{ end }}
+{{ end }}
diff --git a/fleet/lib/prometheus-alertrules/values.yaml b/fleet/lib/prometheus-alertrules/values.yaml
new file mode 100644
index 000000000..52d53e4a4
--- /dev/null
+++ b/fleet/lib/prometheus-alertrules/values.yaml
@@ -0,0 +1,7 @@
+---
+rules:
+  enabled: true
+  namespace: ~
+  additionalAnnotations: {}
+  additionalLabels:
+    lsst.io/rule: "true"
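To illustrate what the chart renders: with the default values above, a file at `rules/ceph.yaml` becomes a PrometheusRule in the release namespace, while a rules file nested one directory deeper is deployed into the namespace named after that directory (the `rules/rook-ceph/` path below is hypothetical):

```yaml
# rules/ceph.yaml -> PrometheusRule in the release namespace
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alert-ceph
  namespace: lsst-prometheus-alerts
  labels:
    lsst.io/component: "prometheus-rules"
    lsst.io/dir: "rules"
    lsst.io/file: "ceph.yaml"
    lsst.io/rule: "true"

# rules/rook-ceph/ceph.yaml (hypothetical) -> namespace taken from the directory
# metadata:
#   name: alert-ceph
#   namespace: rook-ceph
```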