feat(metrics): add prometheus alert rules template and unit tests (#897)
Signed-off-by: Shichao Nie <[email protected]>
SCNieh authored Mar 7, 2024
1 parent beeb2b1 commit d17278e
Showing 5 changed files with 716 additions and 7 deletions.
57 changes: 57 additions & 0 deletions docker/telemetry/alertmanager/alertmanager.yml
@@ -0,0 +1,57 @@
global:
  resolve_timeout: 5m
# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'job', 'instance']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has been sent successfully, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # A default receiver
  receiver: webhook_receiver

# Inhibition rules allow muting a set of alerts while another alert is firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, job, instance]


receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: '${your_webhook_url}'
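The committed config sends every notification to the single webhook receiver above. If the critical/warning split used by the inhibition rules should also reach different endpoints, the route block can be extended with child routes. The following is only an illustrative sketch, not part of this commit; the critical_webhook receiver and the ${your_critical_webhook_url} placeholder are assumptions.

# Illustrative sketch (not part of this commit): send critical alerts to a
# separate, assumed receiver while everything else falls through to the default.
route:
  group_by: ['alertname', 'job', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: webhook_receiver
  routes:
    - matchers:
        - severity="critical"
      receiver: critical_webhook
      repeat_interval: 1h   # remind about unresolved critical alerts more often

receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: '${your_webhook_url}'
  - name: 'critical_webhook'
    webhook_configs:
      - url: '${your_critical_webhook_url}'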

13 changes: 13 additions & 0 deletions docker/telemetry/docker-compose.yaml
@@ -29,11 +29,24 @@ services:
- "--enable-feature=otlp-write-receiver"
volumes:
- ./prometheus/prometheus.yml:/prometheus/prometheus.yml
- ./prometheus/rules:/prometheus/rules
- ${DATA_PATH}/prometheus/data:/prometheus
depends_on:
- otel-collector
extra_hosts:
- "host.docker.internal:host-gateway"
alertmanager:
image: prom/alertmanager
ports:
- "9087:9087"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- ${DATA_PATH}/alertmanager/data:/etc/alertmanager
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--web.listen-address=:9087"
extra_hosts:
- "host.docker.internal:host-gateway"
otel-collector:
image: otel/opentelemetry-collector-contrib
volumes:
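Because Alertmanager is exposed on the non-default port 9087, a compose healthcheck against its standard /-/healthy endpoint can confirm the container is actually serving. The fragment below is an illustrative sketch, not part of the commit, and assumes the busybox-based prom/alertmanager image provides a wget applet.

  # Illustrative sketch (not part of this commit): extra keys that could be merged
  # into the alertmanager service above, assuming busybox wget is available in the image.
  alertmanager:
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9087/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3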
12 changes: 5 additions & 7 deletions docker/telemetry/prometheus/prometheus.yml
@@ -1,20 +1,18 @@
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  scrape_interval: 30s # Set the scrape interval to every 30 seconds. Default is every 1 minute.
  evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
        - targets: ["host.docker.internal:9087"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/prometheus/rules/alert_rules_template.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
@@ -33,7 +31,7 @@ scrape_configs:
          group: 'prometheus'

  - job_name: "kafka"
    scrape_interval: 5s
    scrape_interval: 30s
    honor_labels: true
    static_configs:
      - targets: ["host.docker.internal:8890"]
122 changes: 122 additions & 0 deletions docker/telemetry/prometheus/rules/alert_rules_template.yml
@@ -0,0 +1,122 @@
# This is the alert rules template for AutoMQ. Please adjust the alert thresholds and evaluation
# periods to your needs before applying it to your production environment.
groups:
  - name: kafka_alerts
    rules:
      - alert: ActiveControllerCount
        expr: sum(kafka_controller_active_count) by (job) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Illegal kafka active controller number for cluster {{ $labels.job }}"
          description: "Current number of active controller is {{ $value }}"

      - alert: KafkaClusterHighBytesInPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="in"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaClusterHighBytesOutPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="out"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighBytesInPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="in"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighBytesOutPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="out"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighProduceRequestRate
        expr: sum(rate(kafka_request_count_total{type="Produce"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of produce requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighFetchRequestRate
        expr: sum(rate(kafka_request_count_total{type="Fetch"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of fetch requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighProduceLatency
        expr: kafka_request_time_99p_milliseconds{type="Produce"} > 100
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of produce request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighFetchLatency
        expr: kafka_request_time_99p_milliseconds{type="Fetch"} > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of fetch request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighErrorRequestRate
        expr: sum(rate(kafka_request_error_count_total{error!="NONE"}[1m])) by (job, instance, error) > 0.1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High Kafka error request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The error request rate of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighPartitionCount
        expr: kafka_partition_count > 5000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many partitions: {{ $value }}."
          description: "The partition count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighConnectionCount
        expr: sum(kafka_server_connection_count) by (job, instance) > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many connections: {{ $value }}."
          description: "The connection count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaGroupHighConsumerLag
        # Per-topic lag: log end offsets minus the group's committed offsets; the
        # on(topic) group_left(consumer_group) join copies the consumer_group label
        # from the committed-offset side into the result.
        expr: sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic)
          - on (topic) group_left (consumer_group) sum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
          description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."
