feat(metrics): add prometheus alert rules template and unit tests (#897)
Signed-off-by: Shichao Nie <[email protected]>
SCNieh authored Mar 7, 2024
1 parent beeb2b1 commit d17278e
Showing 5 changed files with 716 additions and 7 deletions.
57 changes: 57 additions & 0 deletions docker/telemetry/alertmanager/alertmanager.yml
@@ -0,0 +1,57 @@
global:
  resolve_timeout: 5m
# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'job', 'instance']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification.
  # This ensures that multiple alerts for the same group that start firing
  # shortly after one another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has been sent successfully, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # A default receiver
  receiver: webhook_receiver

# Inhibition rules allow muting a set of alerts while another alert is firing.
# We use this to mute any warning-level notifications if the same alert is
# already critical.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, job, instance]


receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: '${your_webhook_url}'
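The committed config sends every notification to the single webhook receiver above. If the critical/warning split used by the inhibition rules should also reach different endpoints, the route block can be extended with child routes. The following is only an illustrative sketch, not part of this commit; the critical_webhook receiver and the ${your_critical_webhook_url} placeholder are assumptions.

# Illustrative sketch (not part of this commit): send critical alerts to a
# separate, assumed receiver while everything else falls through to the default.
route:
  group_by: ['alertname', 'job', 'instance']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: webhook_receiver
  routes:
    - matchers:
        - severity="critical"
      receiver: critical_webhook
      repeat_interval: 1h   # remind about unresolved critical alerts more often

receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: '${your_webhook_url}'
  - name: 'critical_webhook'
    webhook_configs:
      - url: '${your_critical_webhook_url}'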

13 changes: 13 additions & 0 deletions docker/telemetry/docker-compose.yaml
@@ -29,11 +29,24 @@ services:
- "--enable-feature=otlp-write-receiver"
volumes:
- ./prometheus/prometheus.yml:/prometheus/prometheus.yml
- ./prometheus/rules:/prometheus/rules
- ${DATA_PATH}/prometheus/data:/prometheus
depends_on:
- otel-collector
extra_hosts:
- "host.docker.internal:host-gateway"
alertmanager:
image: prom/alertmanager
ports:
- "9087:9087"
volumes:
- ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml
- ${DATA_PATH}/alertmanager/data:/etc/alertmanager
command:
- "--config.file=/etc/alertmanager/alertmanager.yml"
- "--web.listen-address=:9087"
extra_hosts:
- "host.docker.internal:host-gateway"
otel-collector:
image: otel/opentelemetry-collector-contrib
volumes:
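Because Alertmanager is exposed on the non-default port 9087, a compose healthcheck against its standard /-/healthy endpoint can confirm the container is actually serving. The fragment below is an illustrative sketch, not part of the commit, and assumes the busybox-based prom/alertmanager image provides a wget applet.

  # Illustrative sketch (not part of this commit): extra keys that could be merged
  # into the alertmanager service above, assuming busybox wget is available in the image.
  alertmanager:
    healthcheck:
      test: ["CMD", "wget", "--spider", "-q", "http://localhost:9087/-/healthy"]
      interval: 30s
      timeout: 5s
      retries: 3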
12 changes: 5 additions & 7 deletions docker/telemetry/prometheus/prometheus.yml
@@ -1,20 +1,18 @@
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  scrape_interval: 30s # Set the scrape interval to every 30 seconds. Default is every 1 minute.
  evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
          # - alertmanager:9093
        - targets: ["host.docker.internal:9087"]

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/prometheus/rules/alert_rules_template.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
@@ -33,7 +31,7 @@ scrape_configs:
          group: 'prometheus'

  - job_name: "kafka"
    scrape_interval: 5s
    scrape_interval: 30s
    honor_labels: true
    static_configs:
      - targets: ["host.docker.internal:8890"]
122 changes: 122 additions & 0 deletions docker/telemetry/prometheus/rules/alert_rules_template.yml
@@ -0,0 +1,122 @@
# This is the alert rules template for AutoMQ. Please adjust the alert thresholds and evaluation
# periods to your needs before applying it to your production environment.
groups:
  - name: kafka_alerts
    rules:
      - alert: ActiveControllerCount
        expr: sum(kafka_controller_active_count) by (job) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Illegal kafka active controller number for cluster {{ $labels.job }}"
          description: "Current number of active controller is {{ $value }}"

      - alert: KafkaClusterHighBytesInPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="in"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaClusterHighBytesOutPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="out"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighBytesInPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="in"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighBytesOutPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="out"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighProduceRequestRate
        expr: sum(rate(kafka_request_count_total{type="Produce"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of produce requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighFetchRequestRate
        expr: sum(rate(kafka_request_count_total{type="Fetch"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of fetch requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighProduceLatency
        expr: kafka_request_time_99p_milliseconds{type="Produce"} > 100
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of produce request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighFetchLatency
        expr: kafka_request_time_99p_milliseconds{type="Fetch"} > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of fetch request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighErrorRequestRate
        expr: sum(rate(kafka_request_error_count_total{error!="NONE"}[1m])) by (job, instance, error) > 0.1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High Kafka error request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The error request rate of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighPartitionCount
        expr: kafka_partition_count > 5000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many partitions: {{ $value }}."
          description: "The partition count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaBrokerHighConnectionCount
        expr: sum(kafka_server_connection_count) by (job, instance) > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many connections: {{ $value }}."
          description: "The connection count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold."

      - alert: KafkaGroupHighConsumerLag
        # Per-topic lag: log end offsets minus the group's committed offsets; the
        # on(topic) group_left(consumer_group) join copies the consumer_group label
        # from the committed-offset side into the result.
        expr: sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic)
          - on (topic) group_left (consumer_group) sum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
          description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."
