feat(metrics): add prometheus alert rules template and unit tests (#897)
Signed-off-by: Shichao Nie <[email protected]>
Showing 5 changed files with 716 additions and 7 deletions.
@@ -0,0 +1,57 @@
global:
  resolve_timeout: 5m
# The directory from which notification templates are read.
templates:
  - '/etc/alertmanager/template/*.tmpl'

# The root route on which each incoming alert enters.
route:
  # The labels by which incoming alerts are grouped together. For example,
  # multiple alerts coming in for cluster=A and alertname=LatencyHigh would
  # be batched into a single group.
  #
  # To aggregate by all possible labels use '...' as the sole label name.
  # This effectively disables aggregation entirely, passing through all
  # alerts as-is. This is unlikely to be what you want, unless you have
  # a very low alert volume or your upstream notification system performs
  # its own grouping. Example: group_by: [...]
  group_by: ['alertname', 'job', 'instance']

  # When a new group of alerts is created by an incoming alert, wait at
  # least 'group_wait' before sending the initial notification. This ensures
  # that alerts for the same group that start firing shortly after one
  # another are batched together in the first notification.
  group_wait: 30s

  # Once the first notification has been sent, wait 'group_interval' before
  # sending a batch of new alerts that started firing for that group.
  group_interval: 5m

  # If an alert has been sent successfully, wait 'repeat_interval' before
  # resending it.
  repeat_interval: 3h

  # The default receiver.
  receiver: webhook_receiver

# Inhibition rules allow muting a set of alerts while another alert is
# firing. We use this to mute any warning-level notifications if the same
# alert is already critical.
inhibit_rules:
  - source_matchers: [severity="critical"]
    target_matchers: [severity="warning"]
    # Apply inhibition if the alertname is the same.
    # CAUTION:
    #   If all label names listed in `equal` are missing
    #   from both the source and target alerts,
    #   the inhibition rule will apply!
    equal: [alertname, job, instance]


receivers:
  - name: 'webhook_receiver'
    webhook_configs:
      - url: '${your_webhook_url}'
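For context (not part of this diff): the rules template in the next file only produces notifications once Prometheus loads it and forwards firing alerts to the Alertmanager configured above. A minimal sketch of that wiring in prometheus.yml, assuming the rules file is mounted at /etc/prometheus/rules/ inside the Prometheus container and Alertmanager is reachable at alertmanager:9093 (both the mount path and the target address are illustrative):

# prometheus.yml (sketch, not from this commit)
rule_files:
  - '/etc/prometheus/rules/alert_rules_template.yml'   # assumed mount path of the template below
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']                # assumed Alertmanager address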
122 changes: 122 additions & 0 deletions
docker/telemetry/prometheus/rules/alert_rules_template.yml
@@ -0,0 +1,122 @@
# This is the alert rules template for AutoMQ. Please adjust the alert thresholds and
# evaluation periods to your needs before applying it to your production environment.
groups:
  - name: kafka_alerts
    rules:
      - alert: ActiveControllerCount
        expr: sum(kafka_controller_active_count) by (job) != 1
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Abnormal number of active controllers for Kafka cluster {{ $labels.job }}"
          description: "Current number of active controllers is {{ $value }}"

      - alert: KafkaClusterHighBytesInPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="in"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaClusterHighBytesOutPerSec
        expr: sum(rate(kafka_broker_network_io_bytes_total{direction="out"}[1m])) by (job) > 50 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighBytesInPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="in"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighBytesOutPerSec
        expr: rate(kafka_broker_network_io_bytes_total{direction="out"}[1m]) > 20 * 1024 * 1024
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of bytes per second fetched from Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighProduceRequestRate
        expr: sum(rate(kafka_request_count_total{type="Produce"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of produce requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighFetchRequestRate
        expr: sum(rate(kafka_request_count_total{type="Fetch"}[1m])) by (job, instance) > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The number of fetch requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighProduceLatency
        expr: kafka_request_time_99p_milliseconds{type="Produce"} > 100
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka produce request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of produce request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighFetchLatency
        expr: kafka_request_time_99p_milliseconds{type="Fetch"} > 1000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High Kafka fetch request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The 99th percentile of fetch request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighErrorRequestRate
        expr: sum(rate(kafka_request_error_count_total{error!="NONE"}[1m])) by (job, instance, error) > 0.1
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "High Kafka error request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}"
          description: "The error request rate of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighPartitionCount
        expr: kafka_partition_count > 5000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many partitions: {{ $value }}."
          description: "The partition count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaBrokerHighConnectionCount
        expr: sum(kafka_server_connection_count) by (job, instance) > 1000
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many connections: {{ $value }}."
          description: "The connection count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding the threshold."

      - alert: KafkaGroupHighConsumerLag
        expr: sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic)
          - on (topic) group_left (consumer_group) sum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
          description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding the threshold."