Skip to content

Commit

Permalink
feat(metrics): add s3 request error rate to alert rules (#898)
Browse files Browse the repository at this point in the history
Signed-off-by: Shichao Nie <[email protected]>
  • Loading branch information
SCNieh authored Mar 7, 2024
1 parent c5b6274 commit a504f97
Show file tree
Hide file tree
Showing 2 changed files with 48 additions and 0 deletions.
9 changes: 9 additions & 0 deletions docker/telemetry/prometheus/rules/alert_rules_template.yml
Original file line number Diff line number Diff line change
Expand Up @@ -120,3 +120,12 @@ groups:
annotations:
summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."

# Alert on a sustained stream of failing S3 requests.
# The expression takes the per-second rate (1m window) of the failed
# S3Request counter, sums it per (job, operation_name), and compares the
# result against 0.1; the condition must hold for 1m before the alert fires.
- alert: KafkaHighS3RequestErrorRate
  expr: >-
    sum(rate(kafka_stream_operation_latency_count{operation_type="S3Request", status="failed"}[1m])) by (job, operation_name) > 0.1
  for: 1m
  labels:
    severity: critical
  annotations:
    # Single-quoted so the inner printf format needs no backslash escapes.
    summary: 'High Kafka S3 request {{ $labels.operation_name }} error rate {{ printf "%0.2f" $value }} req/s for cluster {{ $labels.job }}'
    description: 'The S3 request {{ $labels.operation_name }} error rate of Kafka cluster {{ $labels.job }} is exceeding threshold.'
39 changes: 39 additions & 0 deletions docker/telemetry/prometheus/rules/unit_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -517,3 +517,42 @@ tests:
exp_annotations:
summary: "High group consumer lag 12000 for consumer group test-group in cluster cluster_1 on topic test-topic."
description: "The consumer lag of consumer group test-group in cluster cluster_1 on topic test-topic is exceeding threshold."
# Unit test for the KafkaHighS3RequestErrorRate alert rule.
- interval: 1m
  # One sample per interval, i.e. values at t = 0m, 1m, 2m, 3m, 4m, 5m.
  input_series:
    # Failed S3 requests: carries both labels the alert expression selects on.
    - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="get_object", operation_type="S3Request", status="failed"}'
      values: '0 5 60 180 0 0'
    # Successful S3 requests: status="success" must not contribute.
    - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="get_object", operation_type="S3Request", status="success"}'
      values: '0 100 200 300 400 500'
    # Non-S3 operation: operation_type="S3Stream" must not contribute.
    - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="append", operation_type="S3Stream"}'
      values: '0 100 200 300 400 500'

  # Cases are listed in chronological order. The per-case arithmetic below
  # assumes the 1m rate window spans two consecutive samples -- TODO confirm
  # against the Prometheus version used to run promtool.
  alert_rule_test:
    # 0m: only the initial sample exists, nothing to alert on.
    - eval_time: 0m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts: []
    # 1m: 0 -> 5 failures in a minute (~0.08 req/s), below the 0.1 threshold.
    - eval_time: 1m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts: []
    # 2m: 5 -> 60 failures exceeds the threshold, but the rule's `for: 1m`
    # hold time has not elapsed yet, so no firing alert is expected.
    - eval_time: 2m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts: []
    # 3m: 60 -> 180 failures (120 / 60s = 2.00 req/s) and the threshold has
    # been exceeded since 2m, so the alert is expected to fire.
    - eval_time: 3m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts:
        - exp_labels:
            severity: critical
            job: cluster_1
            operation_name: get_object
          exp_annotations:
            summary: 'High Kafka S3 request get_object error rate 2.00 req/s for cluster cluster_1'
            description: 'The S3 request get_object error rate of Kafka cluster cluster_1 is exceeding threshold.'
    # 4m: the failed counter drops from 180 to 0; no alert is expected.
    - eval_time: 4m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts: []
    # 5m: the failed counter stays at 0; no alert is expected.
    - eval_time: 5m
      alertname: KafkaHighS3RequestErrorRate
      exp_alerts: []

0 comments on commit a504f97

Please sign in to comment.