diff --git a/docker/telemetry/prometheus/rules/alert_rules_template.yml b/docker/telemetry/prometheus/rules/alert_rules_template.yml
index 5d27e2602b..d1ea3ddb77 100644
--- a/docker/telemetry/prometheus/rules/alert_rules_template.yml
+++ b/docker/telemetry/prometheus/rules/alert_rules_template.yml
@@ -120,3 +120,14 @@ groups:
         annotations:
           summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}."
           description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold."
+
+      # Fires when failed S3 requests, summed per cluster (job) and operation_name,
+      # exceed 0.1 req/s over a 1m rate window and stay above it for 1m.
+      - alert: KafkaHighS3RequestErrorRate
+        expr: sum(rate(kafka_stream_operation_latency_count{operation_type="S3Request", status="failed"}[1m])) by (job, operation_name) > 0.1
+        for: 1m
+        labels:
+          severity: critical
+        annotations:
+          summary: "High Kafka S3 request {{ $labels.operation_name }} error rate {{ printf \"%0.2f\" $value }} req/s for cluster {{ $labels.job }}"
+          description: "The S3 request {{ $labels.operation_name }} error rate of Kafka cluster {{ $labels.job }} is exceeding threshold."
diff --git a/docker/telemetry/prometheus/rules/unit_tests.yml b/docker/telemetry/prometheus/rules/unit_tests.yml
index 6a2ba2fde5..9210a68af9 100644
--- a/docker/telemetry/prometheus/rules/unit_tests.yml
+++ b/docker/telemetry/prometheus/rules/unit_tests.yml
@@ -517,3 +517,49 @@ tests:
             exp_annotations:
               summary: "High group consumer lag 12000 for consumer group test-group in cluster cluster_1 on topic test-topic."
               description: "The consumer lag of consumer group test-group in cluster cluster_1 on topic test-topic is exceeding threshold."
+  # Test KafkaHighS3RequestErrorRate
+  - interval: 1m
+    # Series data.
+    # Only the first series (operation_type="S3Request", status="failed") matches
+    # the rule's selector; the success and S3Stream series do not.
+    input_series:
+      - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="get_object", operation_type="S3Request", status="failed"}'
+        values: '0 5 60 180 0 0'
+      - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="get_object", operation_type="S3Request", status="success"}'
+        values: '0 100 200 300 400 500'
+      - series: 'kafka_stream_operation_latency_count{job="cluster_1", instance="0", operation_name="append", operation_type="S3Stream"}'
+        values: '0 100 200 300 400 500'
+
+    alert_rule_test:
+      # Test no alert.
+      # Expect no firing alert before 3m (low or no failure rate at 0m-1m, then
+      # the rule's `for: 1m` hold at 2m) and none at 4m-5m, after the failed
+      # series has dropped back to 0.
+      - eval_time: 0m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+      - eval_time: 1m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+      - eval_time: 2m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+      - eval_time: 4m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+      - eval_time: 5m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+      # Test alert.
+      # Between 2m and 3m the failed counter goes 60 -> 180: 120 failures in 60s,
+      # i.e. the 2.00 req/s expected in the summary annotation.
+      - eval_time: 3m
+        alertname: KafkaHighS3RequestErrorRate
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              job: cluster_1
+              operation_name: get_object
+            exp_annotations:
+              summary: "High Kafka S3 request get_object error rate 2.00 req/s for cluster cluster_1"
+              description: "The S3 request get_object error rate of Kafka cluster cluster_1 is exceeding threshold."