From d17278e390b554c4acd4c32480b64f80dd2343f9 Mon Sep 17 00:00:00 2001 From: Shichao Nie Date: Thu, 7 Mar 2024 16:22:03 +0800 Subject: [PATCH] feat(metrics): add prometheus alert rules template and unit tests (#897) Signed-off-by: Shichao Nie --- .../telemetry/alertmanager/alertmanager.yml | 57 ++ docker/telemetry/docker-compose.yaml | 13 + docker/telemetry/prometheus/prometheus.yml | 12 +- .../prometheus/rules/alert_rules_template.yml | 122 ++++ .../telemetry/prometheus/rules/unit_tests.yml | 519 ++++++++++++++++++ 5 files changed, 716 insertions(+), 7 deletions(-) create mode 100644 docker/telemetry/alertmanager/alertmanager.yml create mode 100644 docker/telemetry/prometheus/rules/alert_rules_template.yml create mode 100644 docker/telemetry/prometheus/rules/unit_tests.yml diff --git a/docker/telemetry/alertmanager/alertmanager.yml b/docker/telemetry/alertmanager/alertmanager.yml new file mode 100644 index 0000000000..4933c2818b --- /dev/null +++ b/docker/telemetry/alertmanager/alertmanager.yml @@ -0,0 +1,57 @@ +global: + resolve_timeout: 5m +# The directory from which notification templates are read. +templates: + - '/etc/alertmanager/template/*.tmpl' + +# The root route on which each incoming alert enters. +route: + # The labels by which incoming alerts are grouped together. For example, + # multiple alerts coming in for cluster=A and alertname=LatencyHigh would + # be batched into a single group. + # + # To aggregate by all possible labels use '...' as the sole label name. + # This effectively disables aggregation entirely, passing through all + # alerts as-is. This is unlikely to be what you want, unless you have + # a very low alert volume or your upstream notification system performs + # its own grouping. Example: group_by: [...] + group_by: ['alertname', 'job', 'instance'] + + # When a new group of alerts is created by an incoming alert, wait at + # least 'group_wait' to send the initial notification. + # This way ensures that you get multiple alerts for the same group that start + # firing shortly after another are batched together on the first + # notification. + group_wait: 30s + + # When the first notification was sent, wait 'group_interval' to send a batch + # of new alerts that started firing for that group. + group_interval: 5m + + # If an alert has successfully been sent, wait 'repeat_interval' to + # resend them. + repeat_interval: 3h + + # A default receiver + receiver: webhook_receiver + +# Inhibition rules allow to mute a set of alerts given that another alert is +# firing. +# We use this to mute any warning-level notifications if the same alert is +# already critical. +inhibit_rules: + - source_matchers: [severity="critical"] + target_matchers: [severity="warning"] + # Apply inhibition if the alertname is the same. + # CAUTION: + # If all label names listed in `equal` are missing + # from both the source and target alerts, + # the inhibition rule will apply! 
+ equal: [alertname, job, instance] + + +receivers: + - name: 'webhook_receiver' + webhook_configs: + - url: '${your_webhook_url}' + diff --git a/docker/telemetry/docker-compose.yaml b/docker/telemetry/docker-compose.yaml index 061521d6e2..2e376753f6 100644 --- a/docker/telemetry/docker-compose.yaml +++ b/docker/telemetry/docker-compose.yaml @@ -29,11 +29,24 @@ services: - "--enable-feature=otlp-write-receiver" volumes: - ./prometheus/prometheus.yml:/prometheus/prometheus.yml + - ./prometheus/rules:/prometheus/rules - ${DATA_PATH}/prometheus/data:/prometheus depends_on: - otel-collector extra_hosts: - "host.docker.internal:host-gateway" + alertmanager: + image: prom/alertmanager + ports: + - "9087:9087" + volumes: + - ./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml + - ${DATA_PATH}/alertmanager/data:/etc/alertmanager + command: + - "--config.file=/etc/alertmanager/alertmanager.yml" + - "--web.listen-address=:9087" + extra_hosts: + - "host.docker.internal:host-gateway" otel-collector: image: otel/opentelemetry-collector-contrib volumes: diff --git a/docker/telemetry/prometheus/prometheus.yml b/docker/telemetry/prometheus/prometheus.yml index c390d2b25d..3ef266bcf2 100644 --- a/docker/telemetry/prometheus/prometheus.yml +++ b/docker/telemetry/prometheus/prometheus.yml @@ -1,20 +1,18 @@ # my global config global: - scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute. - evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute. + scrape_interval: 30s # Set the scrape interval to every 30 seconds. Default is every 1 minute. + evaluation_interval: 30s # Evaluate rules every 30 seconds. The default is every 1 minute. # scrape_timeout is set to the global default (10s). # Alertmanager configuration alerting: alertmanagers: - static_configs: - - targets: - # - alertmanager:9093 + - targets: ["host.docker.internal:9087"] # Load rules once and periodically evaluate them according to the global 'evaluation_interval'. rule_files: - # - "first_rules.yml" - # - "second_rules.yml" + - "/prometheus/rules/alert_rules_template.yml" # A scrape configuration containing exactly one endpoint to scrape: # Here it's Prometheus itself. @@ -33,7 +31,7 @@ scrape_configs: group: 'prometheus' - job_name: "kafka" - scrape_interval: 5s + scrape_interval: 30s honor_labels: true static_configs: - targets: ["host.docker.internal:8890"] diff --git a/docker/telemetry/prometheus/rules/alert_rules_template.yml b/docker/telemetry/prometheus/rules/alert_rules_template.yml new file mode 100644 index 0000000000..5d27e2602b --- /dev/null +++ b/docker/telemetry/prometheus/rules/alert_rules_template.yml @@ -0,0 +1,122 @@ +# This is the alert rules template for AutoMQ, please modify the alert thresholds and period per your needs +# before applying it to your production environment. 
+groups: + - name: kafka_alerts + rules: + - alert: ActiveControllerCount + expr: sum(kafka_controller_active_count) by (job) != 1 + for: 1m + labels: + severity: critical + annotations: + summary: "Illegal kafka active controller number for cluster {{ $labels.job }}" + description: "Current number of active controller is {{ $value }}" + + - alert: KafkaClusterHighBytesInPerSec + expr: sum(rate(kafka_broker_network_io_bytes_total{direction="in"}[1m])) by (job) > 50 * 1024 * 1024 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}" + description: "The number of bytes per second received by Kafka cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaClusterHighBytesOutPerSec + expr: sum(rate(kafka_broker_network_io_bytes_total{direction="out"}[1m])) by (job) > 50 * 1024 * 1024 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for cluster {{ $labels.job }}" + description: "The number of bytes per second fetched from Kafka cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighBytesInPerSec + expr: rate(kafka_broker_network_io_bytes_total{direction="in"}[1m]) > 20 * 1024 * 1024 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka inbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The number of bytes per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighBytesOutPerSec + expr: rate(kafka_broker_network_io_bytes_total{direction="out"}[1m]) > 20 * 1024 * 1024 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka outbound network throughput {{ printf \"%0.f\" $value }} Bytes/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The number of bytes per second fetched from Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighProduceRequestRate + expr: sum(rate(kafka_request_count_total{type="Produce"}[1m])) by (job, instance) > 1000 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka produce request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The number of produce requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighFetchRequestRate + expr: sum(rate(kafka_request_count_total{type="Fetch"}[1m])) by (job, instance) > 1000 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka fetch request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The number of fetch requests per second received by Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." 
+ + - alert: KafkaBrokerHighProduceLatency + expr: kafka_request_time_99p_milliseconds{type="Produce"} > 100 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka produce request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The 99th percentile of produce request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighFetchLatency + expr: kafka_request_time_99p_milliseconds{type="Fetch"} > 1000 + for: 1m + labels: + severity: warning + annotations: + summary: "High Kafka fetch request latency (P99) {{ printf \"%0.2f\" $value }} ms for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The 99th percentile of fetch request latency of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighErrorRequestRate + expr: sum(rate(kafka_request_error_count_total{error!="NONE"}[1m])) by (job, instance, error) > 0.1 + for: 2m + labels: + severity: critical + annotations: + summary: "High Kafka error request rate {{ printf \"%0.2f\" $value }} req/s for broker {{ $labels.instance }} in cluster {{ $labels.job }}" + description: "The error request rate of Kafka broker {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighPartitionCount + expr: kafka_partition_count > 5000 + for: 1m + labels: + severity: critical + annotations: + summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many partitions: {{ $value }}." + description: "The partition count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaBrokerHighConnectionCount + expr: sum(kafka_server_connection_count) by (job, instance) > 1000 + for: 1m + labels: + severity: critical + annotations: + summary: "Kafka node {{ $labels.instance }} in cluster {{ $labels.job }} has too many connections: {{ $value }}." + description: "The connection count of node {{ $labels.instance }} in cluster {{ $labels.job }} is exceeding threshold." + + - alert: KafkaGroupHighConsumerLag + expr: sum(max(kafka_log_end_offset) by (job, topic, partition)) by (job, topic) + - on (topic) group_left (consumer_group) sum(max(kafka_group_commit_offset) by (job, consumer_group, topic, partition)) by (job, consumer_group, topic) > 10000 + for: 1m + labels: + severity: warning + annotations: + summary: "High group consumer lag {{ printf \"%0.f\" $value }} for consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }}." + description: "The consumer lag of consumer group {{ $labels.consumer_group }} in cluster {{ $labels.job }} on topic {{ $labels.topic }} is exceeding threshold." diff --git a/docker/telemetry/prometheus/rules/unit_tests.yml b/docker/telemetry/prometheus/rules/unit_tests.yml new file mode 100644 index 0000000000..6a2ba2fde5 --- /dev/null +++ b/docker/telemetry/prometheus/rules/unit_tests.yml @@ -0,0 +1,519 @@ +# This is the main input for unit testing. +rule_files: + # Alert rules to test + - ./alert_rules_template.yml + +evaluation_interval: 1m + +tests: + # Test ActiveControllerCount + - interval: 1m + # Series data. 
+ input_series: + - series: 'kafka_controller_active_count{job="cluster_1", instance="0"}' + values: '1x5 0x5 1x5' + - series: 'kafka_controller_active_count{job="cluster_1", instance="1"}' + values: '0x10 1x5' + + alert_rule_test: + - eval_time: 5m + alertname: ActiveControllerCount + exp_alerts: + # Test no active controller. + - eval_time: 7m + alertname: ActiveControllerCount + exp_alerts: + - exp_labels: + severity: critical + job: cluster_1 + exp_annotations: + summary: "Illegal kafka active controller number for cluster cluster_1" + description: "Current number of active controller is 0" + # Test more than one active controller. + - eval_time: 13m + alertname: ActiveControllerCount + exp_alerts: + - exp_labels: + severity: critical + job: cluster_1 + exp_annotations: + summary: "Illegal kafka active controller number for cluster cluster_1" + description: "Current number of active controller is 2" + # Test KafkaClusterHighBytesInPerSec + - interval: 1m + # Series data. + input_series: + - series: 'kafka_broker_network_io_bytes_total{direction="in", job="cluster_1", instance="0"}' + values: '0 0 1887436800 1887436800 1887436800 1887436800' # 0 0 1800MB 1800MB 1800MB 1800MB + - series: 'kafka_broker_network_io_bytes_total{direction="in", job="cluster_1", instance="1"}' + values: '0 1572864000 3460300800 7235174400 8074035200 9227468800' # 0 1500MB 3300MB 6900MB 7700MB 8800MB + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + - eval_time: 1m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + - eval_time: 2m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + - eval_time: 4m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + - eval_time: 5m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaClusterHighBytesInPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + exp_annotations: + summary: "High Kafka inbound network throughput 62914560 Bytes/s for cluster cluster_1" + description: "The number of bytes per second received by Kafka cluster cluster_1 is exceeding threshold." + # Test KafkaClusterHighBytesOutPerSec + - interval: 1m + # Series data. + input_series: + - series: 'kafka_broker_network_io_bytes_total{direction="out", job="cluster_1", instance="0"}' + values: '0 0 1887436800 1887436800 1887436800 1887436800' # 0 0 1800MB 1800MB 1800MB 1800MB + - series: 'kafka_broker_network_io_bytes_total{direction="out", job="cluster_1", instance="1"}' + values: '0 1572864000 3460300800 7235174400 8074035200 9227468800' # 0 1500MB 3300MB 6900MB 7700MB 8800MB + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + - eval_time: 1m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + - eval_time: 2m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + - eval_time: 4m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + - eval_time: 5m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaClusterHighBytesOutPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + exp_annotations: + summary: "High Kafka outbound network throughput 62914560 Bytes/s for cluster cluster_1" + description: "The number of bytes per second fetched from Kafka cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighBytesInPerSec + - interval: 1m + # Series data. 
+ input_series: + - series: 'kafka_broker_network_io_bytes_total{direction="in", job="cluster_1", instance="0"}' + values: '0 62914560 125829120 188743680 251658240 314572800' # 0 60MB 120MB 180MB 240MB 300MB + - series: 'kafka_broker_network_io_bytes_total{direction="in", job="cluster_1", instance="1"}' + values: '0 629145600 2516582400 5033164800 7864320000 7864320000' # 0 600MB 2400MB 4800MB 7500MB 7500MB + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + direction: in + exp_annotations: + summary: "High Kafka inbound network throughput 41943040 Bytes/s for broker 1 in cluster cluster_1" + description: "The number of bytes per second received by Kafka broker 1 in cluster cluster_1 is exceeding threshold." + - eval_time: 4m + alertname: KafkaBrokerHighBytesInPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + direction: in + exp_annotations: + summary: "High Kafka inbound network throughput 47185920 Bytes/s for broker 1 in cluster cluster_1" + description: "The number of bytes per second received by Kafka broker 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighBytesOutPerSec + - interval: 1m + # Series data. + input_series: + - series: 'kafka_broker_network_io_bytes_total{direction="out", job="cluster_1", instance="0"}' + values: '0 62914560 125829120 188743680 251658240 314572800' # 0 60MB 120MB 180MB 240MB 300MB + - series: 'kafka_broker_network_io_bytes_total{direction="out", job="cluster_1", instance="1"}' + values: '0 629145600 2516582400 5033164800 7864320000 7864320000' # 0 600MB 2400MB 4800MB 7500MB 7500MB + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + direction: out + exp_annotations: + summary: "High Kafka outbound network throughput 41943040 Bytes/s for broker 1 in cluster cluster_1" + description: "The number of bytes per second fetched from Kafka broker 1 in cluster cluster_1 is exceeding threshold." + - eval_time: 4m + alertname: KafkaBrokerHighBytesOutPerSec + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + direction: out + exp_annotations: + summary: "High Kafka outbound network throughput 47185920 Bytes/s for broker 1 in cluster cluster_1" + description: "The number of bytes per second fetched from Kafka broker 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighProduceRequestRate + - interval: 1m + # Series data. 
+ input_series: + - series: 'kafka_request_count_total{type="Produce", job="cluster_1", instance="0"}' + values: '0 30000 67000 151000 241000 251000' + - series: 'kafka_request_count_total{type="Produce", job="cluster_1", instance="1"}' + values: '0 50 100 160 166 200' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + - eval_time: 3m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaBrokerHighProduceRequestRate + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 0 + exp_annotations: + summary: "High Kafka produce request rate 1500.00 req/s for broker 0 in cluster cluster_1" + description: "The number of produce requests per second received by Kafka broker 0 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighFetchRequestRate + - interval: 1m + # Series data. + input_series: + - series: 'kafka_request_count_total{type="Fetch", job="cluster_1", instance="0"}' + values: '0 30000 67000 151000 241000 251000' + - series: 'kafka_request_count_total{type="Fetch", job="cluster_1", instance="1"}' + values: '0 50 100 160 166 200' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + - eval_time: 3m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaBrokerHighFetchRequestRate + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 0 + exp_annotations: + summary: "High Kafka fetch request rate 1500.00 req/s for broker 0 in cluster cluster_1" + description: "The number of fetch requests per second received by Kafka broker 0 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighProduceLatency + - interval: 1m + # Series data. + input_series: + - series: 'kafka_request_time_99p_milliseconds{type="Produce", job="cluster_1", instance="0"}' + values: '0 10 15 11 12 20' + - series: 'kafka_request_time_99p_milliseconds{type="Produce", job="cluster_1", instance="1"}' + values: '0 16 17 120 170 10' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + - eval_time: 3m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaBrokerHighProduceLatency + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + type: Produce + exp_annotations: + summary: "High Kafka produce request latency (P99) 170.00 ms for broker 1 in cluster cluster_1" + description: "The 99th percentile of produce request latency of Kafka broker 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighFetchLatency + - interval: 1m + # Series data. 
+ input_series: + - series: 'kafka_request_time_99p_milliseconds{type="Fetch", job="cluster_1", instance="0"}' + values: '0 10 15 11 12 20' + - series: 'kafka_request_time_99p_milliseconds{type="Fetch", job="cluster_1", instance="1"}' + values: '0 160 170 1200 1700 100' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + - eval_time: 3m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaBrokerHighFetchLatency + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + instance: 1 + type: Fetch + exp_annotations: + summary: "High Kafka fetch request latency (P99) 1700.00 ms for broker 1 in cluster cluster_1" + description: "The 99th percentile of fetch request latency of Kafka broker 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighErrorRequestRate + - interval: 1m + # Series data. + input_series: + - series: 'kafka_request_error_count_total{error="NONE", job="cluster_1", instance="0"}' + values: '0 0 0 0 0 0' + - series: 'kafka_request_error_count_total{error="NOT_LEADER_OR_FOLLOWER", job="cluster_1", instance="1"}' + values: '0 50 100 160 166 200' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + - eval_time: 4m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaBrokerHighErrorRequestRate + exp_alerts: + - exp_labels: + severity: critical + job: cluster_1 + instance: 1 + error: NOT_LEADER_OR_FOLLOWER + exp_annotations: + summary: "High Kafka error request rate 1.00 req/s for broker 1 in cluster cluster_1" + description: "The error request rate of Kafka broker 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighPartitionCount + - interval: 1m + # Series data. + input_series: + - series: 'kafka_partition_count{job="cluster_1", instance="0"}' + values: '0 100 1000 2000 3000 4000' + - series: 'kafka_partition_count{job="cluster_1", instance="1"}' + values: '0 2000 5000 6000 7000 4000' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + - eval_time: 3m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaBrokerHighPartitionCount + exp_alerts: + - exp_labels: + severity: critical + job: cluster_1 + instance: 1 + exp_annotations: + summary: "Kafka node 1 in cluster cluster_1 has too many partitions: 7000." + description: "The partition count of node 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaBrokerHighConnectionCount + - interval: 1m + # Series data. 
+ input_series: + - series: 'kafka_server_connection_count{job="cluster_1", instance="0", listener="PLAINTEXT"}' + values: '0 100 200 300 400 500' + - series: 'kafka_server_connection_count{job="cluster_1", instance="1", listener="PLAINTEXT"}' + values: '0 100 600 500 300 200' + - series: 'kafka_server_connection_count{job="cluster_1", instance="1", listener="CONTROLLER"}' + values: '0 100 600 800 300 500' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + - eval_time: 1m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + - eval_time: 2m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + - eval_time: 4m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + - eval_time: 5m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + # Test alert. + - eval_time: 3m + alertname: KafkaBrokerHighConnectionCount + exp_alerts: + - exp_labels: + severity: critical + job: cluster_1 + instance: 1 + exp_annotations: + summary: "Kafka node 1 in cluster cluster_1 has too many connections: 1300." + description: "The connection count of node 1 in cluster cluster_1 is exceeding threshold." + # Test KafkaGroupHighConsumerLag + - interval: 1m + # Series data. + input_series: + - series: 'kafka_log_end_offset{job="cluster_1", instance="0", topic="test-topic", partition="0"}' + values: '0 10000 30000 50000 70000 90000' + # Mock metrics stale since 3m on test-topic partition reassignment + - series: 'kafka_log_end_offset{job="cluster_1", instance="0", topic="test-topic", partition="1"}' + values: '0 20000 40000 60000 60000 60000' + - series: 'kafka_log_end_offset{job="cluster_1", instance="1", topic="test-topic", partition="1"}' + values: '0 20000 40000 60000 80000 100000' + # Mock metrics stale since 2m on __consumer_offsets partition reassignment + - series: 'kafka_group_commit_offset{job="cluster_1", instance="1", topic="test-topic", partition="0", consumer_group="test-group"}' + values: '0 9000 9000 9000 9000 9000' + - series: 'kafka_group_commit_offset{job="cluster_1", instance="1", topic="test-topic", partition="1", consumer_group="test-group"}' + values: '0 15000 15000 15000 15000 15000' + - series: 'kafka_group_commit_offset{job="cluster_1", instance="2", topic="test-topic", partition="0", consumer_group="test-group"}' + values: '0 9000 25000 46000 67000 84000' + - series: 'kafka_group_commit_offset{job="cluster_1", instance="2", topic="test-topic", partition="1", consumer_group="test-group"}' + values: '0 15000 38000 52000 71000 99000' + + alert_rule_test: + # Test no alert. + - eval_time: 0m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + - eval_time: 1m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + - eval_time: 2m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + - eval_time: 3m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + - eval_time: 5m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + # Test alert. + - eval_time: 4m + alertname: KafkaGroupHighConsumerLag + exp_alerts: + - exp_labels: + severity: warning + job: cluster_1 + topic: test-topic + consumer_group: test-group + exp_annotations: + summary: "High group consumer lag 12000 for consumer group test-group in cluster cluster_1 on topic test-topic." + description: "The consumer lag of consumer group test-group in cluster cluster_1 on topic test-topic is exceeding threshold."
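
Note on verifying the new files: both the rule syntax and the expectations in unit_tests.yml can be checked with promtool, the command-line tool that ships with Prometheus (it is also bundled in the prom/prometheus image, at /bin/promtool). A minimal sketch, assuming the commands are run from the repository root with promtool installed locally:

    $ cd docker/telemetry/prometheus/rules
    $ promtool check rules alert_rules_template.yml
    $ promtool test rules unit_tests.yml

Without a local install, the same test run can go through the image (the /rules mount path is arbitrary):

    $ docker run --rm -v "$(pwd)/docker/telemetry/prometheus/rules:/rules" -w /rules \
        --entrypoint /bin/promtool prom/prometheus test rules unit_tests.yml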
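
For readers checking the expected annotation values, the cluster-level inbound throughput case at eval_time 3m works out as follows (illustrative arithmetic derived from the input series, not part of the patch). Instance 0 is flat over the last minute, so its rate is 0; instance 1 grows from 3460300800 to 7235174400 bytes, i.e. rate(...[1m]) = (7235174400 - 3460300800) / 60 = 62914560 B/s (60 MiB/s). The sum by job is therefore 62914560 B/s, above the 50 * 1024 * 1024 = 52428800 B/s threshold. The condition first holds at 2m, so with for: 1m the alert is expected to start firing at 3m with exactly the 62914560 value asserted in the exp_annotations.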
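
The KafkaGroupHighConsumerLag expectation at 4m can be reproduced the same way (again illustrative, derived from the input series):

    log end offsets  : max per partition -> p0 = 70000, p1 = max(60000, 80000) = 80000, sum = 150000
    committed offsets: max per partition -> p0 = max(9000, 67000) = 67000, p1 = max(15000, 71000) = 71000, sum = 138000
    lag              : 150000 - 138000 = 12000 > 10000

The inner max() by partition is what discards the stale series the test deliberately leaves behind to mimic partition and __consumer_offsets reassignment, so duplicated stale samples do not inflate the lag, and the on (topic) group_left (consumer_group) join carries the consumer_group label onto the result. The lag first crosses 10000 at 3m (110000 - 98000 = 12000), so with for: 1m the alert fires at 4m.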
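
One deployment caveat: Alertmanager does not expand environment variables in its configuration file, so the ${your_webhook_url} placeholder in alertmanager.yml has to be replaced with a concrete receiver URL before the container starts. A hypothetical substitution step (the envsubst approach and the output file name are only an illustration, not part of the patch):

    $ export your_webhook_url=...   # set to the real receiver endpoint
    $ envsubst '${your_webhook_url}' < alertmanager.yml > alertmanager.local.yml   # mount the generated file instead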