Skip to content

Commit

Permalink
update prometheus rules with namespace label (#29)
Browse files Browse the repository at this point in the history
  • Loading branch information
ravisingal authored Jul 11, 2021
1 parent d05006b commit 6578e32
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 8 deletions.
15 changes: 8 additions & 7 deletions helm/templates/prometheusrules.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,55 +15,56 @@ spec:
- name: {{ include "kafka.fullname" . }}
rules:
# KafkaDown: fires when a scrape target with job="kafka" and a pod name matching
# .*kafka-[0-9]+ has reported up == 0 for 5 minutes.
# NOTE(review): diff view - presumably the first `expr` is the removed line and
# the second (restricted to the release namespace) is its replacement.
- alert: KafkaDown
expr: up{job="kafka",pod=~".*kafka-[0-9]+"} == 0
expr: up{job="kafka",pod=~".*kafka-[0-9]+",namespace={{ .Release.Namespace | quote }}} == 0
for: 5m
labels:
severity: warning
# Quoted so a numeric- or boolean-looking namespace still renders as a string
# label value, consistent with the quoted matcher in `expr`.
namespace: {{ .Release.Namespace | quote }}
annotations:
summary: "Kafka broker is down"
# The backtick-wrapped action is emitted literally by Helm, leaving the
# $labels.pod placeholder for Prometheus to expand.
message: "Kafka broker is down on {{`{{ $labels.pod }}`}}. Could not scrape jmx-exporter for 5m."
# KafkaOfflinePartitions: fires when the summed offline-partition count stays
# above 0 for 5 minutes.
# NOTE(review): diff view - presumably the first `expr` is the removed line and
# the second (restricted to the release namespace) is its replacement.
- alert: KafkaOfflinePartitions
expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount_value) > 0
expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount_value{namespace={{ .Release.Namespace | quote }}}) > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Kafka cluster has offline partitions"
# The backtick-wrapped action is emitted literally by Helm, leaving the $value
# placeholder for Prometheus to expand.
message: "{{`{{ $value }}`}} partitions in Kafka went offline (have no leader), cluster is probably broken."
# UnderReplicatedPartitions: fires per series when the under-replicated
# partition gauge stays above 0 for 5 minutes (no aggregation, so series labels
# such as pod remain available to the message).
# NOTE(review): diff view - presumably the first `expr` is the removed line and
# the second (restricted to the release namespace) is its replacement.
- alert: UnderReplicatedPartitions
expr: kafka_server_replicamanager_total_underreplicatedpartitions_value > 0
expr: kafka_server_replicamanager_total_underreplicatedpartitions_value{namespace={{ .Release.Namespace | quote }}} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Kafka under replicated partitions"
# The backtick-wrapped actions are emitted literally by Helm, leaving the
# $value and $labels.pod placeholders for Prometheus to expand.
message: "There are {{`{{ $value }}`}} under replicated partitions on {{`{{ $labels.pod }}`}}"
# AbnormalControllerState: fires when the summed active-controller count is
# anything other than exactly 1 for 5 minutes.
# NOTE(review): diff view - presumably the first `expr` is the removed line and
# the second (restricted to the release namespace) is its replacement.
- alert: AbnormalControllerState
expr: sum(kafka_controller_kafkacontroller_activecontrollercount_value) != 1
expr: sum(kafka_controller_kafkacontroller_activecontrollercount_value{namespace={{ .Release.Namespace | quote }}}) != 1
for: 5m
labels:
severity: warning
annotations:
summary: "Kafka abnormal controller state"
# The backtick-wrapped action is emitted literally by Helm, leaving the $value
# placeholder for Prometheus to expand.
message: "There are {{`{{ $value }}`}} active controllers in the cluster"
# UnderMinIsrPartitionCount: fires per series when the under-min-ISR partition
# gauge stays above 0 for 5 minutes (no aggregation, so series labels such as
# pod remain available to the message).
# NOTE(review): diff view - presumably the first `expr` is the removed line and
# the second (restricted to the release namespace) is its replacement.
- alert: UnderMinIsrPartitionCount
expr: kafka_server_replicamanager_total_underminisrpartitioncount_value > 0
expr: kafka_server_replicamanager_total_underminisrpartitioncount_value{namespace={{ .Release.Namespace | quote }}} > 0
for: 5m
labels:
severity: warning
annotations:
summary: "Kafka under min ISR partitions"
# The backtick-wrapped actions are emitted literally by Helm, leaving the
# $value and $labels.pod placeholders for Prometheus to expand.
message: "There are {{`{{ $value }}`}} partitions under the min ISR on {{`{{ $labels.pod }}`}}"
# KafkaBrokerContainersDown: fires when no container_last_seen series exists for
# a kafka container in a pod matching .*kafka-[0-9]+ for 5 minutes.
# NOTE(review): diff view - presumably the first `expr` (container_name /
# pod_name labels) is the removed line and the second (container / pod labels,
# restricted to the release namespace) is its replacement.
- alert: KafkaBrokerContainersDown
expr: absent(container_last_seen{container_name="kafka",pod_name=~".*kafka-[0-9]+"})
expr: absent(container_last_seen{container="kafka",pod=~".*kafka-[0-9]+",namespace={{ .Release.Namespace | quote }}})
for: 5m
labels:
severity: critical
annotations:
# Text uses the Kubernetes state name CrashLoopBackOff, and the stated duration
# matches the 5m `for` window above.
summary: "All kafka containers down or in CrashLoopBackOff status"
message: "All kafka containers have been down or in CrashLoopBackOff status for 5 minutes"
- alert: KafkaContainerRestartedInTheLast5Minutes
expr: count(count_over_time(container_last_seen{container_name="kafka"}[5m])) > 2 * count(container_last_seen{container_name="kafka",pod_name=~".*kafka-[0-9]+"})
expr: count(count_over_time(container_last_seen{container="kafka",namespace={{ .Release.Namespace | quote }}}[5m])) > 2 * count(container_last_seen{container="kafka",pod=~".*kafka-[0-9]+",namespace={{ .Release.Namespace | quote }}})
for: 5m
labels:
severity: warning
Expand Down
1 change: 1 addition & 0 deletions helm/templates/servicemonitor.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ spec:
endpoints:
- port: jmx-prometheus
interval: {{ .Values.servicemonitor.interval }}
scrapeTimeout: {{ .Values.servicemonitor.scrapeTimeout }}
{{- if .Values.servicemonitor.secure }}
scheme: https
tlsConfig:
Expand Down
3 changes: 2 additions & 1 deletion helm/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -139,7 +139,8 @@ listeners:
# Monitoring
servicemonitor:
enabled: false
interval: 15s
interval: 30s
scrapeTimeout: 20s
secure: false
tlsConfig: {}

Expand Down

0 comments on commit 6578e32

Please sign in to comment.