From 7144c217405b14afab1225ce2aab112e10e93344 Mon Sep 17 00:00:00 2001 From: Shanshan Date: Wed, 5 Feb 2025 19:29:23 +0800 Subject: [PATCH] chore: add doc to deploy plg stack and update podMonitor, add alert rules --- addons/mysql/dashboards/mysql.json | 10 +- examples/apecloud-mysql/pod-monitor.yaml | 3 +- examples/clickhouse/pod-monitor.yaml | 3 +- examples/docs/aws-lb-test.yaml | 57 ++++++++ examples/docs/install-plg-stack.md | 91 ++++++++++++ examples/docs/install-prometheus.md | 4 + examples/elasticsearch/pod-monitor.yaml | 1 - examples/etcd/pod-monitor.yaml | 3 +- examples/milvus/pod-monitor.yaml | 1 - examples/mongodb/alert-rules.yaml | 90 ++++++++++++ examples/mysql/alert-rules.yaml | 81 +++++++++++ examples/mysql/pod-monitor.yaml | 3 +- examples/oceanbase/pod-monitor.yaml | 3 +- examples/polardbx/pod-monitor.yaml | 1 - examples/postgresql/alert-rules.yaml | 178 +++++++++++++++++++++++ examples/postgresql/pod-monitor.yaml | 3 +- examples/qdrant/pod-monitor.yaml | 3 +- examples/rabbitmq/pod-monitor.yaml | 3 +- examples/redis/alert-rules.yaml | 90 ++++++++++++ examples/redis/pod-monitor.yaml | 3 +- examples/starrocks/pod-monitor.yaml | 1 - examples/tidb/pod-monitor.yaml | 1 - examples/zookeeper/pod-monitor.yaml | 3 +- 23 files changed, 606 insertions(+), 30 deletions(-) create mode 100644 examples/docs/aws-lb-test.yaml create mode 100644 examples/docs/install-plg-stack.md create mode 100644 examples/mongodb/alert-rules.yaml create mode 100644 examples/mysql/alert-rules.yaml create mode 100644 examples/postgresql/alert-rules.yaml create mode 100644 examples/redis/alert-rules.yaml diff --git a/addons/mysql/dashboards/mysql.json b/addons/mysql/dashboards/mysql.json index c3dfd0650..32853e55d 100644 --- a/addons/mysql/dashboards/mysql.json +++ b/addons/mysql/dashboards/mysql.json @@ -5271,8 +5271,8 @@ { "current": { "selected": true, - "text": "oteld-app-metrics", - "value": "oteld-app-metrics" + "text": "kubeblocks", + "value": "kubeblocks" }, "hide": 0, "includeAll": false, @@ -5282,11 +5282,11 @@ "options": [ { "selected": true, - "text": "oteld-app-metrics", - "value": "oteld-app-metrics" + "text": "kubeblocks", + "value": "kubeblocks" } ], - "query": "oteld-app-metrics", + "query": "kubeblocks", "queryValue": "", "skipUrlSync": false, "type": "custom" diff --git a/examples/apecloud-mysql/pod-monitor.yaml b/examples/apecloud-mysql/pod-monitor.yaml index c40679d3d..427f13167 100644 --- a/examples/apecloud-mysql/pod-monitor.yaml +++ b/examples/apecloud-mysql/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: acmysql-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/clickhouse/pod-monitor.yaml b/examples/clickhouse/pod-monitor.yaml index ef9d612d3..48a066ca3 100644 --- a/examples/clickhouse/pod-monitor.yaml +++ b/examples/clickhouse/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: clickhouse-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + 
jobLabel: app.kubernetes.io/managed-by
  # defines the labels which are transferred from the
  # associated Kubernetes `Pod` object onto the ingested metrics
  # set the lables w.r.t you own needs
diff --git a/examples/docs/aws-lb-test.yaml b/examples/docs/aws-lb-test.yaml
new file mode 100644
index 000000000..ddef75b0f
--- /dev/null
+++ b/examples/docs/aws-lb-test.yaml
@@ -0,0 +1,57 @@
+---
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: httpbin
+  namespace: default
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: httpbin
+      version: v1
+  template:
+    metadata:
+      labels:
+        app: httpbin
+        version: v1
+    spec:
+      containers:
+      - image: docker.io/kennethreitz/httpbin
+        imagePullPolicy: IfNotPresent
+        name: httpbin
+        env:
+        - name: GUNICORN_CMD_ARGS
+          value: "--capture-output --error-logfile - --access-logfile - --access-logformat '%(h)s %(t)s %(r)s %(s)s Host: %({Host}i)s}'"
+        ports:
+        - containerPort: 80
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: httpbin-internal
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: nlb # To create Network Load Balancer
+    service.beta.kubernetes.io/aws-load-balancer-internal: "true"
+spec:
+  type: LoadBalancer # Regular k8s Service manifest with type as LoadBalancer
+  selector:
+    app: httpbin
+  ports:
+  - port: 8080
+    targetPort: 80
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: httpbin-external
+  annotations:
+    service.beta.kubernetes.io/aws-load-balancer-type: nlb # To create Network Load Balancer
+    service.beta.kubernetes.io/aws-load-balancer-internal: "false"
+spec:
+  type: LoadBalancer # Regular k8s Service manifest with type as LoadBalancer
+  selector:
+    app: httpbin
+  ports:
+  - port: 8080
+    targetPort: 80
\ No newline at end of file
diff --git a/examples/docs/install-plg-stack.md b/examples/docs/install-plg-stack.md
new file mode 100644
index 000000000..262e897f2
--- /dev/null
+++ b/examples/docs/install-plg-stack.md
@@ -0,0 +1,91 @@
+## How to Deploy PLG Stack on Kubernetes
+
+**PLG stack** here refers to Promtail, Loki and Grafana: Promtail collects logs from Docker containers' log files and pushes them to the Loki service, which Grafana then queries to show logs in the log panel.
+
+### Install Loki Stack
+
+In this tutorial, we will show how to deploy the stack using the loki-stack Helm chart.
+The Loki stack is a lightweight log aggregation solution from Grafana.
+
+**Step 1.** Add the Grafana Helm chart repository and update the repo:
+
+```bash
+# Add Grafana's Helm chart repository and update the repo:
+helm repo add grafana https://grafana.github.io/helm-charts
+helm repo update
+```
+
+**Step 2.** Install Loki Stack:
+
+If you already have Prometheus and Grafana installed, you may deploy the Loki stack with values as follows:
+
+```yaml
+# cat values.yaml
+loki:
+  enabled: true
+  url: http://loki-stack.logging:3100
+  image:
+    tag: 2.9.3 # set image tag to 2.8.10 or higher to fix the issue 'Failed to load log volume for this query'
+  persistence:
+    enabled: true # set to true to persist logs
+
+promtail:
+  enabled: true
+  config:
+    clients:
+      - url: http://loki-stack.logging:3100/loki/api/v1/push # set the Loki url; don't forget the `namespace` of the Loki service
+```
+
+```bash
+# Deploy the Loki stack to the `logging` namespace. Customize the values.yaml as needed.
+helm upgrade --install loki-stack grafana/loki-stack -n logging --create-namespace -f values.yaml
+```
+
+For more details, please refer to the [loki-stack chart](https://github.com/grafana/helm-charts/tree/main/charts/loki-stack).
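+
+If you prefer not to keep a separate values file, the same overrides can be passed to `helm` with `--set` flags. This is only a sketch mirroring the values.yaml above; the key paths are taken from that file and may differ in other chart versions:
+
+```bash
+# Equivalent install without a values file; mirrors the values.yaml shown above.
+helm upgrade --install loki-stack grafana/loki-stack \
+  -n logging --create-namespace \
+  --set loki.image.tag=2.9.3 \
+  --set loki.persistence.enabled=true \
+  --set "promtail.config.clients[0].url=http://loki-stack.logging:3100/loki/api/v1/push"
+```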
+
+> [!IMPORTANT]
+> If you are deploying the stack with Loki version 2.6.1, you may encounter the error 'Failed to load log volume for this query'.
+> To fix the issue, upgrade the Loki version to 2.8.10 or higher, as discussed in this [issue](https://github.com/grafana/grafana/issues/84144).
+
+**Step 3.** Check the status:
+
+```bash
+kubectl get pods -n logging
+```
+
+All the pods should be in the `Running` state.
+
+### Configure Loki in Grafana
+
+#### Step 1. Add Loki Data Source to Grafana
+
+Visit the Grafana dashboard in your browser and go to `Home` -> `Connections` -> `Data Sources` -> `Add new data source` -> `Loki`, then fill in the following details:
+
+- **Name**: Loki
+- **URL**: `http://loki-stack.logging:3100/`, where `logging` is the namespace where Loki is deployed.
+
+Click `Save & Test` to save the data source.
+
+Then click `Home` -> `Explore`, choose `Loki` as the data source, and filter labels and run queries, for example `{namespace="default",stream="stdout"}`, to see the logs.
+
+If you encounter the `Failed to load log volume for this query` error, please upgrade the Loki version to 2.8.10 or higher.
+
+#### Step 2. Import a Loki Dashboard for Logs
+
+You can import a Loki dashboard to visualize logs in Grafana or create your own dashboard.
+
+More dashboards can be found at [Grafana Dashboards](https://grafana.com/grafana/dashboards).
+
+### Example: Collect Logs for MySQL Cluster
+
+1. Create a MySQL cluster:
+
+```bash
+kubectl create -f examples/mysql/cluster.yaml
+```
+
+2. Open Grafana and import a dashboard to visualize logs. For example, you can import the following dashboard:
+
+-
+
+3. You may choose the namespace and stream to filter logs and view them in the log panel.
\ No newline at end of file
diff --git a/examples/docs/install-prometheus.md b/examples/docs/install-prometheus.md
index 1d631e806..76b8e17ba 100644
--- a/examples/docs/install-prometheus.md
+++ b/examples/docs/install-prometheus.md
@@ -84,3 +84,7 @@ spec:
     matchLabels:
       release: prometheus # make sure your ServiceMonitor CR labels matches the selector
 ```
+
+When creating a new `PodMonitor` or `ServiceMonitor`, make sure its labels and namespace match the corresponding settings (`podMonitorNamespaceSelector` and `podMonitorSelector`, `serviceMonitorNamespaceSelector` and `serviceMonitorSelector`) in the `Prometheus` CR.
+
+When creating a new `PrometheusRule`, make sure its labels and namespace match the corresponding settings (`ruleNamespaceSelector` and `ruleSelector`) in the `Prometheus` CR.
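+
+To make this matching concrete, below is a minimal sketch of the relevant selector fields on a `Prometheus` CR. The object name and the `release: prometheus` label are assumptions taken from the examples in this repository; check the values used by your own Prometheus installation.
+
+```yaml
+apiVersion: monitoring.coreos.com/v1
+kind: Prometheus
+metadata:
+  name: prometheus
+spec:
+  # PodMonitor/ServiceMonitor objects are picked up only when both the
+  # namespace selector and the label selector match.
+  podMonitorSelector:
+    matchLabels:
+      release: prometheus        # matches the label used in examples/*/pod-monitor.yaml
+  podMonitorNamespaceSelector: {} # an empty selector means: watch all namespaces
+  serviceMonitorSelector:
+    matchLabels:
+      release: prometheus
+  serviceMonitorNamespaceSelector: {}
+  # PrometheusRule objects (e.g. examples/mysql/alert-rules.yaml) are selected the same way.
+  ruleSelector:
+    matchLabels:
+      release: prometheus
+  ruleNamespaceSelector: {}
+```
+
+You can inspect the selectors of an existing installation with `kubectl get prometheus -A -o yaml` and adjust the labels on your `PodMonitor`, `ServiceMonitor` and `PrometheusRule` objects accordingly.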
diff --git a/examples/elasticsearch/pod-monitor.yaml b/examples/elasticsearch/pod-monitor.yaml index c94453006..62ed4eef0 100644 --- a/examples/elasticsearch/pod-monitor.yaml +++ b/examples/elasticsearch/pod-monitor.yaml @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: es-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: diff --git a/examples/etcd/pod-monitor.yaml b/examples/etcd/pod-monitor.yaml index 37b137dee..cece87faa 100644 --- a/examples/etcd/pod-monitor.yaml +++ b/examples/etcd/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: etcd-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/milvus/pod-monitor.yaml b/examples/milvus/pod-monitor.yaml index 741dbb99d..cd27a09f7 100644 --- a/examples/milvus/pod-monitor.yaml +++ b/examples/milvus/pod-monitor.yaml @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: milvus-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: diff --git a/examples/mongodb/alert-rules.yaml b/examples/mongodb/alert-rules.yaml new file mode 100644 index 000000000..9fd1cbf0f --- /dev/null +++ b/examples/mongodb/alert-rules.yaml @@ -0,0 +1,90 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: mongo-alert-rules + labels: + release: prometheus +spec: + groups: + - name: MongodbExporter + rules: + - alert: MongodbDown + expr: "max_over_time(mongodb_up[1m]) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: "MongoDB is Down" + description: 'MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbRestarted + expr: "mongodb_instance_uptime_seconds < 60" + for: 0m + labels: + severity: info + annotations: + summary: "Mongodb has just been restarted (< 60s)" + description: 'Mongodb has just been restarted {{ $value | printf "%.1f" }} seconds ago\n LABELS = {{ $labels }}' + + - alert: MongodbReplicaMemberUnhealthy + expr: "max_over_time(mongodb_rs_members_health[1m]) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: "Mongodb replica member is unhealthy" + description: 'MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbReplicationLag + expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10' + for: 0m + labels: + severity: critical + annotations: + summary: "MongoDB replication lag (> 10s)" + description: 'Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbReplicationHeadroom + expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right 
mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0' + for: 0m + labels: + severity: critical + annotations: + summary: "MongoDB replication headroom (< 0)" + description: 'MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbNumberCursorsOpen + expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000' + for: 2m + labels: + severity: warning + annotations: + summary: "MongoDB opened cursors num (> 10k)" + description: 'Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbCursorsTimeouts + expr: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100" + for: 2m + labels: + severity: warning + annotations: + summary: "MongoDB cursors timeouts (>100/minute)" + description: 'Too many cursors are timing out (> 100/minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbTooManyConnections + expr: 'avg by(pod) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(pod) (sum (mongodb_ss_connections) by(pod)) * 100 > 80' + for: 2m + labels: + severity: warning + annotations: + summary: "MongoDB too many connections (> 80%)" + description: 'Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}' + + - alert: MongodbVirtualMemoryUsage + expr: "(sum(mongodb_ss_mem_virtual) BY (pod) / sum(mongodb_ss_mem_resident) BY (pod)) > 100" + for: 2m + labels: + severity: warning + annotations: + summary: MongoDB virtual memory usage high + description: "High memory usage: the quotient of (mem_virtual / mem_resident) is more than 100\n VALUE = {{ $value }}\n LABELS = {{ $labels }}" diff --git a/examples/mysql/alert-rules.yaml b/examples/mysql/alert-rules.yaml new file mode 100644 index 000000000..73d07ab21 --- /dev/null +++ b/examples/mysql/alert-rules.yaml @@ -0,0 +1,81 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: mysql-alert-rules + labels: + release: prometheus +spec: + groups: + - name: MysqldExporter + rules: + - alert: MysqlDown + expr: "max_over_time(mysql_up[1m]) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: "MySQL is down" + description: "MySQL is down. (instance: {{ $labels.pod }})" + + - alert: MysqlRestarted + expr: "mysql_global_status_uptime < 60" + for: 0m + labels: + severity: info + annotations: + summary: "MySQL has just been restarted (< 60s)" + description: 'MySQL has just been restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})' + + - alert: MysqlTooManyConnections + expr: "sum(max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 80" + for: 2m + labels: + severity: warning + annotations: + summary: "MySQL has too many connections (> 80%)" + description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in use. (instance: {{ $labels.pod }})' + + - alert: MysqlConnectionErrors + expr: "sum(increase(mysql_global_status_connection_errors_total[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0" + for: 2m + labels: + severity: warning + annotations: + summary: "MySQL connection errors" + description: 'MySQL has connection errors and the value is {{ $value | printf "%.2f" }}. 
(instance: {{ $labels.pod }})' + + - alert: MysqlHighThreadsRunning + expr: "sum(max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 60" + for: 2m + labels: + severity: warning + annotations: + summary: "MySQL high threads running (> 60%)" + description: '{{ $value | printf "%.2f" }} percent of MySQL connections are in running state. (instance: {{ $labels.pod }})' + + - alert: MysqlSlowQueries + expr: "sum(increase(mysql_global_status_slow_queries[1m])) BY (namespace,app_kubernetes_io_instance,pod) > 0" + for: 2m + labels: + severity: info + annotations: + summary: "MySQL slow queries" + description: 'MySQL server has {{ $value | printf "%.2f" }} slow query. (instance: {{ $labels.pod }})' + + - alert: MysqlInnodbLogWaits + expr: "sum(rate(mysql_global_status_innodb_log_waits[5m])) BY (namespace,app_kubernetes_io_instance,pod) > 10" + for: 2m + labels: + severity: warning + annotations: + summary: "MySQL InnoDB log waits (> 10)" + description: 'MySQL innodb log writes stalling and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' + + - alert: MysqlInnodbBufferPoolHits + expr: "sum(rate(mysql_global_status_innodb_buffer_pool_reads[5m]) / rate(mysql_global_status_innodb_buffer_pool_read_requests[5m])) BY (namespace,app_kubernetes_io_instance,pod) * 100 > 5" + for: 2m + labels: + severity: warning + annotations: + summary: "MySQL InnoDB high read requests rate hitting disk (> 5%)" + description: 'High number of logical reads that InnoDB could not satisfy from the buffer pool, and had to read directly from disk. The value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' diff --git a/examples/mysql/pod-monitor.yaml b/examples/mysql/pod-monitor.yaml index 3e05872dd..efa26640a 100644 --- a/examples/mysql/pod-monitor.yaml +++ b/examples/mysql/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: mysql-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/oceanbase/pod-monitor.yaml b/examples/oceanbase/pod-monitor.yaml index 54d21fad5..cf5e02e6d 100644 --- a/examples/oceanbase/pod-monitor.yaml +++ b/examples/oceanbase/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: ob-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/polardbx/pod-monitor.yaml b/examples/polardbx/pod-monitor.yaml index a0c2c054e..30f3bf6c4 100644 --- a/examples/polardbx/pod-monitor.yaml +++ b/examples/polardbx/pod-monitor.yaml @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: pxc-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in 
`prometheus.spec.podMonitorSelector` release: prometheus spec: diff --git a/examples/postgresql/alert-rules.yaml b/examples/postgresql/alert-rules.yaml new file mode 100644 index 000000000..a53a8a62f --- /dev/null +++ b/examples/postgresql/alert-rules.yaml @@ -0,0 +1,178 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: postgresql-alert-rules + labels: + release: prometheus +spec: + groups: + - name: PostgreSQLExporter + rules: + - alert: PostgreSQLDown + expr: "max_over_time(pg_up[1m]) == 0" + for: 0m + labels: + severity: critical + annotations: + summary: "PostgreSQL is down" + description: "PostgreSQL is down. (instance: {{ $labels.pod }})" + + - alert: PostgreSQLRestarted + expr: "time() - pg_postmaster_start_time_seconds < 60" + for: 0m + labels: + severity: info + annotations: + summary: "PostgreSQL has just been restarted (< 60s)" + description: 'PostgreSQL has just been restarted {{ $value | printf "%.1f" }} seconds ago. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLExporterError + expr: "pg_exporter_last_scrape_error > 0" + for: 0m + labels: + severity: warning + annotations: + summary: "PostgreSQL exporter scrape error" + description: 'PostgreSQL exporter has {{ $value | printf "%.2f" }} scrape errors. A query may be buggy in query.yaml. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLTooManySlowQueries + expr: | + max by(namespace,app_kubernetes_io_instance,pod,datname) ( + max_over_time(pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m]) + ) > 60 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL database has high number of slow queries" + description: 'PostgreSQL database has slow queries and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' + + - alert: PostgreSQLTooManyConnections + expr: | + sum by (namespace,app_kubernetes_io_instance,pod) (pg_stat_activity_count{datname!~"template.*"}) + > on(namespace,app_kubernetes_io_instance,pod) + (pg_settings_max_connections - pg_settings_superuser_reserved_connections) * 0.8 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL too many connections (> 80%)" + description: 'PostgreSQL has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLDeadLocks + expr: 'increase(pg_stat_database_deadlocks_total{datname!~"template.*", datname!=""}[2m]) > 5' + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL database has dead locks (> 5)" + description: 'PostgreSQL database has {{ $value | printf "%.2f"}} dead locks. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' + + - alert: PostgreSQLHighRollbackRate + expr: | + rate(pg_stat_database_xact_rollback_total{datname!~"template.*", datname!=""}[2m]) + / + rate(pg_stat_database_xact_commit_total{datname!~"template.*", datname!=""}[2m]) + > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL database has high rollback rate (> 10%)" + description: 'Ratio of transactions being aborted compared to committed is {{ $value | printf "%.2f"}} percent. 
(instance: {{ $labels.pod }}, database: {{ $labels.datname }})' + + - alert: PostgreSQLTooManyLocksAcquired + expr: | + sum by (namespace,app_kubernetes_io_instance,pod) (pg_locks_count) + / on(namespace,app_kubernetes_io_instance,pod) + (pg_settings_max_locks_per_transaction * pg_settings_max_connections) + > 0.2 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL has too many locks acquired (> 20%)" + description: 'Too many locks acquired on the database and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLCacheHitRatio + expr: | + avg by (namespace,app_kubernetes_io_instance,pod,datname) ( + rate(pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m]) + / + ( + rate( + pg_stat_database_blks_hit_total{datname!~"template.*", datname!=""}[2m] + ) + + + rate( + pg_stat_database_blks_read_total{datname!~"template.*", datname!=""}[2m] + ) + ) + ) < 0.9 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL database has low cache hit rate (< 90%)" + description: 'Low cache hit rate and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})' + + - alert: PostgreSQLMaxWriteBufferReached + expr: "rate(pg_stat_bgwriter_maxwritten_clean_total[2m]) > 0" + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL write buffers reached max" + description: 'PostgreSQL background writer stops for max and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLHighWALFilesArchiveErrorRate + expr: | + rate(pg_stat_archiver_failed_count_total[2m]) + / ( + rate(pg_stat_archiver_archived_count_total[2m]) + rate(pg_stat_archiver_failed_count_total[2m]) + ) > 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL has high error rate in WAL files archiver(> 10%)" + description: 'PostgreSQL high error rate in WAL files archiver and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' + + - alert: PostgreSQLTableNotAutoVacuumed + expr: | + (pg_stat_user_tables_last_autovacuum > 0) + and + (time() - pg_stat_user_tables_last_autovacuum) + > 24 * 60 * 60 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: "PostgreSQL table in database has not been auto vacuumed for 10 days" + description: "Table {{ $labels.relname }} in database has not been auto vacuumed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})" + + - alert: PostgreSQLTableNotAutoAnalyzed + expr: | + (pg_stat_user_tables_last_autoanalyze > 0) + and + (time() - pg_stat_user_tables_last_autoanalyze) + > 24 * 60 * 60 * 10 + for: 0m + labels: + severity: warning + annotations: + summary: "PostgreSQL table in database has not been auto analyzed for 10 days" + description: "Table {{ $labels.relname }} in database has not been auto analyzed for 10 days. (instance: {{ $labels.pod }}, database: {{ $labels.datname }})" + + - alert: PostgreSQLTableTooManyDeadTuples + expr: | + (pg_stat_user_tables_n_dead_tup > 10000) + / + (pg_stat_user_tables_n_live_tup + pg_stat_user_tables_n_dead_tup) + >= 0.1 + for: 2m + labels: + severity: warning + annotations: + summary: "PostgreSQL table in database has too many dead tuples (> 10%)" + description: 'Table {{ $labels.relname }} in database dead tuples is too large and the value is {{ $value | printf "%.2f" }} percent. 
(instance: {{ $labels.pod }}, database: {{ $labels.datname }})' diff --git a/examples/postgresql/pod-monitor.yaml b/examples/postgresql/pod-monitor.yaml index e148019a9..5f32a8c03 100644 --- a/examples/postgresql/pod-monitor.yaml +++ b/examples/postgresql/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: pg-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/qdrant/pod-monitor.yaml b/examples/qdrant/pod-monitor.yaml index 5aaa9a032..7629e07cb 100644 --- a/examples/qdrant/pod-monitor.yaml +++ b/examples/qdrant/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: qdrant-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/rabbitmq/pod-monitor.yaml b/examples/rabbitmq/pod-monitor.yaml index 8c27c6171..b8ef638ed 100644 --- a/examples/rabbitmq/pod-monitor.yaml +++ b/examples/rabbitmq/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: rabbitmq-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/redis/alert-rules.yaml b/examples/redis/alert-rules.yaml new file mode 100644 index 000000000..17eb08c99 --- /dev/null +++ b/examples/redis/alert-rules.yaml @@ -0,0 +1,90 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: redis-alert-rules + labels: + release: prometheus +spec: + groups: + - name: RedisExporter + rules: + - alert: RedisDown + expr: "redis_up == 0" + for: 5m + labels: + severity: critical + annotations: + summary: "Redis is down" + description: "Redis is down. (instance: {{ $labels.pod }})" + + - alert: RedisCPUHigh + expr: "(rate(redis_cpu_sys_seconds_total[1m]) + rate(redis_cpu_user_seconds_total[1m])) * 100 > 80" + for: 2m + labels: + severity: warning + annotations: + summary: "Out of CPU (> 80%)" + description: 'Redis is running out of CPU and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' + + - alert: RedisMemoryHigh + expr: "(redis_memory_max_bytes == 0 or redis_memory_used_bytes * 100 / redis_memory_max_bytes) > 90" + for: 5m + labels: + severity: warning + annotations: + summary: "Out of memory (> 90%)" + description: 'Redis is running out of memory and the value is {{ $value | printf "%.2f" }} percent. 
(instance: {{ $labels.pod }})' + + - alert: RedisTooManyConnections + expr: "redis_connected_clients * 100 / redis_config_maxclients > 80" + for: 1m + labels: + severity: warning + annotations: + summary: "Redis has too many connections (> 80%)" + description: 'Redis has too many connections and the value is {{ $value | printf "%.2f" }} percent. (instance: {{ $labels.pod }})' + + - alert: RedisRejectedConnections + expr: "increase(redis_rejected_connections_total[1m]) > 0" + for: 5m + labels: + severity: error + annotations: + summary: "Redis has rejected connections" + description: '{{ $value | printf "%.2f" }} connections to Redis has been rejected. (instance: {{ $labels.pod }})' + + - alert: RedisKeyEviction + expr: "increase(redis_evicted_keys_total[5m]) > 0" + for: 1s + labels: + severity: error + annotations: + summary: "Redis has evicted keys" + description: 'Redis has evicted keys in the last 5 minutes and the value is {{ $value | printf "%.2f" }}. (instance: {{ $labels.pod }})' + + - alert: RedisMissingMaster + expr: 'count by (app_kubernetes_io_instance) (redis_instance_info{role="master"}) < 1' + for: 30s + labels: + severity: critical + annotations: + summary: "Redis missing master" + description: "Redis cluster has no node marked as master." + + - alert: RedisDisconnectedSlaves + expr: "count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1" + for: 0m + labels: + severity: critical + annotations: + summary: "Redis disconnected slaves" + description: "Redis not replicating for all slaves. Consider reviewing the redis replication status. (instance: {{ $labels.pod }})" + + - alert: RedisReplicationBroken + expr: "delta(redis_connected_slaves[1m]) < 0" + for: 0m + labels: + severity: critical + annotations: + summary: "Redis replication broken" + description: "Redis instance lost a slave. 
(instance: {{ $labels.pod }})" diff --git a/examples/redis/pod-monitor.yaml b/examples/redis/pod-monitor.yaml index c55b73419..8d526571d 100644 --- a/examples/redis/pod-monitor.yaml +++ b/examples/redis/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: redis-replication-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs diff --git a/examples/starrocks/pod-monitor.yaml b/examples/starrocks/pod-monitor.yaml index 56b734646..a4a91db41 100644 --- a/examples/starrocks/pod-monitor.yaml +++ b/examples/starrocks/pod-monitor.yaml @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: sr-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: diff --git a/examples/tidb/pod-monitor.yaml b/examples/tidb/pod-monitor.yaml index 79a073d80..a2e0ebb05 100644 --- a/examples/tidb/pod-monitor.yaml +++ b/examples/tidb/pod-monitor.yaml @@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: tidb-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: diff --git a/examples/zookeeper/pod-monitor.yaml b/examples/zookeeper/pod-monitor.yaml index 9b4bb1309..bd7b7ef9f 100644 --- a/examples/zookeeper/pod-monitor.yaml +++ b/examples/zookeeper/pod-monitor.yaml @@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1 kind: PodMonitor metadata: name: zk-cluster-pod-monitor - namespace: monitoring # Note: this is namespace for prometheus operator labels: # this is labels set in `prometheus.spec.podMonitorSelector` release: prometheus spec: - jobLabel: kubeblocks-service + jobLabel: app.kubernetes.io/managed-by # defines the labels which are transferred from the # associated Kubernetes `Pod` object onto the ingested metrics # set the lables w.r.t you own needs