Commit 7144c21

chore: add doc to deploy plg stack and update podMonitor, add alert rules

shanshanying committed Feb 7, 2025
1 parent 2bd2dae commit 7144c21

Showing 23 changed files with 606 additions and 30 deletions.
10 changes: 5 additions & 5 deletions addons/mysql/dashboards/mysql.json
@@ -5271,8 +5271,8 @@
       {
         "current": {
           "selected": true,
-          "text": "oteld-app-metrics",
-          "value": "oteld-app-metrics"
+          "text": "kubeblocks",
+          "value": "kubeblocks"
         },
         "hide": 0,
         "includeAll": false,
@@ -5282,11 +5282,11 @@
         "options": [
           {
             "selected": true,
-            "text": "oteld-app-metrics",
-            "value": "oteld-app-metrics"
+            "text": "kubeblocks",
+            "value": "kubeblocks"
           }
         ],
-        "query": "oteld-app-metrics",
+        "query": "kubeblocks",
         "queryValue": "",
         "skipUrlSync": false,
         "type": "custom"
3 changes: 1 addition & 2 deletions examples/apecloud-mysql/pod-monitor.yaml
@@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
   name: acmysql-cluster-pod-monitor
-  namespace: monitoring # Note: this is namespace for prometheus operator
   labels: # this is labels set in `prometheus.spec.podMonitorSelector`
     release: prometheus
 spec:
-  jobLabel: kubeblocks-service
+  jobLabel: app.kubernetes.io/managed-by
   # defines the labels which are transferred from the
   # associated Kubernetes `Pod` object onto the ingested metrics
   # set the labels w.r.t. your own needs
3 changes: 1 addition & 2 deletions examples/clickhouse/pod-monitor.yaml
@@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
   name: clickhouse-pod-monitor
-  namespace: monitoring # Note: this is namespace for prometheus operator
   labels: # this is labels set in `prometheus.spec.podMonitorSelector`
     release: prometheus
 spec:
-  jobLabel: kubeblocks-service
+  jobLabel: app.kubernetes.io/managed-by
   # defines the labels which are transferred from the
   # associated Kubernetes `Pod` object onto the ingested metrics
   # set the labels w.r.t. your own needs
57 changes: 57 additions & 0 deletions examples/docs/aws-lb-test.yaml
@@ -0,0 +1,57 @@
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: httpbin
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: httpbin
      version: v1
  template:
    metadata:
      labels:
        app: httpbin
        version: v1
    spec:
      containers:
        - image: docker.io/kennethreitz/httpbin
          imagePullPolicy: IfNotPresent
          name: httpbin
          env:
            - name: GUNICORN_CMD_ARGS
              value: "--capture-output --error-logfile - --access-logfile - --access-logformat '%(h)s %(t)s %(r)s %(s)s Host: %({Host}i)s'"
          ports:
            - containerPort: 80
---
apiVersion: v1
kind: Service
metadata:
  name: httpbin-internal
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: nlb # create a Network Load Balancer
    service.beta.kubernetes.io/aws-load-balancer-internal: "true"
spec:
  type: LoadBalancer # regular k8s Service manifest with type LoadBalancer
  selector:
    app: httpbin
  ports:
    - port: 8080
      targetPort: 80
---
apiVersion: v1
kind: Service
metadata:
  name: httpbin-external
  annotations:
    service.beta.kubernetes.io/aws-load-balancer-type: nlb # create a Network Load Balancer
    service.beta.kubernetes.io/aws-load-balancer-internal: "false"
spec:
  type: LoadBalancer # regular k8s Service manifest with type LoadBalancer
  selector:
    app: httpbin
  ports:
    - port: 8080
      targetPort: 80
91 changes: 91 additions & 0 deletions examples/docs/install-plg-stack.md
@@ -0,0 +1,91 @@
## How to Deploy PLG Stack on Kubernetes

**PLG stack** here refers to Promtail, Loki, and Grafana: Promtail collects logs from container log files and pushes them to the Loki service, which Grafana then queries to display logs in its log panel.

### Install Loki Stack

The Loki stack is a lightweight log aggregation solution from Grafana.
In this tutorial, we will show how to deploy it using the loki-stack Helm chart.

**Step 1.** Add the Grafana Helm chart repository and update it:

```bash
# Add Grafana's Helm chart repository and refresh the local index
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
```

**Step 2.** Install Loki Stack:

If you already have Prometheus and Grafana installed, you may deploy the Loki stack with values such as:

```yaml
# cat values.yaml
loki:
  enabled: true
  url: http://loki-stack.logging:3100
  image:
    tag: 2.9.3 # set image tag to 2.8.10 or higher to fix the issue 'Failed to load log volume for this query'
  persistence:
    enabled: true # set to true to persist logs

promtail:
  enabled: true
  config:
    clients:
      - url: http://loki-stack.logging:3100/loki/api/v1/push # set loki url, don't forget the `namespace` of the loki service
```

```bash
# Deploy the Loki stack to the `logging` namespace; customize values.yaml as needed.
helm upgrade --install loki-stack grafana/loki-stack -n logging --create-namespace -f values.yaml
```

For more details please refer to [loki stack](https://github.com/grafana/helm-charts/tree/main/charts/loki-stack).

> [!IMPORTANT]
> If you deploy the stack with Loki version 2.6.1, you may encounter the error 'Failed to load log volume for this query'.
> To fix it, upgrade Loki to 2.8.10 or higher, as discussed in the [issue](https://github.com/grafana/grafana/issues/84144).

**Step 3.** Check Status:

```bash
kubectl get pods -n logging
```

All the pods should be in the `Running` state.

### Configure Loki in Grafana

#### Step 1. Add Loki Data Source to Grafana

Visit the Grafana dashboard in your browser and go to `Home` -> `Connections` -> `Data Sources` -> `Add new data source` -> `Loki`, then fill in the following details:

- **Name**: Loki
- **URL**: `http://loki-stack.logging:3100/`, where `logging` is the namespace where Loki is deployed.

Click on `Save & Test` to save the data source.

Then go to `Home` -> `Explore`, choose `Loki` as the data source, and filter labels and run queries, say `{namespace="default",stream="stdout"}`, to see the logs.

If you encounter the `Failed to load log volume for this query` error, upgrade Loki to 2.8.10 or higher.
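
A few additional LogQL queries you can try in Explore. These are illustrative sketches; label names such as `namespace` and `container` depend on how Promtail is configured in your cluster:

```logql
# lines from the default namespace containing the substring "error"
{namespace="default"} |= "error"

# Promtail's own logs (assumes a `container` label is attached by the scrape config)
{namespace="logging", container="promtail"}

# per-stream log line rate over the last 5 minutes
rate({namespace="default"}[5m])
```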

#### Step 2. Import a Loki Dashboard for Logs

You can import a Loki dashboard to visualize logs in Grafana or create your own dashboard.

More dashboards can be found at [Grafana Dashboards](https://grafana.com/grafana/dashboards).

### Example: Collect Logs for MySQL Cluster

1. Create a MySQL cluster:

```bash
kubectl create -f examples/mysql/cluster.yaml
```

2. Open Grafana and import a dashboard to visualize logs. For example, you can import the following dashboard:

- <https://grafana.com/grafana/dashboards/16966-container-log-dashboard/>

3. Choose the namespace and stream to filter logs and view them in the log panel.
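
For example, to narrow the log panel to the MySQL cluster created above, a LogQL selector along these lines can be used (the `default` namespace and the `mysql-cluster` pod name prefix are assumptions based on the example manifest; adjust them to match your cluster):

```logql
# stdout logs from pods of the example MySQL cluster (names are assumptions)
{namespace="default", pod=~"mysql-cluster-.*", stream="stdout"}
```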
4 changes: 4 additions & 0 deletions examples/docs/install-prometheus.md
@@ -84,3 +84,7 @@ spec:
    matchLabels:
      release: prometheus # make sure your ServiceMonitor CR labels match the selector
```
When creating a new `PodMonitor` or `ServiceMonitor`, make sure its labels and namespace match the corresponding selectors in the `Prometheus` CR (`podMonitorNamespaceSelector` and `podMonitorSelector`, or `serviceMonitorNamespaceSelector` and `serviceMonitorSelector`).

When creating a new `PrometheusRule`, make sure its labels and namespace match the `ruleNamespaceSelector` and `ruleSelector` settings in the `Prometheus` CR.
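
As a sketch, the relevant selectors live in the `Prometheus` CR itself. The label values below are assumptions; check your own CR (e.g. with `kubectl get prometheus -n monitoring -o yaml`):

```yaml
apiVersion: monitoring.coreos.com/v1
kind: Prometheus
metadata:
  name: prometheus
spec:
  # an empty namespace selector matches monitors in all namespaces
  podMonitorNamespaceSelector: {}
  podMonitorSelector:
    matchLabels:
      release: prometheus # your PodMonitor CRs must carry this label
  ruleNamespaceSelector: {}
  ruleSelector:
    matchLabels:
      release: prometheus # your PrometheusRule CRs must carry this label
```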
1 change: 0 additions & 1 deletion examples/elasticsearch/pod-monitor.yaml
@@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
   name: es-cluster-pod-monitor
-  namespace: monitoring # Note: this is namespace for prometheus operator
   labels: # this is labels set in `prometheus.spec.podMonitorSelector`
     release: prometheus
 spec:
3 changes: 1 addition & 2 deletions examples/etcd/pod-monitor.yaml
@@ -3,11 +3,10 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
   name: etcd-cluster-pod-monitor
-  namespace: monitoring # Note: this is namespace for prometheus operator
   labels: # this is labels set in `prometheus.spec.podMonitorSelector`
     release: prometheus
 spec:
-  jobLabel: kubeblocks-service
+  jobLabel: app.kubernetes.io/managed-by
   # defines the labels which are transferred from the
   # associated Kubernetes `Pod` object onto the ingested metrics
   # set the labels w.r.t. your own needs
1 change: 0 additions & 1 deletion examples/milvus/pod-monitor.yaml
@@ -3,7 +3,6 @@ apiVersion: monitoring.coreos.com/v1
 kind: PodMonitor
 metadata:
   name: milvus-cluster-pod-monitor
-  namespace: monitoring # Note: this is namespace for prometheus operator
   labels: # this is labels set in `prometheus.spec.podMonitorSelector`
     release: prometheus
 spec:
90 changes: 90 additions & 0 deletions examples/mongodb/alert-rules.yaml
@@ -0,0 +1,90 @@
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: mongo-alert-rules
  labels:
    release: prometheus
spec:
  groups:
    - name: MongodbExporter
      rules:
        - alert: MongodbDown
          expr: "max_over_time(mongodb_up[1m]) == 0"
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "MongoDB is Down"
            description: 'MongoDB instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbRestarted
          expr: "mongodb_instance_uptime_seconds < 60"
          for: 0m
          labels:
            severity: info
          annotations:
            summary: "Mongodb has just been restarted (< 60s)"
            description: 'Mongodb has just been restarted {{ $value | printf "%.1f" }} seconds ago\n LABELS = {{ $labels }}'

        - alert: MongodbReplicaMemberUnhealthy
          expr: "max_over_time(mongodb_rs_members_health[1m]) == 0"
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "Mongodb replica member is unhealthy"
            description: 'MongoDB replica member is not healthy\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbReplicationLag
          expr: '(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"}) / 1000 > 10'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "MongoDB replication lag (> 10s)"
            description: 'Mongodb replication lag is more than 10s\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbReplicationHeadroom
          expr: 'sum(avg(mongodb_mongod_replset_oplog_head_timestamp - mongodb_mongod_replset_oplog_tail_timestamp)) - sum(avg(mongodb_rs_members_optimeDate{member_state="PRIMARY"} - on (pod) group_right mongodb_rs_members_optimeDate{member_state="SECONDARY"})) <= 0'
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: "MongoDB replication headroom (< 0)"
            description: 'MongoDB replication headroom is <= 0\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbNumberCursorsOpen
          expr: 'mongodb_ss_metrics_cursor_open{csr_type="total"} > 10 * 1000'
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "MongoDB opened cursors num (> 10k)"
            description: 'Too many cursors opened by MongoDB for clients (> 10k)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbCursorsTimeouts
          expr: "increase(mongodb_ss_metrics_cursor_timedOut[1m]) > 100"
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "MongoDB cursors timeouts (> 100/minute)"
            description: 'Too many cursors are timing out (> 100/minute)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbTooManyConnections
          expr: 'avg by(pod) (rate(mongodb_ss_connections{conn_type="current"}[1m])) / avg by(pod) (sum (mongodb_ss_connections) by(pod)) * 100 > 80'
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: "MongoDB too many connections (> 80%)"
            description: 'Too many connections (> 80%)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}'

        - alert: MongodbVirtualMemoryUsage
          expr: "(sum(mongodb_ss_mem_virtual) BY (pod) / sum(mongodb_ss_mem_resident) BY (pod)) > 100"
          for: 2m
          labels:
            severity: warning
          annotations:
            summary: MongoDB virtual memory usage high
            description: "High memory usage: the quotient of (mem_virtual / mem_resident) is more than 100\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"