From 4991729ef9bcf1b90be0ff6164ba79deac640b50 Mon Sep 17 00:00:00 2001 From: Kushal Shukla Date: Wed, 18 Sep 2024 23:27:38 +0530 Subject: [PATCH 1/4] Signed-off-by: kushal shukla Replaced Old metrics with the new ones --- .../prombench/benchmark/6_loadgen.yaml | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/prombench/manifests/prombench/benchmark/6_loadgen.yaml b/prombench/manifests/prombench/benchmark/6_loadgen.yaml index 0c3b4d6ff..f70bf0b10 100644 --- a/prombench/manifests/prombench/benchmark/6_loadgen.yaml +++ b/prombench/manifests/prombench/benchmark/6_loadgen.yaml @@ -48,6 +48,30 @@ data: - expr: histogram_quantile(0.99, sum by(path, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(path, method, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(instance, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) + - name: arithmetic operation + interval: 10s + type: instant + queries: + - expr: sum by (container, pod) (rate(container_cpu_usage_seconds_total[5m])) / sum by (container, pod) (container_spec_cpu_quota) + - expr: rate(node_network_receive_bytes_total[5m]) * 60 + - expr: sum(rate(storage_operation_duration_seconds_sum[5m])) + sum(rate(storage_operation_duration_seconds_count[5m])) + - expr: sum(rate(kubelet_runtime_operations_duration_seconds_sum[5m])) by (operation_type) / sum(rate(kubelet_runtime_operations_duration_seconds_count[5m])) by (operation_type) + - name: logic_operator + interval: 10s + type: instant + queries: + - expr: node_filesystem_avail_bytes{mountpoint="/"} and on(instance) node_filesystem_size_bytes{mountpoint="/"} + - expr: container_ulimits_soft{namespace="kube-system"} or container_ulimits_soft{cloud_google_com_gke_nodepool="main-node"} + - expr: container_memory_working_set_bytes{namespace=~"^prombench-[0-9]+"} or container_memory_rss{namespace=~"^prombench-[0-9]+"} + - expr: rate(node_network_receive_bytes_total{device="eth0"}[5m]) unless rate(node_network_receive_packets_dropped_total{device="eth0"}[5m]) + - name: topk + interval: 10s + type: instant + queries: + - expr: topk(3, sum(rate(node_cpu_seconds_total{mode="user"}[5m])) by (instance)) + - expr: topk(5, sum(container_cpu_usage_seconds_total) by (instance)) + - expr: topk(3, sum(container_memory_usage_bytes) by (pod)) + - expr: topk(5, sum(container_sockets) by (namespace)) --- apiVersion: apps/v1 kind: Deployment From 0a30d46bad9be1cea00b6a18fca8d96048398c19 Mon Sep 17 00:00:00 2001 From: Kushal Shukla Date: Tue, 1 Oct 2024 12:11:14 +0530 Subject: [PATCH 2/4] Signed-off-by: Kushal Shukla updated metrics with some heavy count --- .../prombench/benchmark/6_loadgen.yaml | 29 ++++++++++--------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/prombench/manifests/prombench/benchmark/6_loadgen.yaml b/prombench/manifests/prombench/benchmark/6_loadgen.yaml index f70bf0b10..24939050e 100644 --- a/prombench/manifests/prombench/benchmark/6_loadgen.yaml +++ b/prombench/manifests/prombench/benchmark/6_loadgen.yaml @@ -48,30 +48,33 @@ data: - expr: histogram_quantile(0.99, sum by(path, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(path, method, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(instance, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - - name: arithmetic operation + - name: arithmetic_operation interval: 10s type: instant queries: - - expr: sum by (container, pod) (rate(container_cpu_usage_seconds_total[5m])) / sum by (container, pod) (container_spec_cpu_quota) - - expr: rate(node_network_receive_bytes_total[5m]) * 60 - - expr: sum(rate(storage_operation_duration_seconds_sum[5m])) + sum(rate(storage_operation_duration_seconds_count[5m])) - - expr: sum(rate(kubelet_runtime_operations_duration_seconds_sum[5m])) by (operation_type) / sum(rate(kubelet_runtime_operations_duration_seconds_count[5m])) by (operation_type) + - expr: rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]) + 100 + - expr: rate(go_memstats_frees_total[5m]) * 60 + - expr: rate(codelab_api_requests_total{method=~"GET|POST"}[5m]) - rate(codelab_api_request_duration_seconds_sum{method=~"GET|POST"}[5m]) + - expr: rate(go_gc_duration_seconds_sum{job=~"fake-webservers-[0-9]+"}[5m]) / rate(go_gc_duration_seconds_count{job=~"fake-webservers-[0-9]+"}[5m]) + - expr: sum by (instance, job) (rate(codelab_api_request_errors_total[5m])) / sum by (instance, job) (rate(go_memstats_mallocs_total[10m])) - name: logic_operator interval: 10s type: instant queries: - - expr: node_filesystem_avail_bytes{mountpoint="/"} and on(instance) node_filesystem_size_bytes{mountpoint="/"} - - expr: container_ulimits_soft{namespace="kube-system"} or container_ulimits_soft{cloud_google_com_gke_nodepool="main-node"} - - expr: container_memory_working_set_bytes{namespace=~"^prombench-[0-9]+"} or container_memory_rss{namespace=~"^prombench-[0-9]+"} - - expr: rate(node_network_receive_bytes_total{device="eth0"}[5m]) unless rate(node_network_receive_packets_dropped_total{device="eth0"}[5m]) + - expr: codelab_api_request_duration_seconds_bucket{method="GET"} or codelab_api_request_duration_seconds_bucket{method="POST"} + - expr: codelab_api_request_duration_seconds_sum{status="200"} or codelab_api_request_duration_seconds_sum{status="500"} + - expr: codelab_api_request_duration_seconds_bucket{status="200"} and codelab_api_request_duration_seconds_bucket{method="GET"} + - expr: codelab_api_request_duration_seconds_count{method="POST"} and codelab_api_request_duration_seconds_count{status="500"} + - expr: codelab_api_request_duration_seconds_bucket{status="200"} or codelab_api_request_duration_seconds_bucket{method="GET"} - name: topk interval: 10s type: instant queries: - - expr: topk(3, sum(rate(node_cpu_seconds_total{mode="user"}[5m])) by (instance)) - - expr: topk(5, sum(container_cpu_usage_seconds_total) by (instance)) - - expr: topk(3, sum(container_memory_usage_bytes) by (pod)) - - expr: topk(5, sum(container_sockets) by (namespace)) + - expr: topk(2000, sum(rate(go_gc_duration_seconds_count[5m])) by (instance, job)) + - expr: topk(10000, sum(codelab_api_request_duration_seconds_bucket) by (method,job)) + - expr: topk(1000, count(go_threads) by (job, instance)) + - expr: topk(2000, count(codelab_api_http_requests_in_progress) by (job, instance)) + - expr: topk(15000, count(codelab_api_request_duration_seconds_sum) by (job, instance)) --- apiVersion: apps/v1 kind: Deployment From b74ac896fa594072f2f891c3a9a782e5eb806b1e Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Sun, 20 Oct 2024 18:01:08 +0100 Subject: [PATCH 3/4] Trim down newly added queries Slow down arithmetic_operation and logic_operator; take out a few queries to avoid overloading the server. Stop querying `_bucket` series directly; those should be used by `histogram_quantile` or similar. Use more realistic `k` parameters to `topk`. Signed-off-by: Bryan Boreham --- .../prombench/benchmark/6_loadgen.yaml | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/prombench/manifests/prombench/benchmark/6_loadgen.yaml b/prombench/manifests/prombench/benchmark/6_loadgen.yaml index 24939050e..fd20d6e6c 100644 --- a/prombench/manifests/prombench/benchmark/6_loadgen.yaml +++ b/prombench/manifests/prombench/benchmark/6_loadgen.yaml @@ -49,32 +49,25 @@ data: - expr: histogram_quantile(0.99, sum by(path, method, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(instance, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - name: arithmetic_operation - interval: 10s + interval: 30s type: instant queries: - - expr: rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]) + 100 - expr: rate(go_memstats_frees_total[5m]) * 60 - expr: rate(codelab_api_requests_total{method=~"GET|POST"}[5m]) - rate(codelab_api_request_duration_seconds_sum{method=~"GET|POST"}[5m]) - expr: rate(go_gc_duration_seconds_sum{job=~"fake-webservers-[0-9]+"}[5m]) / rate(go_gc_duration_seconds_count{job=~"fake-webservers-[0-9]+"}[5m]) - expr: sum by (instance, job) (rate(codelab_api_request_errors_total[5m])) / sum by (instance, job) (rate(go_memstats_mallocs_total[10m])) - name: logic_operator - interval: 10s + interval: 30s type: instant queries: - - expr: codelab_api_request_duration_seconds_bucket{method="GET"} or codelab_api_request_duration_seconds_bucket{method="POST"} - expr: codelab_api_request_duration_seconds_sum{status="200"} or codelab_api_request_duration_seconds_sum{status="500"} - - expr: codelab_api_request_duration_seconds_bucket{status="200"} and codelab_api_request_duration_seconds_bucket{method="GET"} - - expr: codelab_api_request_duration_seconds_count{method="POST"} and codelab_api_request_duration_seconds_count{status="500"} - - expr: codelab_api_request_duration_seconds_bucket{status="200"} or codelab_api_request_duration_seconds_bucket{method="GET"} + - expr: codelab_api_request_duration_seconds_count{method="POST"} unless codelab_api_request_duration_seconds_count{status="500"} - name: topk interval: 10s type: instant queries: - - expr: topk(2000, sum(rate(go_gc_duration_seconds_count[5m])) by (instance, job)) - - expr: topk(10000, sum(codelab_api_request_duration_seconds_bucket) by (method,job)) - - expr: topk(1000, count(go_threads) by (job, instance)) - - expr: topk(2000, count(codelab_api_http_requests_in_progress) by (job, instance)) - - expr: topk(15000, count(codelab_api_request_duration_seconds_sum) by (job, instance)) + - expr: topk(20, sum(rate(go_gc_duration_seconds_count[5m])) by (instance, job)) + - expr: topk(10, sum(codelab_api_request_duration_seconds_count) by (method,job)) --- apiVersion: apps/v1 kind: Deployment From c66f3655b8f11c923407f23c96c89b18e80af7eb Mon Sep 17 00:00:00 2001 From: Bryan Boreham Date: Sun, 20 Oct 2024 18:03:44 +0100 Subject: [PATCH 4/4] Trim down pre-existing queries For balance, to retain about the same overall load on the server as before. Signed-off-by: Bryan Boreham --- prombench/manifests/prombench/benchmark/6_loadgen.yaml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/prombench/manifests/prombench/benchmark/6_loadgen.yaml b/prombench/manifests/prombench/benchmark/6_loadgen.yaml index fd20d6e6c..86b72d6cb 100644 --- a/prombench/manifests/prombench/benchmark/6_loadgen.yaml +++ b/prombench/manifests/prombench/benchmark/6_loadgen.yaml @@ -20,7 +20,7 @@ data: - expr: codelab_api_http_requests_in_progress - expr: 'codelab_api_requests_total{method="GET",path="/api/bar",status="200"}' - name: aggr_instant - interval: 5s + interval: 15s type: instant queries: - expr: sum by(image) (container_memory_rss) @@ -45,7 +45,6 @@ data: queries: - expr: rate(codelab_api_requests_total{method=~"GET|POST"}[5m]) - expr: sum without(instance) (rate(codelab_api_requests_total{method=~"GET|POST"}[5m])) - - expr: histogram_quantile(0.99, sum by(path, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(path, method, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - expr: histogram_quantile(0.99, sum by(instance, le) (rate(codelab_api_request_duration_seconds_bucket{method="POST"}[5m]))) - name: arithmetic_operation