diff --git a/.changelog/3528.added.txt b/.changelog/3528.added.txt new file mode 100644 index 0000000000..49082bf701 --- /dev/null +++ b/.changelog/3528.added.txt @@ -0,0 +1 @@ +feat(metrics): allow customizing kubelet metrics \ No newline at end of file diff --git a/deploy/helm/sumologic/README.md b/deploy/helm/sumologic/README.md index fe37b571dd..867f7bba99 100644 --- a/deploy/helm/sumologic/README.md +++ b/deploy/helm/sumologic/README.md @@ -132,7 +132,9 @@ The following table lists the configurable parameters of the Sumo Logic chart an | `sumologic.metrics.collector.otelcol.affinity` | Affinity for the otelcol metrics collector. | `{}` | | `sumologic.metrics.enableDefaultFilters` | Enable default metric filters for Sumo Apps. | `false` | | `sumologic.metrics.collector.otelcol.kubelet.enabled` | Enable collection of kubelet metrics. | `true` | +| `sumologic.metrics.collector.otelcol.kubelet.metricRegex` | Regex for scraped kubelet metrics. | `See [values.yaml]` | | `sumologic.metrics.collector.otelcol.cAdvisor.enabled` | Enable collection of cAdvisor metrics. | `true` | +| `sumologic.metrics.collector.otelcol.cAdvisor.metricRegex` | Regex for scraped cAdvisor metrics. | `See [values.yaml]` | | `sumologic.metrics.collector.otelcol.annotatedPods.enabled` | Enable collection of metrics from Pods annotated with prometheus.io/\* keys. See [help.sumologic.com/docs/send-data/kubernetes/collecting-metrics/](https://help.sumologic.com/docs/send-data/kubernetes/collecting-metrics#application-metrics-are-exposed-one-endpoint-scenario) for more information. | `true` | | `sumologic.metrics.collector.otelcol.allocationStrategy` | Allocation strategy for the scrape target allocator. Valid values are: least-weighted and consistent-hashing. 
See: https://github.com/open-telemetry/opentelemetry-operator/blob/main/docs/api.md#opentelemetrycollectorspectargetallocator | `least-weighted` | | `sumologic.metrics.collector.otelcol.config.merge` | Configuration for otelcol metrics collector, merged with defaults. See also https://github.com/SumoLogic/sumologic-otel-collector/blob/main/docs/configuration.md. | {} | diff --git a/deploy/helm/sumologic/conf/metrics/collector/otelcol/config.yaml b/deploy/helm/sumologic/conf/metrics/collector/otelcol/config.yaml index bc107d6e7b..3df5cd998a 100644 --- a/deploy/helm/sumologic/conf/metrics/collector/otelcol/config.yaml +++ b/deploy/helm/sumologic/conf/metrics/collector/otelcol/config.yaml @@ -126,7 +126,7 @@ receivers: - role: node metric_relabel_configs: - action: keep - regex: (?:kubelet_docker_operations_errors(?:|_total)|kubelet_(?:docker|runtime)_operations_duration_seconds_(?:count|sum)|kubelet_running_(?:container|pod)(?:_count|s)|kubelet_(:?docker|runtime)_operations_latency_microseconds(?:|_count|_sum)) + regex: {{ $collectorConfig.kubelet.metricRegex }} source_labels: [__name__] - action: labeldrop regex: id @@ -164,7 +164,7 @@ receivers: source_labels: [__name__] target_label: job - action: keep - regex: (?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_seconds_total|container_network_receive_bytes_total|container_network_transmit_bytes_total) + regex: {{ $collectorConfig.cAdvisor.metricRegex }} source_labels: [__name__] ## Drop container metrics with container tag set to an empty string: ## these are the pod aggregated container metrics which can be aggregated diff --git a/deploy/helm/sumologic/values.yaml b/deploy/helm/sumologic/values.yaml index d522348053..bafd1b4fa9 100644 --- a/deploy/helm/sumologic/values.yaml +++ b/deploy/helm/sumologic/values.yaml @@ -523,10 +523,12 @@ sumologic: ## Configuration for kubelet metrics kubelet: enabled: true + 
metricRegex: (?:kubelet_docker_operations_errors(?:|_total)|kubelet_(?:docker|runtime)_operations_duration_seconds_(?:count|sum)|kubelet_running_(?:container|pod)(?:_count|s)|kubelet_(?:docker|runtime)_operations_latency_microseconds(?:|_count|_sum)) ## Configuration for cAdvisor metrics cAdvisor: enabled: true + metricRegex: (?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes|container_cpu_cfs_throttled_seconds_total|container_network_receive_bytes_total|container_network_transmit_bytes_total) ## Enable collection of metrics from Pods annotated with prometheus.io/* keys. ## See https://help.sumologic.com/docs/send-data/kubernetes/collecting-metrics#application-metrics-are-exposed-one-endpoint-scenario for more information. diff --git a/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.input.yaml b/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.input.yaml new file mode 100644 index 0000000000..38bca39d18 --- /dev/null +++ b/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.input.yaml @@ -0,0 +1,8 @@ +sumologic: + metrics: + collector: + otelcol: + kubelet: + metricRegex: kubelet_running_pods + cAdvisor: + metricRegex: container_cpu_usage_total diff --git a/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.output.yaml b/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.output.yaml new file mode 100644 index 0000000000..adbf7b0862 --- /dev/null +++ b/tests/helm/testdata/goldenfile/metrics_collector_otc/kubelet.output.yaml @@ -0,0 +1,259 @@ +--- +# Source: sumologic/templates/metrics/collector/otelcol/opentelemetrycollector.yaml +apiVersion: opentelemetry.io/v1alpha1 +kind: OpenTelemetryCollector +metadata: + name: RELEASE-NAME-sumologic-metrics + namespace: sumologic + labels: + sumologic.com/app: otelcol + sumologic.com/component: metrics + chart: "sumologic-%CURRENT_CHART_VERSION%" + release: "RELEASE-NAME" + heritage: "Helm" +
sumologic.com/scrape: "true" +spec: + image: "public.ecr.aws/sumologic/sumologic-otel-collector:0.92.0-sumo-0" + mode: statefulset + replicas: 1 + serviceAccount: RELEASE-NAME-sumologic-metrics + targetAllocator: + enabled: true + filterStrategy: relabel-config + prometheusCR: + enabled: true + scrapeInterval: 30s + serviceMonitorSelector: + release: RELEASE-NAME + podMonitorSelector: + release: RELEASE-NAME + resources: {} + autoscaler: + maxReplicas: 10 + minReplicas: 1 + targetCPUUtilization: 70 + targetMemoryUtilization: 70 + env: + - name: METADATA_METRICS_SVC + valueFrom: + configMapKeyRef: + name: sumologic-configmap + key: metadataMetrics + - name: NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + podAnnotations: + ## The operator adds this annotation by default, but we use our own ServiceMonitor + prometheus.io/scrape: "false" + podSecurityContext: + fsGroup: 999 + ports: + - name: pprof + port: 1777 + resources: + limits: + cpu: 1000m + memory: 2Gi + requests: + cpu: 100m + memory: 768Mi + volumes: + - name: tmp + emptyDir: {} + - name: file-storage + emptyDir: {} + volumeMounts: + - name: tmp + mountPath: /tmp + - name: file-storage + mountPath: /var/lib/storage/otc + config: | + exporters: + otlphttp: + disable_keep_alives: true + endpoint: http://${METADATA_METRICS_SVC}.${NAMESPACE}.svc.cluster.local.:4318 + sending_queue: + num_consumers: 10 + queue_size: 10000 + storage: file_storage + extensions: + file_storage: + compaction: + directory: /tmp + on_rebound: true + directory: /var/lib/storage/otc + timeout: 10s + health_check: {} + pprof: {} + processors: + batch: + send_batch_max_size: 2000 + send_batch_size: 1000 + timeout: 1s + filter/drop_stale_datapoints: + metrics: + datapoint: + - flags == FLAG_NO_RECORDED_VALUE + transform/drop_unnecessary_attributes: + error_mode: ignore + metric_statements: + - context: resource + statements: + - delete_key(attributes, "http.scheme") + - delete_key(attributes, "net.host.name") + - 
delete_key(attributes, "net.host.port") + - delete_key(attributes, "service.instance.id") + - delete_matching_keys(attributes, "k8s.*") + transform/extract_sum_count_from_histograms: + error_mode: ignore + metric_statements: + - context: metric + statements: + - extract_sum_metric(true) where IsMatch(name, "^(apiserver_request_duration_seconds|coredns_dns_request_duration_seconds|kubelet_runtime_operations_duration_seconds)$") + - extract_count_metric(true) where IsMatch(name, "^(apiserver_request_duration_seconds|coredns_dns_request_duration_seconds|kubelet_runtime_operations_duration_seconds)$") + receivers: + prometheus: + config: + global: + scrape_interval: 30s + scrape_configs: + - job_name: pod-annotations + kubernetes_sd_configs: + - role: pod + relabel_configs: + - action: keep + regex: true + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_scrape + - action: replace + regex: (.+) + source_labels: + - __meta_kubernetes_pod_annotation_prometheus_io_path + target_label: __metrics_path__ + - action: replace + regex: ([^:]+)(?::\d+)?;(\d+) + replacement: $1:$2 + source_labels: + - __address__ + - __meta_kubernetes_pod_annotation_prometheus_io_port + target_label: __address__ + - action: replace + regex: (.*) + replacement: $1 + separator: ; + source_labels: + - __metrics_path__ + target_label: endpoint + - action: replace + source_labels: + - __meta_kubernetes_namespace + target_label: namespace + - action: labelmap + regex: __meta_kubernetes_pod_label_(.+) + - action: replace + regex: (.*) + replacement: $1 + separator: ; + source_labels: + - __meta_kubernetes_pod_name + target_label: pod + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + honor_labels: true + job_name: kubelet + kubernetes_sd_configs: + - role: node + metric_relabel_configs: + - action: keep + regex: kubelet_running_pods + source_labels: + - __name__ + - action: labeldrop + regex: id + relabel_configs: + - source_labels: + - 
__meta_kubernetes_node_name + target_label: node + - replacement: https-metrics + target_label: endpoint + - action: replace + source_labels: + - __metrics_path__ + target_label: metrics_path + - action: replace + source_labels: + - __address__ + target_label: instance + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + - authorization: + credentials_file: /var/run/secrets/kubernetes.io/serviceaccount/token + honor_labels: true + job_name: cadvisor + kubernetes_sd_configs: + - role: node + metric_relabel_configs: + - action: replace + regex: .* + replacement: kubelet + source_labels: + - __name__ + target_label: job + - action: keep + regex: container_cpu_usage_total + source_labels: + - __name__ + - action: drop + regex: (?:container_cpu_usage_seconds_total|container_memory_working_set_bytes|container_fs_usage_bytes|container_fs_limit_bytes);$ + source_labels: + - __name__ + - container + - action: labelmap + regex: container_name + replacement: container + - action: drop + regex: POD + source_labels: + - container + - action: labeldrop + regex: (id|name) + metrics_path: /metrics/cadvisor + relabel_configs: + - replacement: https-metrics + target_label: endpoint + - action: replace + source_labels: + - __metrics_path__ + target_label: metrics_path + - action: replace + source_labels: + - __address__ + target_label: instance + scheme: https + tls_config: + ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt + insecure_skip_verify: true + service: + extensions: + - health_check + - pprof + - file_storage + pipelines: + metrics: + exporters: + - otlphttp + processors: + - batch + - filter/drop_stale_datapoints + - transform/extract_sum_count_from_histograms + - transform/drop_unnecessary_attributes + receivers: + - prometheus + telemetry: + logs: + level: info + metrics: + address: 0.0.0.0:8888