From adc8ed96c12ceab121d6d6367ccfac5044d1497a Mon Sep 17 00:00:00 2001 From: Joshua MacDonald Date: Tue, 13 Aug 2024 15:59:13 -0700 Subject: [PATCH] Update charts and configurations for OTel-Arrow v0.25 with OTel-Contrib v0.107 components --- arrow/config/gateway-collector.yaml | 48 +++++-- arrow/otelcolarrow-build.yaml | 63 +++++---- charts/collector-k8s/Chart.yaml | 4 +- charts/kube-otel-stack/Chart.yaml | 4 +- charts/kube-otel-stack/values.yaml | 195 +++++++++++++++------------- charts/otel-cloud-stack/Chart.yaml | 4 +- charts/otel-cloud-stack/arrow.yaml | 54 -------- charts/otel-cloud-stack/values.yaml | 79 +++++++++-- example/README.md | 16 +-- example/vm/config.yaml | 2 +- 10 files changed, 249 insertions(+), 220 deletions(-) delete mode 100644 charts/otel-cloud-stack/arrow.yaml diff --git a/arrow/config/gateway-collector.yaml b/arrow/config/gateway-collector.yaml index 1166a15..f1d2018 100644 --- a/arrow/config/gateway-collector.yaml +++ b/arrow/config/gateway-collector.yaml @@ -8,7 +8,7 @@ # resources: # requests: # cpu: 2 -# memory: 3Gi +# memory: 4Gi # limits: # cpu: 2 # memory: 8Gi @@ -17,13 +17,15 @@ # # resources: # requests: -# cpu: 8 -# memory: 6Gi +# cpu: 32 +# memory: 48Gi # limits: -# cpu: 8 -# memory: 24Gi +# cpu: 32 +# memory: 64Gi # # In the larger configuration tested, we used `max_in_flight_size_mib: 256`. +# In the smaller configuration, relatively more memory is required due to +# garbage collection costs. receivers: # otelarrow is the OpenTelemetry Protocol with Apache Arrow receiver @@ -60,8 +62,9 @@ processors: send_batch_size: 1000 timeout: 1s - # For larger configurations, consider raising this parameter. - max_in_flight_size_mib: 128 + # Use max_in_flight_size_mib=64 in a 2 cpu configuration; + # Use max_in_flight_size_mib=1024 in a 16 cpu configuration. 
+ max_in_flight_size_mib: 64 exporters: otelarrow: endpoint: ingest.lightstep.com:443 headers: "lightstep-access-token": "${LS_TOKEN}" - arrow: - disabled: false - max_stream_lifetime: 2m - num_streams: 6 - # The pipeline will continue trying requests until they timeout. # Timeout and retry settings are independent. If retry_on_failure # is also enabled, each (retried) request will also have this @@ -86,11 +84,33 @@ exporters: retry_on_failure: enabled: false - # Do not enable the sending queue. The concurrent batch processor - # is a better way to parallelize the export. + # Do not enable the sending queue. The concurrent batch processor, + # configured here, is a better way to parallelize the export. sending_queue: enabled: false + # OTel-Arrow notes: these settings are specific to OTel-Arrow + # To discard this configuration and restore the standard OTLP + # exporter, simply drop the arrow section below and replace + # "otelarrow" by "otlp" as the component name above and in + # service::pipelines::traces::exporters. + arrow: + # This prevents the OTel-Arrow exporter from falling back to + # standard OTLP in case of misconfiguration. + disable_downgrade: true + + # We recommend a small number of streams, since they consume + # substantial resources. More than one stream is recommended + # to help balance load. + # + # Use num_streams=2 in a 2 cpu configuration; + # Use num_streams=12 in a 16 cpu configuration. + num_streams: 2 + + # A stream lifetime limit is required to avoid spurious + # disconnect error messages in the collector logs.
+ max_stream_lifetime: 4m + service: pipelines: traces: diff --git a/arrow/otelcolarrow-build.yaml b/arrow/otelcolarrow-build.yaml index 1aab8e0..8b72998 100644 --- a/arrow/otelcolarrow-build.yaml +++ b/arrow/otelcolarrow-build.yaml @@ -37,13 +37,13 @@ dist: # description: My Organization telemetry collector # name: otelarrowcol - version: 0.24.0 + version: 0.25.0 description: ServiceNow Cloud Observability OpenTelemetry Protocol with Apache Arrow gateway collector # This indicates which version of the core collector components as # well as the builder version. At the time of this writing, we # recommend using at least the current release. - otelcol_version: 0.103.0 + otelcol_version: 0.107.0 # We recommend building in both the OTLP exporter and the OTel-Arrow # exporter. @@ -51,30 +51,28 @@ exporters: # This is the core OpenTelemetry Protocol with Apache Arrow exporter, # recommended for exporting to ServiceNow Cloud Observability using # either OTel-Arrow or standard OTLP over gRPC. - - gomod: github.com/open-telemetry/otel-arrow/collector v0.24.0 - import: github.com/open-telemetry/otel-arrow/collector/exporter/otelarrowexporter + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/exporter/otelarrowexporter v0.107.0 # The following components may be useful for debugging. # As an alternate to the OTel-Arrow exporter, we recommend building - # with the core OTLP exporter as a fallback. These components use - # compatible configuration. - - gomod: go.opentelemetry.io/collector/exporter/otlpexporter v0.103.0 + # with the core OTLP exporter as a fallback. This component and the + # OTel-Arrow exporter use compatible configuration. + - gomod: go.opentelemetry.io/collector/exporter/otlpexporter v0.107.0 # In case OTLP/HTTP export is required, as opposed to the two # gRPC options above. 
- - gomod: go.opentelemetry.io/collector/exporter/otlphttpexporter v0.103.0 + - gomod: go.opentelemetry.io/collector/exporter/otlphttpexporter v0.107.0 # In case you want to record an OTLP telemetry session to a JSON # file, we recommend this additional utility. See also the # obfuscation processor, listed below. The OTel-Arrow provides # offline tools that can help explain poor compression performance # using inputs generated by this exporter, for example. - - gomod: github.com/open-telemetry/otel-arrow/collector v0.24.0 - import: github.com/open-telemetry/otel-arrow/collector/exporter/fileexporter + - gomod: github.com/open-telemetry/otel-arrow/collector/exporter/fileexporter v0.25.0 # The debug exporter, useful for printing telemetry to the console. - - gomod: go.opentelemetry.io/collector/exporter/debugexporter v0.103.0 + - gomod: go.opentelemetry.io/collector/exporter/debugexporter v0.107.0 receivers: # The OTel-Arrow receiver supports multiple protocols including OTel-Arrow, OTLP gRPC, @@ -84,60 +82,57 @@ receivers: # This enables other OTel Collectors or experimental SDKs that support OTel-Arrow to # send to this collector, such as this one for the OTel-Go Trace SDK: # https://github.com/lightstep/otel-launcher-go/tree/main/lightstep/sdk/trace/exporters/otlp/otelcol - - gomod: github.com/open-telemetry/otel-arrow/collector v0.24.0 - import: github.com/open-telemetry/otel-arrow/collector/receiver/otelarrowreceiver + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/otelarrowreceiver v0.107.0 # To support receiving OTLP/HTTP. - - gomod: go.opentelemetry.io/collector/receiver/otlpreceiver v0.103.0 + - gomod: go.opentelemetry.io/collector/receiver/otlpreceiver v0.107.0 # You may wish to enable other receivers from the Collector-Contrib repository or # elsewhere. 
Here are some that might be useful: # - # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.103.0 - # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver v0.103.0 - # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver v0.103.0 - # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/k8sclusterreceiver v0.103.0 + # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.107.0 + # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver v0.107.0 + # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver v0.107.0 + # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/k8sclusterreceiver v0.107.0 processors: # We recommend use of the OTel-Arrow concurrent batch processor. - - gomod: github.com/open-telemetry/otel-arrow/collector v0.24.0 - import: github.com/open-telemetry/otel-arrow/collector/processor/concurrentbatchprocessor + - gomod: github.com/open-telemetry/otel-arrow/collector/processor/concurrentbatchprocessor v0.25.0 # We recommend building with the follow processor for obfuscation, in case you want to # record telemetry sessions for offline analysis. - - gomod: github.com/open-telemetry/otel-arrow/collector v0.24.0 - import: github.com/open-telemetry/otel-arrow/collector/processor/obfuscationprocessor + - gomod: github.com/open-telemetry/otel-arrow/collector/processor/obfuscationprocessor v0.25.0 # We emphatically DO NOT recommend use of the memory limiter # processor or the core batch processor, i.e., do not build with # either of: # - go.opentelemetry.io/collector/processor/memorylimiterprocessor. 
- # - gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.103.0 + # - gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.107.0 # These Collector-Contrib components are referred to in the charts in this package. - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourceprocessor v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/remotetapprocessor v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor v0.103.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourcedetectionprocessor v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/k8sattributesprocessor v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/resourceprocessor v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/metricstransformprocessor v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/remotetapprocessor v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/transformprocessor v0.107.0 # ServiceNow does not at this time recommend the probabilistic sampler processor, # because it leads to undercounting of spans. We will update this recommendation # when the sampler is fully supported. 
- - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/probabilisticsamplerprocessor v0.103.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/probabilisticsamplerprocessor v0.107.0 # You may be interested in other transform components in the # Collector-Contrib repository. Here are some that might be useful: # - # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.103.0 + # - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.107.0 extensions: # You may be interested in externsions from the Collector-Contrib repository. None are # required. Here are some that might be useful: - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/healthcheckextension v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/pprofextension v0.103.0 - - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/opampextension v0.103.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/healthcheckextension v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/pprofextension v0.107.0 + - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/extension/opampextension v0.107.0 # We do not support use of headersetterextension to apply the lightstep-access-token # header on a per-request basis. Contact ServiceNow if you are interested in diff --git a/charts/collector-k8s/Chart.yaml b/charts/collector-k8s/Chart.yaml index cde5855..9be8de7 100644 --- a/charts/collector-k8s/Chart.yaml +++ b/charts/collector-k8s/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: otelcollector description: Deprecated chart for using the OpenTelemetry Collector to scape static or dynamic metric targets. 
type: application -version: 0.3.2 -appVersion: 0.91.1 +version: 0.3.3 +appVersion: 0.106.1 deprecated: true dependencies: [] # cert manager must be manually installed because it has CRDs diff --git a/charts/kube-otel-stack/Chart.yaml b/charts/kube-otel-stack/Chart.yaml index 392ab39..d623adf 100644 --- a/charts/kube-otel-stack/Chart.yaml +++ b/charts/kube-otel-stack/Chart.yaml @@ -2,8 +2,8 @@ apiVersion: v2 name: kube-otel-stack description: Chart for sending Kubernetes metrics to Lightstep using the OpenTelemetry Operator. type: application -version: 0.8.0 -appVersion: 0.92.0 +version: 0.9.0 +appVersion: 0.106.1 dependencies: # cert manager must be manually installed because it has CRDs # https://github.com/kubernetes-sigs/security-profiles-operator/issues/1062 diff --git a/charts/kube-otel-stack/values.yaml b/charts/kube-otel-stack/values.yaml index fc0c750..9c9a21a 100644 --- a/charts/kube-otel-stack/values.yaml +++ b/charts/kube-otel-stack/values.yaml @@ -54,26 +54,36 @@ autoinstrumentation: collectors: [] ## Default collector for tracing +## +## ServiceNow recommends the OTel-Arrow exporter for sending data to +## Cloud Observability. tracesCollector: enabled: false name: traces clusterName: "" - - image: otel/opentelemetry-collector-contrib:0.105.0 + image: "ghcr.io/lightstep/otel-collector-charts/otelarrowcol-experimental:latest" mode: deployment - replicas: 1 hpa: - enabled: false - minReplicas: 3 - maxReplicas: 10 - targetMemoryUtilization: 70 + minReplicas: 1 + maxReplicas: 3 + targetMemoryUtilization: 60 resources: - limits: - cpu: 250m - memory: 250Mi + # OTel-Arrow gateways with this resource configuration have been + # exercised at approximately 25,000 spans per second per cpu core + # in our internal production setup, for reference. Thus, a 2 cpu + # instance may be expected to process 50,000 spans/sec. 
+ # + # When scaling these values, raise or lower the following settings + # proportionally: + # + # - concurrentbatch::max_in_flight_size_mib + # - exporters::otelarrow::arrow::num_streams requests: - cpu: 250m - memory: 250Mi + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 8Gi env: - name: LS_TOKEN valueFrom: @@ -89,51 +99,59 @@ tracesCollector: http: endpoint: "0.0.0.0:4318" processors: - # We recommend use of the batch processor. We recommend the settings - # below for traces. - # - # Note: We are aware of ongoing efforts within OpenTelemetry to - # configure batching in the exporter, where it is possible to - # configure batch size limits in terms of bytes, instead of items. - # We will update these recommendations when batching by size is - # available. - batch: - # In this example, the processor will wait to accumulate at least - # 1000 spans for up to 1 second, then flush the batch. In cases - # where the arriving data is already batched, such that combining - # the pending batch with the arriving data would exceed 1500 - # items, then 1500 items will be sent by splitting the data. - # - # Note: the batch processor has a side-effect of returning success - # to the producer, before waiting for the consumer to respond. - # This is appropriate default in most cases, it means that SDKs - # sending to the gateway will not see or report errors. - # - # The batch processor responds to "back-pressure" from the - # exporter, meaning it is never directly responsible for dropping - # spans. Note that our current recommendation for exporter - # settings does not respond with back-pressure to the batch - # processor. Due to exporter settings, this collector - # configuration will drop data when the ServiceNow service is - # (intentionally or accidentally) refusing data, instead of - # applying pressure backward, discussed in the `exporters` - # section. 
- send_batch_size: 1000 - send_batch_max_size: 1500 - timeout: 1s resourcedetection/env: detectors: [env] timeout: 2s override: false + concurrentbatch: + send_batch_size: 1000 + timeout: 1s + send_batch_max_size: 1500 + + # Use max_in_flight_size_mib=64 in a 2 cpu configuration; + # Use max_in_flight_size_mib=1024 in a 16 cpu configuration. + max_in_flight_size_mib: 64 k8sattributes: passthrough: false pod_association: + - sources: + - from: resource_attribute + name: k8s.pod.uid - sources: - from: resource_attribute name: k8s.pod.name + - from: resource_attribute + name: k8s.namespace.name + - from: resource_attribute + name: k8s.node.name + - sources: + - from: resource_attribute + name: k8s.pod.ip + - sources: + - from: resource_attribute + name: k8s.pod.name + - from: resource_attribute + name: k8s.namespace.name + - sources: + - from: connection extract: + labels: + - tag_name: service.name + key: app.kubernetes.io/name + from: pod + - tag_name: service.name + key: k8s-app + from: pod + - tag_name: k8s.app.instance + key: app.kubernetes.io/instance + from: pod + - tag_name: service.version + key: app.kubernetes.io/version + from: pod + - tag_name: k8s.app.component + key: app.kubernetes.io/component + from: pod metadata: - - k8s.cluster.uid - k8s.namespace.name - k8s.pod.name - k8s.pod.uid @@ -146,75 +164,68 @@ tracesCollector: - k8s.daemonset.uid - k8s.job.name - k8s.job.uid + - k8s.container.name - k8s.cronjob.name - k8s.statefulset.name - k8s.statefulset.uid - container.image.tag - container.image.name - resource: - attributes: - - key: lightstep.helm_chart - value: kube-otel-stack - action: insert + - k8s.cluster.uid exporters: - otlp: + otelarrow: endpoint: ingest.lightstep.com:443 headers: "lightstep-access-token": "${LS_TOKEN}" - # Queue settings are required. 
It does not make sense to use - # the exporter without a queue, it has to do with - # requiring the "num_consumers" limit configured in this - # section (i.e., we require a queue in order to limit the - # number of concurrent exports). - # - # Note that the queue settings are applied in unit-terms - # produced by the batch processor, so a number like 100 means - # the queue has support for 100 pre-batched items. With up to - # 1500 spans each (from the batch processor), this - # configuration allows 150,000 spans to occupy memory. - sending_queue: - enabled: true - num_consumers: 4 - queue_size: 100 - - # Retry settings are optional. - # - # Note that while retries are attempted, this component will - # begin to drop arriving data if the queue is not large - # enough. + # The pipeline will continue trying requests until they timeout. + # Timeout and retry settings are independent. If retry_on_failure + # is also enabled, each (retried) request will also have this + # timeout. + timeout: 30s + + # Retries are disabled by default. Since the most likely reason + # for failure is timeout, having retry-on-failure enabled implies + # dedicating a significant amount of additional memory to the task. retry_on_failure: - # We recommend disabling retries, since while the export is - # blocked it is likely that arriving spans will drop, and - # Otherwise, collectors will need substantial additional - # memory to survive transient failures. Nevertheless, we - # recommend a limited retry policy to gracefully occasional - # failures, paired with a modest queue size. - # - # Note there is a persistent storage option inherited from a - # common collector component. When persistent storage is - # configured, the default retry configuration is sensible. + enabled: false + + # Do not enable the sending queue. The concurrent batch processor + # is a better way to parallelize exports. 
+ sending_queue: + enabled: false + + # OTel-Arrow notes: these settings are specific to OTel-Arrow + # To discard this configuration and restore the standard OTLP + # exporter, simply drop the arrow section below and replace + # "otelarrow" by "otlp" as the component name above and in + # service::pipelines::traces::exporters. + arrow: + # This prevents the OTel-Arrow exporter from falling back to + # standard OTLP in case of misconfiguration. + disable_downgrade: true + + # We recommend a small number of streams, since they consume + # substantial resources. More than one stream is recommended + # to help balance load. # - # For more details on retry and queue settings, please refer to - # https://github.com/open-telemetry/opentelemetry-collector/blob/main/exporter/exporterhelper/README.md - enabled: true - max_elapsed_time: 60s + # Use num_streams=2 in a 2 cpu configuration; + # Use num_streams=12 in a 16 cpu configuration. + num_streams: 2 - # While we expect latency under one second, typically, we - # recommend a longer timeout than the default. - timeout: 30s + # A stream lifetime limit is required to avoid spurious + # disconnect error messages in the collector logs. + max_stream_lifetime: 4m service: pipelines: traces: receivers: [otlp] processors: - - resource - resourcedetection/env - k8sattributes - - batch - exporters: [otlp] + - concurrentbatch + exporters: [otelarrow] ## Default collector for metrics (includes infrastructure metrics) metricsCollector: diff --git a/charts/otel-cloud-stack/Chart.yaml b/charts/otel-cloud-stack/Chart.yaml index ee6710f..44a5db8 100644 --- a/charts/otel-cloud-stack/Chart.yaml +++ b/charts/otel-cloud-stack/Chart.yaml @@ -15,10 +15,10 @@ type: application # This is the chart version. This version number should be incremented each time you make changes # to the chart and its templates, including the app version. 
# Versions are expected to follow Semantic Versioning (https://semver.org/) -version: "0.7.0" +version: "0.8.0" # This is the version number of the application being deployed. This version number should be # incremented each time you make changes to the application. Versions are not expected to # follow Semantic Versioning. They should reflect the version the application is using. # It is recommended to use it with quotes. -appVersion: "1.19.0" +appVersion: "1.30.0" diff --git a/charts/otel-cloud-stack/arrow.yaml b/charts/otel-cloud-stack/arrow.yaml deleted file mode 100644 index e26153b..0000000 --- a/charts/otel-cloud-stack/arrow.yaml +++ /dev/null @@ -1,54 +0,0 @@ -# This is a BETA feature, please use at your own risk. -# OTel-Arrow notes: to use OTel-Arrow requires an image with the -# OTel-Arrow components built in. The collector-contrib image -# does not include these components yet, so a custom image will be -# needed. See https://github.com/lightstep/otel-collector-charts/blob/main/gateway-build.yaml -tracesCollector: - image: "ghcr.io/lightstep/otel-collector-charts/otelarrowcol-experimental:latest" - resources: - # OTel-Arrow notes: to use OTel-Arrow in a gateway configuration, - # we recommend the following adjustments: - # - # - # OTel-Arrow gateways with this resource configuration have been - # exercised at rates above 20,000 spans per second in our internal - # production setup, for reference. - limits: - cpu: 2000m - memory: 8Gi - requests: - cpu: 1500m - memory: 6Gi - config: - receivers: - otelarrow: - protocols: - grpc: - endpoint: "0.0.0.0:4317" - otlp: - protocols: - http: - endpoint: "0.0.0.0:4318" - exporters: - otelarrow: - # OTel-Arrow notes: To use OTel-Arrow during early-access - # specifically requires the following endpoint. This endpoint - # supports both OTLP and OTel-Arrow. - endpoint: spaningest.lightstep.com:443 - - # OTel-Arrow notes: these settings are specific to OTel-Arrow. 
- # To use this configuration, replace "otlp" with "otelarrow" above - # and uncomment below. - arrow: - # This prevents the OTel-Arrow exporter from falling back to - # standard OTLP in case of misconfiguration. - disable_downgrade: true - - # We recommend a small number of streams, since they consume - # substantial resources. More than one stream is recommended - # to help balance load. - num_streams: 2 - - # A stream lifetime limit is required to avoid spurious - # disconnect error messages in the collector logs. - max_stream_lifetime: 4m30s diff --git a/charts/otel-cloud-stack/values.yaml b/charts/otel-cloud-stack/values.yaml index 16d68d3..1848963 100644 --- a/charts/otel-cloud-stack/values.yaml +++ b/charts/otel-cloud-stack/values.yaml @@ -422,23 +422,36 @@ clusterCollector: exporters: [otlp, logging] ## Default collector for tracing +## +## ServiceNow recommends the OTel-Arrow exporter for sending data to +## Cloud Observability. tracesCollector: enabled: false name: traces clusterName: "" - image: otel/opentelemetry-collector-contrib:0.105.0 + image: "ghcr.io/lightstep/otel-collector-charts/otelarrowcol-experimental:latest" mode: deployment hpa: minReplicas: 1 maxReplicas: 3 - targetMemoryUtilization: 70 + targetMemoryUtilization: 60 resources: - limits: - cpu: 250m - memory: 250Mi + # OTel-Arrow gateways with this resource configuration have been + # exercised at approximately 25,000 spans per second per cpu core + # in our internal production setup, for reference. Thus, a 2 cpu + # instance may be expected to process 50,000 spans/sec. 
+ # + # When scaling these values, raise or lower the following settings + # proportionally: + # + # - concurrentbatch::max_in_flight_size_mib + # - exporters::otelarrow::arrow::num_streams requests: - cpu: 250m - memory: 250Mi + cpu: 2 + memory: 4Gi + limits: + cpu: 2 + memory: 8Gi env: - name: LS_TOKEN valueFrom: @@ -458,10 +471,14 @@ tracesCollector: detectors: [env] timeout: 2s override: false - batch: + concurrentbatch: send_batch_size: 1000 timeout: 1s send_batch_max_size: 1500 + + # Use max_in_flight_size_mib=64 in a 2 cpu configuration; + # Use max_in_flight_size_mib=1024 in a 16 cpu configuration. + max_in_flight_size_mib: 64 k8sattributes: passthrough: false pod_association: @@ -524,10 +541,50 @@ tracesCollector: - k8s.cluster.uid exporters: - otlp: + otelarrow: endpoint: ingest.lightstep.com:443 headers: "lightstep-access-token": "${LS_TOKEN}" + + # The pipeline will continue trying requests until they timeout. + # Timeout and retry settings are independent. If retry_on_failure + # is also enabled, each (retried) request will also have this + # timeout. + timeout: 30s + + # Retries are disabled by default. Since the most likely reason + # for failure is timeout, having retry-on-failure enabled implies + # dedicating a significant amount of additional memory to the task. + retry_on_failure: + enabled: false + + # Do not enable the sending queue. The concurrent batch processor + # is a better way to parallelize exports. + sending_queue: + enabled: false + + # OTel-Arrow notes: these settings are specific to OTel-Arrow + # To discard this configuration and restore the standard OTLP + # exporter, simply drop the arrow section below and replace + # "otelarrow" by "otlp" as the component name above and in + # service::pipelines::traces::exporters. + arrow: + # This prevents the OTel-Arrow exporter from falling back to + # standard OTLP in case of misconfiguration. 
+ disable_downgrade: true + + # We recommend a small number of streams, since they consume + # substantial resources. More than one stream is recommended + # to help balance load. + # + # Use num_streams=2 in a 2 cpu configuration; + # Use num_streams=12 in a 16 cpu configuration. + num_streams: 2 + + # A stream lifetime limit is required to avoid spurious + # disconnect error messages in the collector logs. + max_stream_lifetime: 4m + service: pipelines: traces: @@ -535,8 +592,8 @@ tracesCollector: processors: - resourcedetection/env - k8sattributes - - batch - exporters: [otlp] + - concurrentbatch + exporters: [otelarrow] logsCollector: enabled: false diff --git a/example/README.md b/example/README.md index 9946b09..3417af9 100644 --- a/example/README.md +++ b/example/README.md @@ -26,11 +26,11 @@ GO111MODULE=on go install go.opentelemetry.io/collector/cmd/builder@latest 2. Uncomment lines in example/config.yaml that refer to required components, such as: ```yaml -# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.103.0 -# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.103.0 -# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver v0.103.0 -# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver v0.103.0 -# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/k8sclusterreceiver v0.103.0 +# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/processor/attributesprocessor v0.107.0 +# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/prometheusreceiver v0.107.0 +# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/hostmetricsreceiver v0.107.0 +# - gomod: github.com/open-telemetry/opentelemetry-collector-contrib/receiver/kubeletstatsreceiver v0.107.0 +# - gomod: 
github.com/open-telemetry/opentelemetry-collector-contrib/receiver/k8sclusterreceiver v0.107.0 ``` ### Usage @@ -38,17 +38,17 @@ GO111MODULE=on go install go.opentelemetry.io/collector/cmd/builder@latest - Generate the custom collector binary: ```bash -builder --config ./otelcolarrow-build.yaml +builder --config ../arrow/otelcolarrow-build.yaml ``` - Run the custom collector: ```bash -./dist/otelarrowcol --config ../example/vm/config.yaml +./dist/otelarrowcol --config ./vm/config.yaml ``` #### Troubleshooting - Note this example config does not use the concurrent batch processor. Uncomment this line instead: -` - gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.103.0` +` - gomod: go.opentelemetry.io/collector/processor/batchprocessor v0.107.0` diff --git a/example/vm/config.yaml b/example/vm/config.yaml index 0dbee06..e8e4045 100644 --- a/example/vm/config.yaml +++ b/example/vm/config.yaml @@ -1,4 +1,4 @@ -# Last Collector-Contrib Validation: v0.103.0 +# Last Collector-Contrib Validation: v0.107.0 receivers: # Receivers bring data into the OpenTelemetry Collector. # Generally, a receiver accepts data in a specified format,