From be5cd26236a84c71a38f67d69f65e4e331dd8a96 Mon Sep 17 00:00:00 2001 From: David Luna Date: Tue, 9 Apr 2024 15:38:09 +0200 Subject: [PATCH 1/4] chore(host-metrics): solve some TODO comments --- .../opentelemetry-node/lib/metrics/host.js | 64 +++++++++++++------ packages/opentelemetry-node/lib/sdk.js | 16 ++++- 2 files changed, 56 insertions(+), 24 deletions(-) diff --git a/packages/opentelemetry-node/lib/metrics/host.js b/packages/opentelemetry-node/lib/metrics/host.js index 95164126..f72ef1d7 100644 --- a/packages/opentelemetry-node/lib/metrics/host.js +++ b/packages/opentelemetry-node/lib/metrics/host.js @@ -7,6 +7,9 @@ * should export all the necessary building blocks like `LastValueAccumulation` * class and `AggregatorKind` enum * - we might look for an alternate solution in this case (no proposal for now) + * + * Most of this code will be removed when https://github.com/open-telemetry/opentelemetry-js/issues/4616 + * is solved. */ /** * @typedef {import('@opentelemetry/api').HrTime} HrTime @@ -33,10 +36,12 @@ const {Aggregation, DataPointType, View} = metrics; const {HostMetrics} = require('@opentelemetry/host-metrics'); /** - * Copied from `@opentelemetry/sdk-metrics` since it's not exported + * TODO: Copied from `@opentelemetry/sdk-metrics` since it's not exported * https://github.com/open-telemetry/opentelemetry-js/blob/f86251d40fbf615be87319c8a1f5643afb820076/packages/sdk-metrics/src/aggregator/LastValue.ts#L34 * - * @todo remoce this class and require it when exported + * Remove when https://github.com/open-telemetry/opentelemetry-js/issues/4616 is fixed + * + * @todo remove this class when sdk.metrics exports it * @class * @implements {Accumulation} */ @@ -81,8 +86,9 @@ class LastValueAccumulation { * @implements {Aggregator} */ class SystemCpuUtilizationAggregator { - // TODO: hardcoded the value of `AggregatorKind` enum for GAUGE - // remove when exported + // TODO: Hardcoded the value of `AggregatorKind` enum for GAUGE. Remove + // when issue below is fixed + // Issue: https://github.com/open-telemetry/opentelemetry-js/issues/4616 // https://github.com/open-telemetry/opentelemetry-js/blob/f86251d40fbf615be87319c8a1f5643afb820076/packages/sdk-metrics/src/aggregator/types.ts#L23 kind = 2; @@ -139,8 +145,19 @@ class SystemCpuUtilizationAggregator { } /** - * Does the sum of data points grouping by `system.cpu.logical_number` so we have the total - * utilization per CPU. Basically the value would be 1 - idle_value + * Groups data points by `system.cpu.logical_number` so we have the total + * utilization per CPU. + * + * We cannot sum up the utilization of all the states since `os.cpus()` is + * not returning all of the possible states but limited to: user, nice, sys, idle, irq + * https://nodejs.org/api/all.html#all_os_oscpus + * + * where in linux we have more: user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice + * https://man7.org/linux/man-pages/man5/proc.5.html + * + * So in order to have the most accurate metric of utilization we use + * the formula 1 - (idle utilization) + * * As an example given the data points: * - { value: 0.1, attributes: { 'system.cpu.logical_number': 0, 'system.cpu.state': 'idle' } } * - { value: 0.5, attributes: { 'system.cpu.logical_number': 0, 'system.cpu.state': 'system' } } @@ -169,15 +186,6 @@ class SystemCpuUtilizationAggregator { accumulationByAttributes, endTime ) { - // We cannot sum up the utilization of all the states since `os.cpus()` is - // not returning all of the possible states but limited to: user, nice, sys, idle, irq - // https://nodejs.org/api/all.html#all_os_oscpus - // - // where in linux we have more: user, nice, system, idle, iowait, irq, softirq, steal, guest, guest_nice - // https://man7.org/linux/man-pages/man5/proc.5.html - // - // So in order to have the most accurate metric of utilization we use - // the formula 1 - (idle utilization) return { descriptor, aggregationTemporality, @@ -206,27 +214,41 @@ class SystemCpuUtilizationAggregation extends Aggregation { /** @type {HostMetrics} */ let hostMetricsInstance; function enableHostMetrics() { - // TODO: make this configurable, user might collect host metrics with a separate utility + // NOTE: we set the scope name to the package name `@opentelemetry/host-metrics` like + // other instrumentations do. This way we can differentiate if the user collects the + // host metrics with a different utility or package hostMetricsInstance = new HostMetrics({ - name: '', + name: '@opentelemetry/host-metrics', }); hostMetricsInstance.start(); } +// It is known that host metrics sends a lot of data so for now we drop some +// instruments that are not handled by Kibana and doing aggregations +// for others that we want to include shorly (CPU metrics) +// Ref (data amount issue): https://github.com/elastic/elastic-otel-node/issues/51 +// Ref (metrics in Kibana): https://github.com/elastic/kibana/pull/174700 +// TODO: if metrics filter config becomes a thing we may want to convert this to a +// function that receives the filter as a param (or gets it from env) /** @type {metrics.View[]} */ const HOST_METRICS_VIEWS = [ - // drop `system.network.*` metrics for now + // drop `system.network.*` (not in Kibana) new View({ instrumentName: 'system.network.*', aggregation: Aggregation.Drop(), }), - // drop `system.cpu.time` also - // TODO: check if we can do an aggregation here + // drop `system.cpu.time` (not in Kibana) new View({ instrumentName: 'system.cpu.time', aggregation: Aggregation.Drop(), }), - // use the aggregation we craeted above + // drop `process.*` (not in Kibana) + new View({ + instrumentName: 'process.*', + aggregation: Aggregation.Drop(), + }), + // Do an aggregation to avoid cardinality problems because of the possible + // permutations of state & logical_number attributes new View({ instrumentName: 'system.cpu.utilization', aggregation: new SystemCpuUtilizationAggregation(), diff --git a/packages/opentelemetry-node/lib/sdk.js b/packages/opentelemetry-node/lib/sdk.js index 8f93ddb5..47850ff1 100644 --- a/packages/opentelemetry-node/lib/sdk.js +++ b/packages/opentelemetry-node/lib/sdk.js @@ -64,14 +64,24 @@ class ElasticNodeSDK extends NodeSDK { // TODO metrics exporter should do for metrics what `TracerProviderWithEnvExporters` does for traces, does that include `url` export endpoint? // TODO what `temporalityPreference`? // TODO make the Millis values configurable. What would otel java do? - // TODO config to disable metrics? E.g. otherwise `http.server.duration` will send every period forever and data can be distracting - const metricsDisabled = process.env.ETEL_METRICS_DISABLED === 'true'; // TODO hack for now + // References about metrics config + // - java: https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure + // has no references to timeouts or intervals + // - net: https://opentelemetry.io/docs/languages/net/automatic/config/ + // defines OTEL_METRIC_EXPORT_INTERVAL and OTEL_METRIC_EXPORT_TIMEOUT + + // Disble metrics by config + // E.g. otherwise `http.server.duration` will send every period forever and data can be distracting + // - In this 1st pass we have a config to disable all + // - In a 2nd pass we could add another config to filter the metrics being sent and provide a default + // filter. A list of coma separated patterns would be okay to use it in env and also a config file + const metricsDisabled = + process.env.ELASTIC_OTEL_METRICS_DISABLED === 'true'; if (!metricsDisabled) { const metricsInterval = Number(process.env.ETEL_METRICS_INTERVAL_MS) || 30000; defaultConfig.metricReader = new metrics.PeriodicExportingMetricReader({ - // exporter: new metrics.ConsoleMetricExporter(), exporter: new OTLPMetricExporter(), exportIntervalMillis: metricsInterval, exportTimeoutMillis: metricsInterval, // TODO same val appropriate for timeout? From 4e2a97b32c1c258aeb4d764cd567075f95ddea46 Mon Sep 17 00:00:00 2001 From: David Luna Date: Tue, 9 Apr 2024 19:48:52 +0200 Subject: [PATCH 2/4] chore(host-metrics): remove TODO --- packages/opentelemetry-node/lib/metrics/host.js | 2 -- 1 file changed, 2 deletions(-) diff --git a/packages/opentelemetry-node/lib/metrics/host.js b/packages/opentelemetry-node/lib/metrics/host.js index f72ef1d7..c55d863e 100644 --- a/packages/opentelemetry-node/lib/metrics/host.js +++ b/packages/opentelemetry-node/lib/metrics/host.js @@ -228,8 +228,6 @@ function enableHostMetrics() { // for others that we want to include shorly (CPU metrics) // Ref (data amount issue): https://github.com/elastic/elastic-otel-node/issues/51 // Ref (metrics in Kibana): https://github.com/elastic/kibana/pull/174700 -// TODO: if metrics filter config becomes a thing we may want to convert this to a -// function that receives the filter as a param (or gets it from env) /** @type {metrics.View[]} */ const HOST_METRICS_VIEWS = [ // drop `system.network.*` (not in Kibana) From 2c213d40561bbd8bb9ea17ddfe22eaf73920b5c9 Mon Sep 17 00:00:00 2001 From: David Luna Date: Tue, 9 Apr 2024 19:54:48 +0200 Subject: [PATCH 3/4] chore(host-metrics): remove comments about metrics disabling --- packages/opentelemetry-node/lib/sdk.js | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/packages/opentelemetry-node/lib/sdk.js b/packages/opentelemetry-node/lib/sdk.js index 13aa564f..834585da 100644 --- a/packages/opentelemetry-node/lib/sdk.js +++ b/packages/opentelemetry-node/lib/sdk.js @@ -83,17 +83,8 @@ class ElasticNodeSDK extends NodeSDK { // TODO metrics exporter should do for metrics what `TracerProviderWithEnvExporters` does for traces, does that include `url` export endpoint? // TODO what `temporalityPreference`? // TODO make the Millis values configurable. What would otel java do? - // References about metrics config - // - java: https://github.com/open-telemetry/opentelemetry-java/tree/main/sdk-extensions/autoconfigure - // has no references to timeouts or intervals - // - net: https://opentelemetry.io/docs/languages/net/automatic/config/ - // defines OTEL_METRIC_EXPORT_INTERVAL and OTEL_METRIC_EXPORT_TIMEOUT - - // Disble metrics by config - // E.g. otherwise `http.server.duration` will send every period forever and data can be distracting - // - In this 1st pass we have a config to disable all - // - In a 2nd pass we could add another config to filter the metrics being sent and provide a default - // filter. A list of coma separated patterns would be okay to use it in env and also a config file + + // Disable metrics by config const metricsDisabled = process.env.ELASTIC_OTEL_METRICS_DISABLED === 'true'; if (!metricsDisabled) { From 8d0fc1be37fe0b6c1690baed49da16ea4bdef055 Mon Sep 17 00:00:00 2001 From: David Luna Date: Tue, 9 Apr 2024 20:00:13 +0200 Subject: [PATCH 4/4] chore(host-metrics): pass empty config to host metrics --- packages/opentelemetry-node/lib/metrics/host.js | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/opentelemetry-node/lib/metrics/host.js b/packages/opentelemetry-node/lib/metrics/host.js index 995521b2..393ef108 100644 --- a/packages/opentelemetry-node/lib/metrics/host.js +++ b/packages/opentelemetry-node/lib/metrics/host.js @@ -233,12 +233,8 @@ class SystemCpuUtilizationAggregation extends Aggregation { /** @type {HostMetrics} */ let hostMetricsInstance; function enableHostMetrics() { - // NOTE: we set the scope name to the package name `@opentelemetry/host-metrics` like - // other instrumentations do. This way we can differentiate if the user collects the - // host metrics with a different utility or package - hostMetricsInstance = new HostMetrics({ - name: '@opentelemetry/host-metrics', - }); + // @ts-ignore - config interface expects a `name` property but there is a default value + hostMetricsInstance = new HostMetrics({}); hostMetricsInstance.start(); }