From c7eb5b58d2dbd15dfa9ed02dc8a3cd9b6b80aac0 Mon Sep 17 00:00:00 2001 From: Suzy Wang Date: Wed, 16 Oct 2024 07:04:48 -0700 Subject: [PATCH 1/2] test(opentelemetry): otel metrics --- .../09-hybrid_mode/09-config-compat_spec.lua | 2 + .../37-opentelemetry/01-otlp_spec.lua | 65 +++++++++++++++++++ spec/fixtures/opentelemetry/otelcol.yaml | 4 ++ 3 files changed, 71 insertions(+) diff --git a/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua b/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua index 6d9f2842c6fe..0b37e8d079f8 100644 --- a/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua +++ b/spec/02-integration/09-hybrid_mode/09-config-compat_spec.lua @@ -255,6 +255,7 @@ describe("CP/DP config compat transformations #" .. strategy, function() expected_otel_prior_35.config.propagation = nil expected_otel_prior_35.config.traces_endpoint = nil expected_otel_prior_35.config.logs_endpoint = nil + expected_otel_prior_35.config.metrics_endpoint = nil expected_otel_prior_35.config.endpoint = "http://1.1.1.1:12345/v1/trace" expected_otel_prior_35.config.queue.concurrency_limit = nil @@ -281,6 +282,7 @@ describe("CP/DP config compat transformations #" .. 
strategy, function() expected_otel_prior_34.config.propagation = nil expected_otel_prior_34.config.traces_endpoint = nil expected_otel_prior_34.config.logs_endpoint = nil + expected_otel_prior_34.config.metrics_endpoint = nil expected_otel_prior_34.config.endpoint = "http://1.1.1.1:12345/v1/trace" expected_otel_prior_34.config.queue.concurrency_limit = nil do_assert(uuid(), "3.3.0", expected_otel_prior_34) diff --git a/spec/03-plugins/37-opentelemetry/01-otlp_spec.lua b/spec/03-plugins/37-opentelemetry/01-otlp_spec.lua index 07add4f743cc..ec5030063c01 100644 --- a/spec/03-plugins/37-opentelemetry/01-otlp_spec.lua +++ b/spec/03-plugins/37-opentelemetry/01-otlp_spec.lua @@ -9,6 +9,7 @@ local time_ns = require("kong.tools.time").time_ns local deep_copy = require("kong.tools.table").deep_copy local insert = table.insert local tracer = kong.tracing.new("test") +local math_rand = math.random local function table_compare(expected, passed) if type(expected) ~= type(passed) then @@ -51,6 +52,14 @@ local pb_decode_log = function(data) return pb.decode("opentelemetry.proto.logs.v1.LogRecord", data) end +local pb_encode_metrics = function(data) + return pb.encode("opentelemetry.proto.metrics.v1.Metric", data) +end + +local pb_decode_metrics = function(data) + return pb.decode("opentelemetry.proto.metrics.v1.Metric", data) +end + describe("Plugin: opentelemetry (otlp)", function() local old_ngx_get_phase @@ -168,6 +177,62 @@ describe("Plugin: opentelemetry (otlp)", function() end end) + it("encode/decode pb (metrics)", function () + local N = 1000 + local metric_type = {"sum", "histogram", "gauge"} + local otel_to_prom_metric = { + ["sum"] = "counter", + ["histogram"] = "histogram", + ["gauge"] = "gauge", + } + local test_metrics = [[ + # HELP kong_bandwidth_bytes Total bandwidth (ingress/egress) throughput in bytes + # TYPE kong_bandwidth_bytes counter + kong_bandwidth_bytes{service="kong",route="kong.route-1",direction="egress",consumer=""} 264 + 
kong_bandwidth_bytes{service="kong",route="kong.route-1",direction="ingress",consumer=""} 93]] + local data + for i = 1, N do + local name = "kong_metric_" .. tostring(i) + local m_type = metric_type[math_rand(#metric_type)] + local help = "# HELP " .. name .. " description of " .. name + local typ = "# TYPE " .. name .. " " .. otel_to_prom_metric[m_type] + if m_type == "sum" then + data = name.."{service=\"kong_oss\",route=\"kong.route\",direction=\"ingress\",consumer=\"\"} ".. tonumber(math_rand(100)) + elseif m_type == "histogram" then + data = name.."{service=\"kong\",route=\"kong.route\",le=\"" .. tostring(math_rand(100)).. "\"} ".. tostring(math_rand(100)) + data = data .. "\n" .. name.."{service=\"kong\",route=\"kong.route-1\",le=\"" .. tostring(math_rand(100)).. "\"} ".. tostring(math_rand(100)) + else + data = name.."{node_id=\"849373c5-45c1-4c1d-b595-fdeaea6daed8\",subsystem=\"http\"} ".. tostring(math_rand(100)) + end + local metric = "\n" .. help .. "\n" .. typ .. "\n" .. data + test_metrics = test_metrics .. 
metric + end + + local metric_seg_start = 0 + local string_div = 2 + local metric_seg_end, div = string.find(test_metrics, "# HELP", string_div, true) + string_div = div +1 + + while metric_seg_start ~= metric_seg_end do + local metric = string.sub(test_metrics, metric_seg_start, metric_seg_end-1) + local pb_metric = otlp.transform_metric(metric) + local pb_data = pb_encode_metrics(pb_metric) + local decoded_metric = pb_decode_metrics(pb_data) + + local ok, err = table_compare(pb_metric, decoded_metric) + assert.is_true(ok, err) + + metric_seg_start = metric_seg_end + metric_seg_end, div = string.find(test_metrics, "# HELP", string_div, true) + + if not metric_seg_end then + -- use one past the end so the inclusive sub() above keeps the final byte of the last metric + metric_seg_end = #test_metrics + 1 + else + string_div = div+1 + end + end + end) + it("check lengths of trace_id and span_id ", function () local TRACE_ID_LEN, PARENT_SPAN_ID_LEN = 16, 8 local default_span = { diff --git a/spec/fixtures/opentelemetry/otelcol.yaml b/spec/fixtures/opentelemetry/otelcol.yaml index a15acde86bfe..3cc0c8f67b18 100644 --- a/spec/fixtures/opentelemetry/otelcol.yaml +++ b/spec/fixtures/opentelemetry/otelcol.yaml @@ -30,3 +30,7 @@ service: receivers: [otlp] processors: [batch] exporters: [logging, file] + metrics: + receivers: [otlp] + processors: [batch] + exporters: [logging, file] \ No newline at end of file From 76fae49e58d54c3e91bb6a92c8c95d18380552bd Mon Sep 17 00:00:00 2001 From: Suzy Wang Date: Wed, 16 Oct 2024 07:07:51 -0700 Subject: [PATCH 2/2] feat(opentelemetry): otel metrics --- changelog/unreleased/kong/otel-metrics.yml | 3 + kong-3.9.0-0.rockspec | 1 + kong/clustering/compat/removed_fields.lua | 1 + .../metrics/v1/metrics_service.proto | 79 ++ .../proto/metrics/v1/metrics.proto | 708 ++++++++++++++++++ kong/plugins/opentelemetry/handler.lua | 17 +- kong/plugins/opentelemetry/metrics.lua | 80 ++ kong/plugins/opentelemetry/otlp.lua | 131 +++- kong/plugins/opentelemetry/proto.lua | 2 + kong/plugins/opentelemetry/schema.lua | 2 + 10 files changed, 1021
insertions(+), 3 deletions(-) create mode 100644 changelog/unreleased/kong/otel-metrics.yml create mode 100644 kong/include/opentelemetry/proto/collector/metrics/v1/metrics_service.proto create mode 100644 kong/include/opentelemetry/proto/metrics/v1/metrics.proto create mode 100644 kong/plugins/opentelemetry/metrics.lua diff --git a/changelog/unreleased/kong/otel-metrics.yml b/changelog/unreleased/kong/otel-metrics.yml new file mode 100644 index 000000000000..e9a1b30bc9a6 --- /dev/null +++ b/changelog/unreleased/kong/otel-metrics.yml @@ -0,0 +1,3 @@ +message: "**OpenTelemetry:** Added support for OpenTelemetry metrics." +type: feature +scope: Plugin diff --git a/kong-3.9.0-0.rockspec b/kong-3.9.0-0.rockspec index 6bf6989b3334..857b901c8164 100644 --- a/kong-3.9.0-0.rockspec +++ b/kong-3.9.0-0.rockspec @@ -594,6 +594,7 @@ build = { ["kong.plugins.opentelemetry.otlp"] = "kong/plugins/opentelemetry/otlp.lua", ["kong.plugins.opentelemetry.traces"] = "kong/plugins/opentelemetry/traces.lua", ["kong.plugins.opentelemetry.logs"] = "kong/plugins/opentelemetry/logs.lua", + ["kong.plugins.opentelemetry.metrics"] = "kong/plugins/opentelemetry/metrics.lua", ["kong.plugins.opentelemetry.utils"] = "kong/plugins/opentelemetry/utils.lua", ["kong.plugins.ai-proxy.handler"] = "kong/plugins/ai-proxy/handler.lua", diff --git a/kong/clustering/compat/removed_fields.lua b/kong/clustering/compat/removed_fields.lua index 00bafc650f8d..b849baf8ccb8 100644 --- a/kong/clustering/compat/removed_fields.lua +++ b/kong/clustering/compat/removed_fields.lua @@ -166,6 +166,7 @@ return { opentelemetry = { "traces_endpoint", "logs_endpoint", + "metrics_endpoint", "queue.concurrency_limit", }, ai_proxy = { diff --git a/kong/include/opentelemetry/proto/collector/metrics/v1/metrics_service.proto b/kong/include/opentelemetry/proto/collector/metrics/v1/metrics_service.proto new file mode 100644 index 000000000000..aef200125ede --- /dev/null +++ 
b/kong/include/opentelemetry/proto/collector/metrics/v1/metrics_service.proto @@ -0,0 +1,79 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.collector.metrics.v1; + +import "opentelemetry/proto/metrics/v1/metrics.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Collector.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.collector.metrics.v1"; +option java_outer_classname = "MetricsServiceProto"; +option go_package = "go.opentelemetry.io/proto/otlp/collector/metrics/v1"; + +// Service that can be used to push metrics between one Application +// instrumented with OpenTelemetry and a collector, or between a collector and a +// central collector. +service MetricsService { + // For performance reasons, it is recommended to keep this RPC + // alive for the entire life of the application. + rpc Export(ExportMetricsServiceRequest) returns (ExportMetricsServiceResponse) {} +} + +message ExportMetricsServiceRequest { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain one + // element. Intermediary nodes (such as OpenTelemetry Collector) that receive + // data from multiple origins typically batch the data before forwarding further and + // in that case this array will contain multiple elements. 
+ repeated opentelemetry.proto.metrics.v1.ResourceMetrics resource_metrics = 1; +} + +message ExportMetricsServiceResponse { + // The details of a partially successful export request. + // + // If the request is only partially accepted + // (i.e. when the server accepts only parts of the data and rejects the rest) + // the server MUST initialize the `partial_success` field and MUST + // set the `rejected_` with the number of items it rejected. + // + // Servers MAY also make use of the `partial_success` field to convey + // warnings/suggestions to senders even when the request was fully accepted. + // In such cases, the `rejected_` MUST have a value of `0` and + // the `error_message` MUST be non-empty. + // + // A `partial_success` message with an empty value (rejected_ = 0 and + // `error_message` = "") is equivalent to it not being set/present. Senders + // SHOULD interpret it the same way as in the full success case. + ExportMetricsPartialSuccess partial_success = 1; +} + +message ExportMetricsPartialSuccess { + // The number of rejected data points. + // + // A `rejected_` field holding a `0` value indicates that the + // request was fully accepted. + int64 rejected_data_points = 1; + + // A developer-facing human-readable message in English. It should be used + // either to explain why the server rejected parts of the data during a partial + // success or to convey warnings/suggestions during a full success. The message + // should offer guidance on how users can address such issues. + // + // error_message is an optional field. An error_message with an empty value + // is equivalent to it not being set. 
+ string error_message = 2; +} \ No newline at end of file diff --git a/kong/include/opentelemetry/proto/metrics/v1/metrics.proto b/kong/include/opentelemetry/proto/metrics/v1/metrics.proto new file mode 100644 index 000000000000..509cbfdf418f --- /dev/null +++ b/kong/include/opentelemetry/proto/metrics/v1/metrics.proto @@ -0,0 +1,708 @@ +// Copyright 2019, OpenTelemetry Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto3"; + +package opentelemetry.proto.metrics.v1; + +import "opentelemetry/proto/common/v1/common.proto"; +import "opentelemetry/proto/resource/v1/resource.proto"; + +option csharp_namespace = "OpenTelemetry.Proto.Metrics.V1"; +option java_multiple_files = true; +option java_package = "io.opentelemetry.proto.metrics.v1"; +option java_outer_classname = "MetricsProto"; +option go_package = "go.opentelemetry.io/proto/otlp/metrics/v1"; + +// MetricsData represents the metrics data that can be stored in a persistent +// storage, OR can be embedded by other protocols that transfer OTLP metrics +// data but do not implement the OTLP protocol. 
+// +// MetricsData +// └─── ResourceMetrics +// ├── Resource +// ├── SchemaURL +// └── ScopeMetrics +// ├── Scope +// ├── SchemaURL +// └── Metric +// ├── Name +// ├── Description +// ├── Unit +// └── data +// ├── Gauge +// ├── Sum +// ├── Histogram +// ├── ExponentialHistogram +// └── Summary +// +// The main difference between this message and collector protocol is that +// in this message there will not be any "control" or "metadata" specific to +// OTLP protocol. +// +// When new fields are added into this message, the OTLP request MUST be updated +// as well. +message MetricsData { + // An array of ResourceMetrics. + // For data coming from a single resource this array will typically contain + // one element. Intermediary nodes that receive data from multiple origins + // typically batch the data before forwarding further and in that case this + // array will contain multiple elements. + repeated ResourceMetrics resource_metrics = 1; +} + +// A collection of ScopeMetrics from a Resource. +message ResourceMetrics { + reserved 1000; + + // The resource for the metrics in this message. + // If this field is not set then no resource info is known. + opentelemetry.proto.resource.v1.Resource resource = 1; + + // A list of metrics that originate from a resource. + repeated ScopeMetrics scope_metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the resource data + // is recorded in. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to the data in the "resource" field. It does not apply + // to the data in the "scope_metrics" field which have their own schema_url field. + string schema_url = 3; +} + +// A collection of Metrics produced by an Scope. +message ScopeMetrics { + // The instrumentation scope information for the metrics in this message. 
+ // Semantically when InstrumentationScope isn't set, it is equivalent with + // an empty instrumentation scope name (unknown). + opentelemetry.proto.common.v1.InstrumentationScope scope = 1; + + // A list of metrics that originate from an instrumentation library. + repeated Metric metrics = 2; + + // The Schema URL, if known. This is the identifier of the Schema that the metric data + // is recorded in. To learn more about Schema URL see + // https://opentelemetry.io/docs/specs/otel/schemas/#schema-url + // This schema_url applies to all metrics in the "metrics" field. + string schema_url = 3; +} + +// Defines a Metric which has one or more timeseries. The following is a +// brief summary of the Metric data model. For more details, see: +// +// https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/metrics/data-model.md +// +// The data model and relation between entities is shown in the +// diagram below. Here, "DataPoint" is the term used to refer to any +// one of the specific data point value types, and "points" is the term used +// to refer to any one of the lists of points contained in the Metric. +// +// - Metric is composed of a metadata and data. +// - Metadata part contains a name, description, unit. +// - Data is one of the possible types (Sum, Gauge, Histogram, Summary). +// - DataPoint contains timestamps, attributes, and one of the possible value type +// fields. +// +// Metric +// +------------+ +// |name | +// |description | +// |unit | +------------------------------------+ +// |data |---> |Gauge, Sum, Histogram, Summary, ... | +// +------------+ +------------------------------------+ +// +// Data [One of Gauge, Sum, Histogram, Summary, ...] +// +-----------+ +// |... | // Metadata about the Data. 
+// |points |--+ +// +-----------+ | +// | +---------------------------+ +// | |DataPoint 1 | +// v |+------+------+ +------+ | +// +-----+ ||label |label |...|label | | +// | 1 |-->||value1|value2|...|valueN| | +// +-----+ |+------+------+ +------+ | +// | . | |+-----+ | +// | . | ||value| | +// | . | |+-----+ | +// | . | +---------------------------+ +// | . | . +// | . | . +// | . | . +// | . | +---------------------------+ +// | . | |DataPoint M | +// +-----+ |+------+------+ +------+ | +// | M |-->||label |label |...|label | | +// +-----+ ||value1|value2|...|valueN| | +// |+------+------+ +------+ | +// |+-----+ | +// ||value| | +// |+-----+ | +// +---------------------------+ +// +// Each distinct type of DataPoint represents the output of a specific +// aggregation function, the result of applying the DataPoint's +// associated function of to one or more measurements. +// +// All DataPoint types have three common fields: +// - Attributes includes key-value pairs associated with the data point +// - TimeUnixNano is required, set to the end time of the aggregation +// - StartTimeUnixNano is optional, but strongly encouraged for DataPoints +// having an AggregationTemporality field, as discussed below. +// +// Both TimeUnixNano and StartTimeUnixNano values are expressed as +// UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January 1970. +// +// # TimeUnixNano +// +// This field is required, having consistent interpretation across +// DataPoint types. TimeUnixNano is the moment corresponding to when +// the data point's aggregate value was captured. +// +// Data points with the 0 value for TimeUnixNano SHOULD be rejected +// by consumers. +// +// # StartTimeUnixNano +// +// StartTimeUnixNano in general allows detecting when a sequence of +// observations is unbroken. 
This field indicates to consumers the +// start time for points with cumulative and delta +// AggregationTemporality, and it should be included whenever possible +// to support correct rate calculation. Although it may be omitted +// when the start time is truly unknown, setting StartTimeUnixNano is +// strongly encouraged. +message Metric { + reserved 4, 6, 8; + + // name of the metric. + string name = 1; + + // description of the metric, which can be used in documentation. + string description = 2; + + // unit in which the metric value is reported. Follows the format + // described by http://unitsofmeasure.org/ucum.html. + string unit = 3; + + // Data determines the aggregation type (if any) of the metric, what is the + // reported value type for the data points, as well as the relatationship to + // the time interval over which they are reported. + oneof data { + Gauge gauge = 5; + Sum sum = 7; + Histogram histogram = 9; + ExponentialHistogram exponential_histogram = 10; + Summary summary = 11; + } + + // Additional metadata attributes that describe the metric. [Optional]. + // Attributes are non-identifying. + // Consumers SHOULD NOT need to be aware of these attributes. + // These attributes MAY be used to encode information allowing + // for lossless roundtrip translation to / from another data model. + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue metadata = 12; +} + +// Gauge represents the type of a scalar metric that always exports the +// "current value" for every data point. It should be used for an "unknown" +// aggregation. +// +// A Gauge does not support different aggregation temporalities. Given the +// aggregation is unknown, points cannot be combined using the same +// aggregation, regardless of aggregation temporalities. Therefore, +// AggregationTemporality is not included. 
Consequently, this also means +// "StartTimeUnixNano" is ignored for all data points. +message Gauge { + repeated NumberDataPoint data_points = 1; +} + +// Sum represents the type of a scalar metric that is calculated as a sum of all +// reported measurements over a time interval. +message Sum { + repeated NumberDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; + + // If "true" means that the sum is monotonic. + bool is_monotonic = 3; +} + +// Histogram represents the type of a metric that is calculated by aggregating +// as a Histogram of all reported measurements over a time interval. +message Histogram { + repeated HistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// ExponentialHistogram represents the type of a metric that is calculated by aggregating +// as a ExponentialHistogram of all reported double measurements over a time interval. +message ExponentialHistogram { + repeated ExponentialHistogramDataPoint data_points = 1; + + // aggregation_temporality describes if the aggregator reports delta changes + // since last report time, or cumulative changes since a fixed start time. + AggregationTemporality aggregation_temporality = 2; +} + +// Summary metric data are used to convey quantile summaries, +// a Prometheus (see: https://prometheus.io/docs/concepts/metric_types/#summary) +// and OpenMetrics (see: https://github.com/OpenObservability/OpenMetrics/blob/4dbf6075567ab43296eed941037c12951faafb92/protos/prometheus.proto#L45) +// data type. These data points cannot always be merged in a meaningful way. 
+// While they can be useful in some applications, histogram data points are +// recommended for new applications. +message Summary { + repeated SummaryDataPoint data_points = 1; +} + +// AggregationTemporality defines how a metric aggregator reports aggregated +// values. It describes how those values relate to the time interval over +// which they are aggregated. +enum AggregationTemporality { + // UNSPECIFIED is the default AggregationTemporality, it MUST not be used. + AGGREGATION_TEMPORALITY_UNSPECIFIED = 0; + + // DELTA is an AggregationTemporality for a metric aggregator which reports + // changes since last report time. Successive metrics contain aggregation of + // values from continuous and non-overlapping intervals. + // + // The values for a DELTA metric are based only on the time interval + // associated with one measurement cycle. There is no dependency on + // previous measurements like is the case for CUMULATIVE metrics. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // DELTA metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0+1 to + // t_0+2 with a value of 2. + AGGREGATION_TEMPORALITY_DELTA = 1; + + // CUMULATIVE is an AggregationTemporality for a metric aggregator which + // reports changes since a fixed start time. 
This means that current values + // of a CUMULATIVE metric depend on all previous measurements since the + // start time. Because of this, the sender is required to retain this state + // in some form. If this state is lost or invalidated, the CUMULATIVE metric + // values MUST be reset and a new fixed start time following the last + // reported measurement time sent MUST be used. + // + // For example, consider a system measuring the number of requests that + // it receives and reports the sum of these requests every second as a + // CUMULATIVE metric: + // + // 1. The system starts receiving at time=t_0. + // 2. A request is received, the system measures 1 request. + // 3. A request is received, the system measures 1 request. + // 4. A request is received, the system measures 1 request. + // 5. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+1 with a value of 3. + // 6. A request is received, the system measures 1 request. + // 7. A request is received, the system measures 1 request. + // 8. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_0 to + // t_0+2 with a value of 5. + // 9. The system experiences a fault and loses state. + // 10. The system recovers and resumes receiving at time=t_1. + // 11. A request is received, the system measures 1 request. + // 12. The 1 second collection cycle ends. A metric is exported for the + // number of requests received over the interval of time t_1 to + // t_0+1 with a value of 1. + // + // Note: Even though, when reporting changes since last report time, using + // CUMULATIVE is valid, it is not recommended. This may cause problems for + // systems that do not use start_time to determine when the aggregation + // value was reset (e.g. Prometheus). 
+ AGGREGATION_TEMPORALITY_CUMULATIVE = 2; +} + +// DataPointFlags is defined as a protobuf 'uint32' type and is to be used as a +// bit-field representing 32 distinct boolean flags. Each flag defined in this +// enum is a bit-mask. To test the presence of a single flag in the flags of +// a data point, for example, use an expression like: +// +// (point.flags & DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK) == DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK +// +enum DataPointFlags { + // The zero value for the enum. Should not be used for comparisons. + // Instead use bitwise "and" with the appropriate mask as shown above. + DATA_POINT_FLAGS_DO_NOT_USE = 0; + + // This DataPoint is valid but has no recorded value. This value + // SHOULD be used to reflect explicitly missing data in a series, as + // for an equivalent to the Prometheus "staleness marker". + DATA_POINT_FLAGS_NO_RECORDED_VALUE_MASK = 1; + + // Bits 2-31 are reserved for future use. +} + +// NumberDataPoint is a single data point in a timeseries that describes the +// time-varying scalar value of a metric. +message NumberDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // The value itself. 
A point is considered invalid when one of the recognized + // value fields is not present inside this oneof. + oneof value { + double as_double = 4; + sfixed64 as_int = 6; + } + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 5; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// HistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Histogram. A Histogram contains summary statistics +// for a population of values, it may optionally contain the distribution of +// those values across a set of buckets. +// +// If the histogram contains the distribution of values, then both +// "explicit_bounds" and "bucket counts" fields must be defined. +// If the histogram does not contain the distribution of values, then both +// "explicit_bounds" and "bucket_counts" must be omitted and only "count" and +// "sum" are known. +message HistogramDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 9; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. 
This + // value must be equal to the sum of the "count" fields in buckets if a + // histogram is provided. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#histogram + double sum = 5; + + // bucket_counts is an optional field contains the count values of histogram + // for each bucket. + // + // The sum of the bucket_counts must equal the value in the count field. + // + // The number of elements in bucket_counts array must be by one greater than + // the number of elements in explicit_bounds array. + repeated fixed64 bucket_counts = 6; + + // explicit_bounds specifies buckets with explicitly defined bounds for values. + // + // The boundaries for bucket at index i are: + // + // (-infinity, explicit_bounds[i]] for i == 0 + // (explicit_bounds[i-1], explicit_bounds[i]] for 0 < i < size(explicit_bounds) + // (explicit_bounds[i-1], +infinity) for i == size(explicit_bounds) + // + // The values in the explicit_bounds array must be strictly increasing. + // + // Histogram buckets are inclusive of their upper boundary, except the last + // bucket where the boundary is at infinity. This format is intentionally + // compatible with the OpenMetrics histogram definition. + repeated double explicit_bounds = 7; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 8; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. 
+ uint32 flags = 10; + + // min is the minimum value over (start_time, end_time]. + double min = 11; + + // max is the maximum value over (start_time, end_time]. + double max = 12; +} + +// ExponentialHistogramDataPoint is a single data point in a timeseries that describes the +// time-varying values of a ExponentialHistogram of double values. A ExponentialHistogram contains +// summary statistics for a population of values, it may optionally contain the +// distribution of those values across a set of buckets. +// +message ExponentialHistogramDataPoint { + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 1; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be + // non-negative. This value must be equal to the sum of the "bucket_counts" + // values in the positive and negative Buckets plus the "zero_count" field. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. 
This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#histogram + double sum = 5; + + // scale describes the resolution of the histogram. Boundaries are + // located at powers of the base, where: + // + // base = (2^(2^-scale)) + // + // The histogram bucket identified by `index`, a signed integer, + // contains values that are greater than (base^index) and + // less than or equal to (base^(index+1)). + // + // The positive and negative ranges of the histogram are expressed + // separately. Negative values are mapped by their absolute value + // into the negative range using the same scale as the positive range. + // + // scale is not restricted by the protocol, as the permissible + // values depend on the range of the data. + sint32 scale = 6; + + // zero_count is the count of values that are either exactly zero or + // within the region considered zero by the instrumentation at the + // tolerated degree of precision. This bucket stores values that + // cannot be expressed using the standard exponential formula as + // well as values that have been rounded to zero. + // + // Implementations MAY consider the zero bucket to have probability + // mass equal to (zero_count / count). + fixed64 zero_count = 7; + + // positive carries the positive range of exponential bucket counts. + Buckets positive = 8; + + // negative carries the negative range of exponential bucket counts. + Buckets negative = 9; + + // Buckets are a set of bucket counts, encoded in a contiguous array + // of counts. + message Buckets { + // Offset is the bucket index of the first entry in the bucket_counts array. + // + // Note: This uses a varint encoding as a simple form of compression. + sint32 offset = 1; + + // bucket_counts is an array of count values, where bucket_counts[i] carries + // the count of the bucket at index (offset+i). 
bucket_counts[i] is the count + // of values greater than base^(offset+i) and less than or equal to + // base^(offset+i+1). + // + // Note: By contrast, the explicit HistogramDataPoint uses + // fixed64. This field is expected to have many buckets, + // especially zeros, so uint64 has been selected to ensure + // varint encoding. + repeated uint64 bucket_counts = 2; + } + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 10; + + // (Optional) List of exemplars collected from + // measurements that were used to form the data point + repeated Exemplar exemplars = 11; + + // min is the minimum value over (start_time, end_time]. + double min = 12; + + // max is the maximum value over (start_time, end_time]. + double max = 13; + + // ZeroThreshold may be optionally set to convey the width of the zero + // region. Where the zero region is defined as the closed interval + // [-ZeroThreshold, ZeroThreshold]. + // When ZeroThreshold is 0, zero count bucket stores values that cannot be + // expressed using the standard exponential formula as well as values that + // have been rounded to zero. + double zero_threshold = 14; +} + +// SummaryDataPoint is a single data point in a timeseries that describes the +// time-varying values of a Summary metric. +message SummaryDataPoint { + reserved 1; + + // The set of key/value pairs that uniquely identify the timeseries from + // where this point belongs. The list may be empty (may contain 0 elements). + // Attribute keys MUST be unique (it is not allowed to have more than one + // attribute with the same key). + repeated opentelemetry.proto.common.v1.KeyValue attributes = 7; + + // StartTimeUnixNano is optional but strongly encouraged, see the + // the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. 
+ fixed64 start_time_unix_nano = 2; + + // TimeUnixNano is required, see the detailed comments above Metric. + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 3; + + // count is the number of values in the population. Must be non-negative. + fixed64 count = 4; + + // sum of the values in the population. If count is zero then this field + // must be zero. + // + // Note: Sum should only be filled out when measuring non-negative discrete + // events, and is assumed to be monotonic over the values of these events. + // Negative events *can* be recorded, but sum should not be filled out when + // doing so. This is specifically to enforce compatibility w/ OpenMetrics, + // see: https://github.com/OpenObservability/OpenMetrics/blob/main/specification/OpenMetrics.md#summary + double sum = 5; + + // Represents the value at a given quantile of a distribution. + // + // To record Min and Max values following conventions are used: + // - The 1.0 quantile is equivalent to the maximum value observed. + // - The 0.0 quantile is equivalent to the minimum value observed. + // + // See the following issue for more context: + // https://github.com/open-telemetry/opentelemetry-proto/issues/125 + message ValueAtQuantile { + // The quantile of a distribution. Must be in the interval + // [0.0, 1.0]. + double quantile = 1; + + // The value at the given quantile of a distribution. + // + // Quantile values must NOT be negative. + double value = 2; + } + + // (Optional) list of values at different quantiles of the distribution calculated + // from the current snapshot. The quantiles must be strictly increasing. + repeated ValueAtQuantile quantile_values = 6; + + // Flags that apply to this specific data point. See DataPointFlags + // for the available flags and their meaning. + uint32 flags = 8; +} + +// A representation of an exemplar, which is a sample input measurement. 
+// Exemplars also hold information about the environment when the measurement +// was recorded, for example the span and trace ID of the active span when the +// exemplar was recorded. +message Exemplar { + reserved 1; + + // The set of key/value pairs that were filtered out by the aggregator, but + // recorded alongside the original measurement. Only key/value pairs that were + // filtered out by the aggregator should be included + repeated opentelemetry.proto.common.v1.KeyValue filtered_attributes = 7; + + // time_unix_nano is the exact time when this exemplar was recorded + // + // Value is UNIX Epoch time in nanoseconds since 00:00:00 UTC on 1 January + // 1970. + fixed64 time_unix_nano = 2; + + // The value of the measurement that was recorded. An exemplar is + // considered invalid when one of the recognized value fields is not present + // inside this oneof. + oneof value { + double as_double = 3; + sfixed64 as_int = 6; + } + + // (Optional) Span ID of the exemplar trace. + // span_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. + bytes span_id = 4; + + // (Optional) Trace ID of the exemplar trace. + // trace_id may be missing if the measurement is not recorded inside a trace + // or if the trace is not sampled. 
+ bytes trace_id = 5; +} \ No newline at end of file diff --git a/kong/plugins/opentelemetry/handler.lua b/kong/plugins/opentelemetry/handler.lua index ba3c635425fd..2fe5bf9a97fd 100644 --- a/kong/plugins/opentelemetry/handler.lua +++ b/kong/plugins/opentelemetry/handler.lua @@ -1,11 +1,13 @@ local otel_traces = require "kong.plugins.opentelemetry.traces" local otel_logs = require "kong.plugins.opentelemetry.logs" +local otel_metrics = require "kong.plugins.opentelemetry.metrics" local dynamic_hook = require "kong.dynamic_hook" local o11y_logs = require "kong.observability.logs" local kong_meta = require "kong.meta" - +local exporter = require('kong.plugins.prometheus.exporter') +local prometheus local OpenTelemetryHandler = { VERSION = kong_meta.version, @@ -20,6 +22,10 @@ function OpenTelemetryHandler:configure(configs) dynamic_hook.hook("observability_logs", "push", o11y_logs.maybe_push) dynamic_hook.enable_by_default("observability_logs") end + + if config.metrics_endpoint then + prometheus = exporter.get_prometheus() + end end end end @@ -51,6 +57,15 @@ function OpenTelemetryHandler:log(conf) if conf.logs_endpoint then otel_logs.log(conf) end + + -- Metrics + if conf.metrics_endpoint then + local str = "" + prometheus:metric_data(function(d) + str = str .. 
d + end) + otel_metrics.log(conf, str) + end end diff --git a/kong/plugins/opentelemetry/metrics.lua b/kong/plugins/opentelemetry/metrics.lua new file mode 100644 index 000000000000..b9077a86161a --- /dev/null +++ b/kong/plugins/opentelemetry/metrics.lua @@ -0,0 +1,80 @@ +local otlp = require "kong.plugins.opentelemetry.otlp" +local Queue = require "kong.tools.queue" +local clone = require "table.clone" +local otel_utils = require "kong.plugins.opentelemetry.utils" + +local ngx = ngx +local ngx_log = ngx.log +local ngx_ERR = ngx.ERR +local ngx_DEBUG = ngx.DEBUG + +local _log_prefix = otel_utils._log_prefix +local encode_data = otlp.transform_metric +local encode_metrics = otlp.encode_metrics +local http_export_request = otel_utils.http_export_request +local get_headers = otel_utils.get_headers + + +local function http_export_metrics(conf, metrics) + local headers = get_headers(conf.headers) + local payload = encode_metrics(metrics, conf.resource_attributes) + + local ok, err = http_export_request({ + connect_timeout = conf.connect_timeout, + send_timeout = conf.send_timeout, + read_timeout = conf.read_timeout, + endpoint = conf.metrics_endpoint, + }, payload, headers) + + if ok then + ngx_log(ngx_DEBUG, _log_prefix, "exporter sent ", #metrics, + " metrics to ", conf.metrics_endpoint) + else + ngx_log(ngx_ERR, _log_prefix, err) + end + return ok, err +end + + +local function log(conf, metrics) + -- + local queue_conf = clone(Queue.get_plugin_params("opentelemetry", conf)) + queue_conf.name = queue_conf.name .. 
":metrics" + + local metrics_counter = 0 + local metric_seg_start = 0 + local string_div = 2 + local metric_seg_end, div = string.find(metrics, "# HELP", string_div, true) + string_div = div +1 + + while metric_seg_start ~= metric_seg_end do + metrics_counter = metrics_counter + 1 + local metric = string.sub(metrics, metric_seg_start, metric_seg_end-1) + + local ok, err = Queue.enqueue( + queue_conf, + http_export_metrics, + conf, + encode_data(metric) + ) + if not ok then + kong.log.err("Failed to enqueue span to log server: ", err) + end + + metric_seg_start = metric_seg_end + metric_seg_end, div = string.find(metrics, "# HELP", string_div, true) + + if not metric_seg_end then + metric_seg_end = #metrics + else + string_div = div+1 + end + + end + ngx_log(ngx_DEBUG, _log_prefix, "total metrics in current request: ", metrics_counter) +end + + +return { + log = log +} diff --git a/kong/plugins/opentelemetry/otlp.lua b/kong/plugins/opentelemetry/otlp.lua index ded49eb3ed2c..54ec9490f663 100644 --- a/kong/plugins/opentelemetry/otlp.lua +++ b/kong/plugins/opentelemetry/otlp.lua @@ -33,6 +33,13 @@ local TYPE_TO_ATTRIBUTE_TYPES = { boolean = "bool_value", } +local PROMETHEUS_TO_OTLP_METRIC_TYPE = { + ["gauge"] = "gauge", + ["counter"] = "sum", + ["histogram"] = "histogram", + ["unknown"] = "gauge" +} + local function transform_attributes(attr) if type(attr) ~= "table" then error("invalid attributes", 2) @@ -123,7 +130,89 @@ local function transform_span(span) return pb_span end -local encode_traces, encode_logs, prepare_logs + +-- Read prometheus metrics and transform into OTLP metric +-- https://opentelemetry.io/docs/specs/otel/compatibility/prometheus_and_openmetrics/ +local function transform_metric(metric) + assert(type(metric) == "string") + -- prometheus metric to otlp metric + local help, name = "" + local typ = "gauge" + local dataValue = {} + + local value_found, histo_sum, histo_count + local bucketCounts = {} + local explicitBounds = {} + for line in 
metric:gmatch("([^\n]*)\n?") do + if string.find(line, "# HELP") then + for word in line:gmatch("%S+") do + if string.find(word, "kong_") then + name = string.gsub(word, "kong_", "") + elseif word ~= "#" and word ~= "HELP" then + help = help .. " " .. word + end + end + elseif string.find(line, "# TYPE") then + for word in line:gmatch("%S+") do + typ = PROMETHEUS_TO_OTLP_METRIC_TYPE[word] + end + else + local start = string.find(line, " ") + local eof = #line + if start and eof then + value_found = string.sub(line, start+1, eof) + end + if typ ~= "histogram" then + dataValue = { + as_double = tonumber(value_found) + } + else + if string.find(line, "_count") then + histo_count = value_found + elseif string.find(line, "_sum") then + histo_sum = value_found + elseif string.find(line, "_bucket") then + table.insert(bucketCounts, value_found) + local bucket_col = line:match('le=%".*%"') + local value = bucket_col:match('%d+') + if value then table.insert(explicitBounds, value) end + end + end + end + end + + local isMonotonic, aggregationTemporality + if typ == "counter" then + typ = "sum" + isMonotonic = "true" + elseif typ == "unknown" then + typ = "gauge" + elseif typ == "histogram" then + aggregationTemporality = 1 + dataValue = { + sum = histo_sum, + count = histo_count, + bucketCounts = bucketCounts, + explicitBounds = explicitBounds + } + end + + local pb_metrics = { + name = name, + unit = "1", + description = help, + [typ] = { + aggregationTemporality = aggregationTemporality, + isMonotonic = isMonotonic, + data_points = {dataValue} + }, + } + + return pb_metrics +end + + +local encode_traces, encode_metrics, encode_logs, prepare_logs do local attributes_cache = setmetatable({}, { __mode = "k" }) local function default_resource_attributes() @@ -249,6 +338,42 @@ do return logs end + + local pb_memo_metrics = { + resource_metrics = { + { resource = { + attributes = {} + }, + scope_metrics = { + { scope = { + name = "kong-internal", + version = "0.1.0", + }, + 
metrics = {}, }, + }, }, + }, + } + + encode_metrics = function(metrics, resource_attributes) + local tab = tablepool_fetch(POOL_OTLP, 0, 4) + if not tab.resource_metrics then + tab.resource_metrics = deep_copy(pb_memo_metrics.resource_metrics) + end + + local resource = tab.resource_metrics[1].resource + resource.attributes = render_resource_attributes(resource_attributes) + + local scoped = tab.resource_metrics[1].scope_metrics[1] + + scoped.metrics = metrics + local pb_metrics = pb.encode("opentelemetry.proto.collector.metrics.v1.ExportMetricsServiceRequest", tab) + + -- remove reference + scoped.metrics = nil + tablepool_release(POOL_OTLP, tab, true) -- no clear + + return pb_metrics + end end return { @@ -257,4 +382,6 @@ return { encode_traces = encode_traces, encode_logs = encode_logs, prepare_logs = prepare_logs, -} + encode_metrics = encode_metrics, + transform_metric = transform_metric, +} \ No newline at end of file diff --git a/kong/plugins/opentelemetry/proto.lua b/kong/plugins/opentelemetry/proto.lua index bf63e9ebb035..78b6acc8cdba 100644 --- a/kong/plugins/opentelemetry/proto.lua +++ b/kong/plugins/opentelemetry/proto.lua @@ -2,6 +2,7 @@ local grpc = require "kong.tools.grpc" local proto_fpath = "opentelemetry/proto/collector/trace/v1/trace_service.proto" local proto_logs_fpath = "opentelemetry/proto/collector/logs/v1/logs_service.proto" +local proto_metrics_fpath = "opentelemetry/proto/collector/metrics/v1/metrics_service.proto" local function load_proto() local grpc_util = grpc.new() @@ -9,6 +10,7 @@ local function load_proto() protoc_instance:loadfile(proto_fpath) protoc_instance:loadfile(proto_logs_fpath) + protoc_instance:loadfile(proto_metrics_fpath) end load_proto() diff --git a/kong/plugins/opentelemetry/schema.lua b/kong/plugins/opentelemetry/schema.lua index 41a127859c6c..f0e4eecfcb08 100644 --- a/kong/plugins/opentelemetry/schema.lua +++ b/kong/plugins/opentelemetry/schema.lua @@ -37,6 +37,7 @@ return { fields = { { traces_endpoint = 
typedefs.url { referenceable = true } }, -- OTLP/HTTP { logs_endpoint = typedefs.url { referenceable = true } }, + { metrics_endpoint = typedefs.url { referenceable = true } }, { headers = { description = "The custom headers to be added in the HTTP request sent to the OTLP server. This setting is useful for adding the authentication headers (token) for the APM backend.", type = "map", keys = typedefs.header_name, values = { @@ -96,6 +97,7 @@ return { { at_least_one_of = { "traces_endpoint", "logs_endpoint", + "metrics_endpoint" } }, }, shorthand_fields = {