From 4e5f83814c4a0eed2a1ca9bab0693b9e32240c97 Mon Sep 17 00:00:00 2001 From: Jeff Wright <74204404+wrijeff@users.noreply.github.com> Date: Wed, 19 May 2021 15:41:16 +0000 Subject: [PATCH 1/2] Added trace-analytics schema documentation. Signed-off-by: Jeff Wright <74204404+wrijeff@users.noreply.github.com> --- .../otel-v1-apm-service-map-index-template.md | 125 ++++++++++++ .../otel-v1-apm-span-index-template.md | 182 ++++++++++++++++++ docs/schemas/trace-analytics/readme.md | 91 +++++++++ 3 files changed, 398 insertions(+) create mode 100644 docs/schemas/trace-analytics/otel-v1-apm-service-map-index-template.md create mode 100644 docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md create mode 100644 docs/schemas/trace-analytics/readme.md diff --git a/docs/schemas/trace-analytics/otel-v1-apm-service-map-index-template.md b/docs/schemas/trace-analytics/otel-v1-apm-service-map-index-template.md new file mode 100644 index 000000000..17812bf7b --- /dev/null +++ b/docs/schemas/trace-analytics/otel-v1-apm-service-map-index-template.md @@ -0,0 +1,125 @@ +# otel-v1-apm-service-map-index-template + +## Description +Documents in this index correspond to edges in a service map. Edges are created when a request crosses service boundaries. Documents will exclusively contain either a _destination_ or a _target_: +* Destination: corresponds to a client span calling another service. The _destination_ is the other service being called. +* Target: corresponds to a server span. The _target_ is the operation or API being called by the client. 
+ +```json +{ + "version": 0, + "mappings": { + "date_detection": false, + "dynamic_templates": [ + { + "strings_as_keyword": { + "mapping": { + "ignore_above": 1024, + "type": "keyword" + }, + "match_mapping_type": "string" + } + } + ], + "_source": { + "enabled": true + }, + "properties": { + "hashId": { + "ignore_above": 1024, + "type": "keyword" + }, + "serviceName": { + "ignore_above": 1024, + "type": "keyword" + }, + "kind": { + "ignore_above": 1024, + "type": "keyword" + }, + "destination": { + "properties": { + "domain": { + "ignore_above": 1024, + "type": "keyword" + }, + "resource": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "target": { + "properties": { + "domain": { + "ignore_above": 1024, + "type": "keyword" + }, + "resource": { + "ignore_above": 1024, + "type": "keyword" + } + } + }, + "traceGroupName": { + "ignore_above": 1024, + "type": "keyword" + } + } + } +} +``` + +## Fields +* hashId - A deterministic hash of this relationship. +* kind - The span kind, corresponding to the source of the relationship. See [OpenTelemetry - SpanKind](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#spankind). +* serviceName - The name of the service which emitted the span. Currently derived from the `opentelemetry.proto.resource.v1.Resource` associated with the span. +* destination.domain - The serviceName of the service being called by this client. +* destination.resource - The span name (API, operation, etc.) being called by this client. +* target.domain - The serviceName of the service being called by a client. +* target.resource - The span name (API, operation, etc.) being called by a client. +* traceGroupName - The top-level span name which started the request chain. + +## Example Documents +The two example documents below illustrate the "inventory" service calling the "database" service's `updateItem` API. 
+```json +{ + "_index": "otel-v1-apm-service-map", + "_type": "_doc", + "_id": "7/jRp2VF7544pBN6+mK2vw==", + "_score": 1, + "_source": { + "serviceName": "inventory", + "kind": "SPAN_KIND_CLIENT", + "destination": { + "resource": "updateItem", + "domain": "database" + }, + "target": null, + "traceGroupName": "client_checkout", + "hashId": "7/jRp2VF7544pBN6+mK2vw==" + } +} +``` + +```json +{ + "_index": "otel-v1-apm-service-map", + "_type": "_doc", + "_id": "lZcUyuhGYfnaQqt+r73njA==", + "_version": 3, + "_score": 0, + "_source": { + "serviceName": "database", + "kind": "SPAN_KIND_SERVER", + "destination": null, + "target": { + "resource": "updateItem", + "domain": "database" + }, + "traceGroupName": "client_checkout", + "hashId": "lZcUyuhGYfnaQqt+r73njA==" + } +} +``` + diff --git a/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md b/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md new file mode 100644 index 000000000..0e775927d --- /dev/null +++ b/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md @@ -0,0 +1,182 @@ +# otel-v1-apm-span-index-template + +## Description +Documents in this index correspond to spans following the [OpenTelemetry tracing specification](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md). Many fields are directly copied from the span, however some fields are derived and not present in the original span. 
+ +```json +{ + "version": 0, + "mappings": { + "date_detection": false, + "dynamic_templates": [ + { + "resource_attributes_map": { + "mapping": { + "type":"keyword" + }, + "path_match":"resource.attributes.*" + } + }, + { + "attributes_map": { + "mapping": { + "type":"keyword" + }, + "path_match":"attributes.*" + } + } + ], + "_source": { + "enabled": true + }, + "properties": { + "traceId": { + "ignore_above": 256, + "type": "keyword" + }, + "spanId": { + "ignore_above": 256, + "type": "keyword" + }, + "parentSpanId": { + "ignore_above": 256, + "type": "keyword" + }, + "name": { + "ignore_above": 1024, + "type": "keyword" + }, + "traceGroup": { + "ignore_above": 1024, + "type": "keyword" + }, + "traceGroupFields": { + "properties": { + "endTime": { + "type": "date_nanos" + }, + "durationInNanos": { + "type": "long" + }, + "statusCode": { + "type": "integer" + } + } + }, + "kind": { + "ignore_above": 128, + "type": "keyword" + }, + "startTime": { + "type": "date_nanos" + }, + "endTime": { + "type": "date_nanos" + }, + "status": { + "properties": { + "code": { + "type": "integer" + }, + "message": { + "type": "keyword" + } + } + }, + "serviceName": { + "type": "keyword" + }, + "durationInNanos": { + "type": "long" + }, + "events": { + "type": "nested", + "properties": { + "time": { + "type": "date_nanos" + } + } + }, + "links": { + "type": "nested" + } + } + } +} +``` + +## Fields +Many fields are either copied or derived from the [trace specification protobuff](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto) format. + +* traceId - A unique identifier for a trace. All spans from the same trace share the same traceId. +* spanId - A unique identifier for a span within a trace, assigned when the span is created. +* traceState - Conveys information about request position in multiple distributed tracing graphs. +* parentSpanId - The `spanId` of this span's parent span. 
If this is a root span, then this field must be empty. +* name - A description of the span's operation. +* kind - The type of span. See [OpenTelemetry - SpanKind](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#spankind). +* startTime - The start time of the span. +* endTime - The end time of the span. +* durationInNanos - Difference in nanoseconds between `startTime` and `endTime`. +* serviceName - Currently derived from the `opentelemetry.proto.resource.v1.Resource` associated with the span, the resource from which the span originates. +* events - A list of events. See [OpenTelemetry - Events](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#add-events). +* links - A list of linked spans. See [OpenTelemetry - Links](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#specifying-links). +* droppedAttributesCount - The number of attributes that were discarded. +* droppedEventsCount - The number of events that were discarded. +* droppedLinksCount - The number of links that were dropped. +* traceGroup - A derived field, the `name` of the trace's root span. +* traceGroupFields.endTime - A derived field, the `endTime` of the trace's root span. +* traceGroupFields.statusCode - A derived field, the `status.code` of the trace's root span. +* traceGroupFields.durationInNanos - A derived field, the `durationInNanos` of the trace's root span. +* span.attributes.* - All span attributes are split into a list of keywords. +* resource.attributes.* - All resource attributes are split into a list of keywords. +* status.code - The status of the span. See [OpenTelemetry - Status](https://github.com/open-telemetry/opentelemetry-specification/blob/main/specification/trace/api.md#set-status). 
+ + +## Example Documents + +```json +{ + "_index": "otel-v1-apm-span-000006", + "_type": "_doc", + "_id": "fe0e3811627189df", + "_score": 1, + "_source": { + "traceId": "0000000000000000856bfa5aeba5ec77", + "spanId": "fe0e3811627189df", + "traceState": "", + "parentSpanId": "856bfa5aeba5ec77", + "name": "/getcart", + "kind": "SPAN_KIND_UNSPECIFIED", + "startTime": "2021-05-18T18:58:44.695Z", + "endTime": "2021-05-18T18:58:44.760Z", + "durationInNanos": 65000000, + "serviceName": "cartservice", + "events": [], + "links": [], + "droppedAttributesCount": 0, + "droppedEventsCount": 0, + "droppedLinksCount": 0, + "traceGroup": "/cart", + "traceGroupFields.endTime": "2021-05-18T18:58:44.983Z", + "traceGroupFields.statusCode": 0, + "traceGroupFields.durationInNanos": 387000000, + "span.attributes.http@method": "GET", + "span.attributes.http@url": "http://cartservice/GetCart", + "span.attributes.instance": "cartservice-d847fdcf5-j6s2f", + "span.attributes.version": "v5", + "span.attributes.region": "us-east-1", + "resource.attributes.service@name": "cartservice", + "span.attributes.net@host@ip": "172.22.0.8", + "status.code": 0 + }, + "fields": { + "startTime": [ + "2021-05-18T18:58:44.695Z" + ], + "endTime": [ + "2021-05-18T18:58:44.760Z" + ] + } +} +``` + diff --git a/docs/schemas/trace-analytics/readme.md b/docs/schemas/trace-analytics/readme.md new file mode 100644 index 000000000..4d7ed2758 --- /dev/null +++ b/docs/schemas/trace-analytics/readme.md @@ -0,0 +1,91 @@ +# Trace Analytics Schema Versioning + +The purpose of this document is to outline the structure and versioning formats followed by Data Prepper (DP) and the Trace Analytics (TA) plugin for Kibana and OpenSearch Dashboards. Each individual schema shall have its own supporting document explaining its structure, fields, and purpose. + +## Tenets + +1. Schemas shall follow [semantic versioning](https://semver.org/), excluding patch version numbers. +2. 
Schema versions shall be detached from Data Prepper and Trace Analytics plugin versions. +3. Index and index template names shall only include the major version (e.g. "otel-v1-apm-span"). The minor version shall be included within the actual schema as a field. +4. Forward and backward-compatibility promises shall only apply to schemas of the same major version. +5. A major version increase shall require Data Prepper (writer) artifacts to be made available before Trace Analytics plugin (reader) updates are made available. +6. A major version increase shall result in a new index template and indexes being created in an Elasticsearch/OpenSearch cluster. + +## Versioning Format + +A schema will be versioned following the [Semantic Versioning 2.0.0 spec](https://semver.org/). A schema version will include a major and minor version number. Patch version numbers will not be used as "patching" is not applicable to a versioned schema; all changes no matter how trivial will have implications. + +1. **Major versions** will be incremented for breaking, backwards-incompatible changes. +2. **Minor versions** will be incremented for backwards-compatible feature additions. This can be thought of as an "append-only" change to the schema. + +**Schema versions are detached from Data Prepper and Trace Analytics plugin versions.** A TA plugin version increase does not necessarily affect the version of the schema or Data Prepper. Instead, both DP and TA versions will be *compatible* with a specific schema version. Examples include: + +* Trace Analytics plugin v1.5 includes features built on schema version 1.2.0 +* Data Prepper v1.1 emits documents following schema version 1.2.0 + +### Major version changes + +Major version changes include removing a field, renaming a field, or changing an existing field's datatype. 
+ +* Schema 1.0 to 2.0 *changes the type of a field* from Keyword to Numeric +* Schema 1.0 to 2.0 *removes* *field* "latency" from the schema +* Schema 1.0 to 2.0 *renames field* "end" to "endTime" + * A rename is effectively a field addition and removal in a single operation + +### Minor version changes + +Minor version changes include adding a new field or adding a new nested field. + +* Schema 1.2 to 1.3 adds a new field, "fieldC", as a Keyword +* Schema 2.11 to 2.12 adds a new nested field, "name" to an existing collection, "parentSpan", resulting in "parentSpan.name" + +## Compatibility Promises + +The following compatibility promises are made *only for schemas of the same major version*. + +* ***Backwards compatibility*** - features built on version 1.x of the schema **will not break, but may degrade** if data from a **prior** 1.x schema version is used. +* ***Forwards compatibility*** - features built on version 1.x of the schema **will not break** if data from a **later** 1.x schema version is used. + +### Read-compatibility examples + +1. A plugin built on schema 1.1 but consuming schema 1.2 data will function 100% without issue +2. A plugin built on schema 1.2 but consuming schema 1.1 data will continue to function, however some features might be degraded +3. A plugin built on schema 1.0 but consuming schema 2.0 data is not guaranteed to function +4. A plugin built on schema 2.0 but consuming schema 1.0 data is not guaranteed to function + +### Write-compatibility examples + +1. A writer built on schema 1.2 but writing to a cluster containing schema 1.1 data will succeed +2. A writer built on schema 1.1 but writing to a cluster containing schema 1.2 data will succeed +3. A writer built on schema 1.0 but writing to a cluster containing schema 2.0 data will succeed +4. 
A writer built on schema 2.0 but writing to a cluster containing schema 1.0 data will succeed + +### Handling minor version updates + +Minor version updates will occur as new fields are needed to support new Trace Analytics features. The steps to update a schema minor version are to: + +1. Ensure both TA and DP owners are aligned with requirements +2. Update the schema in the schema repo + 1. Add new fields to the schema JSON + 2. Increment the `version` field of the schema by 1 + 3. Update the documentation to describe the new fields +3. Add test data to the TA plugin test suite + 1. Don't update existing data in place, instead add new data following the new schema version. The test suite is expected to pass with a range of minor versions being tested at the same time. +4. Update DP to start emitting documents following the new schema version +5. Add new features to the TA plugin utilizing the new data + +### Handling Major version updates + +As schema owners, we will do our best to avoid introducing major version changes. However as our schemas are heavily tied to the OpenTelemetry spec, there is always the risk of an upstream backwards-incompatible change requiring a major version increase. + +Due to the potentially disjointed release schedules of both OpenSearch and the managed offering, we need to ensure that rolling out a major version change is carefully planned. + +A typical migration plan will first make Data Prepper artifacts available so that users can start ingesting their data to the new index. To prevent data loss during the migration period, users can be encouraged to simultaneously write to both the old and new indexes. This can be done by either running both old and new versions of Data Prepper side-by-side, or perhaps Data Prepper itself can be updated to write to dual indexes (TODO). Once users have the ability to write to the new index, Trace Analytics plugin updates will be made available which make use of the new major version index. 
+ +The steps to handle a schema major version update are to: + +1. Update Data Prepper to use the new schema. Increment the Data Prepper major version and make new artifacts available to users. + * This must *always be done first*. Plugin changes cannot go out before Data Prepper artifacts are made available. + * Encourage users to start using the new Data Prepper version ahead of the plugin release. These will allow the user to upgrade their Trace Analytics plugin and immediately have new major version data to work with. +2. Update the Trace Analytics plugin to read from the new indexes. Increment the plugin version and release to OpenSearch and/or the managed offering. + * Communicate to users the need to use the new version of Data Prepper after upgrading their TA plugin From 04dd7bd18977294800cf4b77d7f01914def75f23 Mon Sep 17 00:00:00 2001 From: Jeff Wright <74204404+wrijeff@users.noreply.github.com> Date: Thu, 20 May 2021 20:12:32 +0000 Subject: [PATCH 2/2] Revised per PR feedback. Signed-off-by: Jeff Wright <74204404+wrijeff@users.noreply.github.com> --- .../otel-v1-apm-span-index-template.md | 2 +- docs/schemas/trace-analytics/readme.md | 38 ++++++++++++------- 2 files changed, 26 insertions(+), 14 deletions(-) diff --git a/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md b/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md index 0e775927d..5dea6ebcf 100644 --- a/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md +++ b/docs/schemas/trace-analytics/otel-v1-apm-span-index-template.md @@ -106,7 +106,7 @@ Documents in this index correspond to spans following the [OpenTelemetry tracing ``` ## Fields -Many fields are either copied or derived from the [trace specification protobuff](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto) format. 
+Many fields are either copied or derived from the [trace specification protobuf](https://github.com/open-telemetry/opentelemetry-proto/blob/main/opentelemetry/proto/trace/v1/trace.proto) format. * traceId - A unique identifier for a trace. All spans from the same trace share the same traceId. * spanId - A unique identifier for a span within a trace, assigned when the span is created. diff --git a/docs/schemas/trace-analytics/readme.md b/docs/schemas/trace-analytics/readme.md index 4d7ed2758..1e028e0bd 100644 --- a/docs/schemas/trace-analytics/readme.md +++ b/docs/schemas/trace-analytics/readme.md @@ -4,7 +4,7 @@ The purpose of this document is to outline the structure and versioning formats ## Tenets -1. Schemas shall follow [semantic versioning](https://semver.org/), excluding patch version numbers. +1. Schemas shall follow the [Semantic Versioning 2.0.0 spec](https://semver.org/), excluding patch version numbers. 2. Schema versions shall be detached from Data Prepper and Trace Analytics plugin versions. 3. Index and index template names shall only include the major version (e.g. "otel-v1-apm-span"). The minor version shall be included within the actual schema as a field. 4. Forward and backward-compatibility promises shall only apply to schemas of the same major version. @@ -43,8 +43,8 @@ Minor version changes include adding a new field or adding a new nested field. The following compatibility promises are made *only for schemas of the same major version*. -* ***Backwards compatibility*** - features built on version 1.x of the schema **will not break, but may degrade** if data from a **prior** 1.x schema version is used. -* ***Forwards compatibility*** - features built on version 1.x of the schema **will not break** if data from a **later** 1.x schema version is used. +* ***Backwards compatibility*** - Trace Analytics UI features built on version 1.x of the schema **will not break, but may degrade** if data from a **prior** 1.x schema version is used. 
+* ***Forwards compatibility*** - Trace Analytics UI features built on version 1.x of the schema **will not break** if data from a **later** 1.x schema version is used. ### Read-compatibility examples @@ -74,18 +74,30 @@ Minor version updates will occur as new fields are needed to support new Trace A 4. Update DP to start emitting documents following the new schema version 5. Add new features to the TA plugin utilizing the new data -### Handling Major version updates +### Handling major version updates -As schema owners, we will do our best to avoid introducing major version changes. However as our schemas are heavily tied to the OpenTelemetry spec, there is always the risk of an upstream backwards-incompatible change requiring a major version increase. +As schema owners, we will do our best to avoid introducing major version changes. However, as our schemas are heavily tied to the OpenTelemetry spec, there is always the risk of an upstream backwards-incompatible change requiring a major version increase. -Due to the potentially disjointed release schedules of both OpenSearch and the managed offering, we need to ensure that rolling out a major version change is carefully planned. +The following procedures may be considered while performing a major upgrade across both Data Prepper and the Trace Analytics plugin. -A typical migration plan will first make Data Prepper artifacts available so that users can start ingesting their data to the new index. To prevent data loss during the migration period, users can be encouraged to simultaneously write to both the old and new indexes. This can be done by either running both old and new versions of Data Prepper side-by-side, or perhaps Data Prepper itself can be updated to write to dual indexes (TODO). Once users have the ability to write to the new index, Trace Analytics plugin updates will be made available which make use of the new major version index. 
+#### Approach #1: Upgrade both Data Prepper and the Trace Analytics plugin as simultaneously as possible +The simplest approach to perform a major version upgrade is to simultaneously upgrade both Data Prepper and the Trace Analytics plugin. This approach will result in a usability drop while both components are on differing major versions. -The steps to handle a schema major version update are to: +As an example, assume both DP and TA are being upgraded from v1 to v2: +* If DP is upgraded to v2 before TA, then the old TA v1 dashboard will not visualize the new data ingested by DP v2. +* If TA is upgraded to v2 before DP, then the new TA v2 dashboard will not visualize the old data ingested by DP v1. -1. Update Data Prepper to use the new schema. Increment the Data Prepper major version and make new artifacts available to users. - * This must *always be done first*. Plugin changes cannot go out before Data Prepper artifacts are made available. - * Encourage users to start using the new Data Prepper version ahead of the plugin release. These will allow the user to upgrade their Trace Analytics plugin and immediately have new major version data to work with. -2. Update the Trace Analytics plugin to read from the new indexes. Increment the plugin version and release to OpenSearch and/or the managed offering. - * Communicate to users the need to use the new version of Data Prepper after upgrading their TA plugin +Both component upgrades should happen as close as possible to each other to minimize this window. + +#### Approach #2: Run both old and new Data Prepper versions side-by-side until the Trace Analytics plugin is upgraded +To avoid usability downtime of the Trace Analytics plugin, run two versions of Data Prepper simultaneously before upgrading the plugin. 
+ +* DP v1 writes to the v1 index +* DP v2 writes to the v2 index +* TA v1 reads from the v1 index +* TA v2 reads from the v2 index + +Upgrading the Trace Analytics plugin from v1 to v2 will result in no usability downtime, as both plugin versions will have populated indexes to read from. This approach is more complex in that it requires additional Data Prepper instances and results in duplicate data being written (until the old DP instances are shut down). + +#### Mitigating data loss +Upgrading Data Prepper and the Trace Analytics plugin to a new major version schema will result in data written to old indexes being unusable. If users wish to avoid this data loss, the [reindexing APIs](https://opendistro.github.io/for-elasticsearch-docs/docs/elasticsearch/reindex-data/) must be used. Additionally, transforms might be required depending on the differences between the two major schema versions.