Traces downsampling policy (#191)

* Traces downsampling policy * fmt * Review remarks: better naming, test code
canonical · Oct 7, 2024 · 09e390c · 09e390c
1 parent 699e3d7
commit 09e390c
Show file tree

Hide file tree

Showing 4 changed files with 196 additions and 1 deletion.
diff --git a/charmcraft-22.04.yaml b/charmcraft-22.04.yaml
@@ -175,3 +175,27 @@ config:
         even if there is no integration currently requesting it.
       type: boolean
       default: false
+    tracing_sample_rate_charm:
+      description: >
+        This property defines the percentage of charm traces that are sent to the tracing backend.
+        Setting it to 100 means all charm traces are kept; setting it to 0 means charm traces
+        aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised
+        to this range by Grafana Agent.
+      type: float
+      default: 100.0
+    tracing_sample_rate_workload:
+      description: >
+        This property defines the percentage of workload traces that are sent to the tracing backend.
+        Setting it to 100 means all workload traces are kept; setting it to 0 means workload traces
+        aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised
+        to this range by Grafana Agent.
+      type: float
+      default: 1.0
+    tracing_sample_rate_error:
+      description: >
+        This property defines the percentage of error traces (regardless of their source) that are sent to the tracing backend.
+        Setting it to 100 means all error traces are kept; setting it to 0 means error traces
+        aren't sent to the tracing backend at all. Anything outside of the 0-100 range will be normalised
+        to this range by Grafana Agent.
+      type: float
+      default: 100.0
diff --git a/charmcraft-24.04.yaml b/charmcraft-24.04.yaml
@@ -140,3 +140,24 @@ config:
         even if there is no integration currently requesting it.
       type: boolean
       default: false
+    tracing_sample_rate_charm:
+      description: >
+        This property defines the percentage of charm traces that are sent to the tracing backend.
+        Setting it to 100 would mean all charm traces are kept, setting to 0 means charm traces
+        aren't sent to the tracing backend at all.
+      type: float
+      default: 100.0
+    tracing_sample_rate_workload:
+      description: >
+        This property defines the percentage of workload traces that are sent to the tracing backend.
+        Setting it to 100 would mean all workload traces are kept, setting to 0 means workload traces
+        aren't sent to the tracing backend at all.
+      type: float
+      default: 1.0
+    tracing_sample_rate_error:
+      description: >
+        This property defines the percentage of error traces (from all sources) that are sent to the tracing backend.
+        Setting it to 100 would mean all error traces are kept, setting to 0 means error traces
+        aren't sent to the tracing backend at all.
+      type: float
+      default: 100.0
diff --git a/src/grafana_agent.py b/src/grafana_agent.py
@@ -868,6 +868,94 @@ def _receiver_config(protocol: str):
 
         return config
 
+    @property
+    def _tracing_sampling(self) -> Dict[str, Any]:
+        # Tail-sampling policies, as defined by the tail sampling processor:
+        # https://github.com/open-telemetry/opentelemetry-collector-contrib/tree/main/processor/tailsamplingprocessor
+        # each "and" policy below pairs a trace matcher with a probabilistic sampler fed from charm config;
+        # policies are evaluated separately and the processor decides whether to pass the trace through (see link)
+        return {
+            "policies": [
+                {
+                    "name": "error-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "trace-status-policy",
+                                "type": "status_code",
+                                "status_code": {"status_codes": ["ERROR"]},
+                                # the status_code policy looks at the span status of the spans within a trace;
+                                # see https://opentelemetry.io/docs/concepts/signals/traces/#span-status for reference
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_error"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+                {
+                    "name": "charm-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "service-name-policy",
+                                "type": "string_attribute",
+                                "string_attribute": {
+                                    "key": "service.name",
+                                    "values": [".+-charm"],  # regex intended to match charm service names
+                                    "enabled_regex_matching": True,
+                                },
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_charm"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+                {
+                    "name": "workload-traces-policy",
+                    "type": "and",
+                    "and": {
+                        "and_sub_policy": [
+                            {
+                                "name": "service-name-policy",
+                                "type": "string_attribute",
+                                "string_attribute": {
+                                    "key": "service.name",
+                                    "values": [".+-charm"],
+                                    "enabled_regex_matching": True,
+                                    "invert_match": True,  # same regex as the charm policy, inverted
+                                },
+                            },
+                            {
+                                "name": "probabilistic-policy",
+                                "type": "probabilistic",
+                                "probabilistic": {
+                                    "sampling_percentage": self.config.get(
+                                        "tracing_sample_rate_workload"
+                                    )
+                                },
+                            },
+                        ]
+                    },
+                },
+            ]
+        }
+
     @property
     def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
         """The tracing section of the config.
@@ -877,6 +965,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
         """
         endpoints = self._tempo_endpoints_with_tls()
         receivers = self._tracing_receivers
+        sampling = self._tracing_sampling
 
         if not receivers:
             # pushing a config with an empty receivers section will cause gagent to error out
@@ -888,6 +977,7 @@ def _tempo_config(self) -> Dict[str, Union[Any, List[Any]]]:
                     "name": "tempo",
                     "remote_write": endpoints,
                     "receivers": receivers,
+                    "tail_sampling": sampling,
                 }
             ]
         }

diff --git a/tests/scenario/test_machine_charm/test_tracing_configuration.py b/tests/scenario/test_machine_charm/test_tracing_configuration.py
@@ -1,11 +1,18 @@
 from typing import get_args
+from unittest.mock import patch
 
 import pytest
+import yaml
 from charms.grafana_agent.v0.cos_agent import ReceiverProtocol
 from charms.tempo_k8s.v2.tracing import ReceiverProtocol as TracingReceiverProtocol
-from scenario import Context, State
+from scenario import Context, Relation, State, SubordinateRelation
 
 from charm import GrafanaAgentMachineCharm
+from lib.charms.grafana_agent.v0.cos_agent import (
+    CosAgentProviderUnitData,
+    Receiver,
+)
+from lib.charms.tempo_k8s.v2.tracing import TracingProviderAppData
 
 
 def test_cos_agent_receiver_protocols_match_with_tracing():
@@ -28,3 +35,56 @@ def test_always_enable_config_variables_are_generated_for_tracing_protocols(
     with context.manager("config-changed", state) as mgr:
         charm: GrafanaAgentMachineCharm = mgr.charm
         assert protocol in charm.requested_tracing_protocols
+
+
+@pytest.mark.parametrize(
+    "sampling_config",
+    (
+        {  # sampling rates left unset
+            "always_enable_otlp_http": True,
+        },
+        {  # sampling rates set explicitly
+            "always_enable_otlp_http": True,
+            "tracing_sample_rate_charm": 23.0,
+            "tracing_sample_rate_workload": 13.13,
+            "tracing_sample_rate_error": 42.42,
+        },
+    ),
+)
+def test_tracing_sampling_config_is_present(
+    vroot, placeholder_cfg_path, mock_config_path, sampling_config
+):
+    # GIVEN a cos-agent subordinate relation requesting tracing protocols and a tracing relation with one receiver
+    context = Context(
+        charm_type=GrafanaAgentMachineCharm,
+        charm_root=vroot,
+    )
+    tracing_provider = SubordinateRelation(
+        "cos-agent",
+        remote_unit_data=CosAgentProviderUnitData(
+            metrics_alert_rules={},
+            log_alert_rules={},
+            metrics_scrape_jobs=[],
+            log_slots=[],
+            dashboards=[],
+            subordinate=True,
+            tracing_protocols=["otlp_http", "otlp_grpc"],
+        ).dump(),
+    )
+    tracing = Relation(
+        "tracing",
+        remote_app_data=TracingProviderAppData(
+            receivers=[
+                Receiver(protocol={"name": "otlp_grpc", "type": "grpc"}, url="http:foo.com:1111")
+            ]
+        ).dump(),
+    )
+
+    state = State(leader=True, relations=[tracing, tracing_provider], config=sampling_config)
+    # WHEN config_changed is processed with the charm's is_ready patched to True
+    with patch("charm.GrafanaAgentMachineCharm.is_ready", True):
+        context.run("config_changed", state)
+    # THEN the YAML read from placeholder_cfg_path has a non-empty tail_sampling section in its first traces config
+    yml = yaml.safe_load(placeholder_cfg_path.read_text())
+
+    assert yml["traces"]["configs"][0]["tail_sampling"]