Skip to content

Commit

Permalink
Merge pull request #289 from iksaif/corentin.chary/reduce-lock-conten…
Browse files Browse the repository at this point in the history
…tion

buffered_metrics: reduce a bit lock contention
  • Loading branch information
vickenty authored Nov 15, 2023
2 parents e612112 + db7e98b commit 54ec306
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 18 deletions.
20 changes: 15 additions & 5 deletions statsd/buffered_metric_context.go
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ type bufferedMetricContexts struct {
nbContext uint64
mutex sync.RWMutex
values bufferedMetricMap
newMetric func(string, float64, string) *bufferedMetric
newMetric func(string, float64, string, float64) *bufferedMetric

// Each bufferedMetricContexts uses its own random source and random
// lock to prevent goroutines from contending for the lock on the
Expand All @@ -25,11 +25,11 @@ type bufferedMetricContexts struct {
randomLock sync.Mutex
}

func newBufferedContexts(newMetric func(string, float64, string, int64) *bufferedMetric, maxSamples int64) bufferedMetricContexts {
func newBufferedContexts(newMetric func(string, float64, string, int64, float64) *bufferedMetric, maxSamples int64) bufferedMetricContexts {
return bufferedMetricContexts{
values: bufferedMetricMap{},
newMetric: func(name string, value float64, stringTags string) *bufferedMetric {
return newMetric(name, value, stringTags, maxSamples)
newMetric: func(name string, value float64, stringTags string, rate float64) *bufferedMetric {
return newMetric(name, value, stringTags, maxSamples, rate)
},
// Note that calling "time.Now().UnixNano()" repeatedly quickly may return
// very similar values. That's fine for seeding the worker-specific random
Expand Down Expand Up @@ -57,6 +57,16 @@ func (bc *bufferedMetricContexts) flush(metrics []metric) []metric {
func (bc *bufferedMetricContexts) sample(name string, value float64, tags []string, rate float64) error {
keepingSample := shouldSample(rate, bc.random, &bc.randomLock)

// If we don't keep the sample, return early. If we do keep the sample
// we end up storing the *first* observed sampling rate in the metric.
// This is the *wrong* behavior but it's the one we had before and the alternative would increase lock contention too
// much with the current code.
// TODO: change this behavior in the future, probably by introducing thread-local storage and lockless structures.
// If this code is removed, also remove the observed sampling rate in the metric and fix `bufferedMetric.flushUnsafe()`
if !keepingSample {
return nil
}

context, stringTags := getContextAndTags(name, tags)
var v *bufferedMetric = nil

Expand All @@ -71,7 +81,7 @@ func (bc *bufferedMetricContexts) sample(name string, value float64, tags []stri
v, _ = bc.values[context]
if v == nil {
// We might keep a sample that we should have skipped, but that should not drastically affect performance.
bc.values[context] = bc.newMetric(name, value, stringTags)
bc.values[context] = bc.newMetric(name, value, stringTags, rate)
// We added a new value, we need to unlock the mutex and quit
bc.mutex.Unlock()
return nil
Expand Down
27 changes: 23 additions & 4 deletions statsd/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,10 @@ type bufferedMetric struct {

// maxSamples is the maximum number of samples we keep in memory
maxSamples int64

// The first observed user-specified sample rate. When specified
// it is used because we don't know better.
specifiedRate float64
}

func (s *bufferedMetric) sample(v float64) {
Expand Down Expand Up @@ -184,18 +188,30 @@ func (s *bufferedMetric) skipSample() {
}

func (s *bufferedMetric) flushUnsafe() metric {
totalSamples := atomic.LoadInt64(&s.totalSamples)
var rate float64

// If the user specified a sample rate, send it because we don't know better.
// This code should be removed once we can also remove the early return at the top of
// `bufferedMetricContexts.sample`
if s.specifiedRate != 1.0 {
rate = s.specifiedRate
} else {
rate = float64(s.storedSamples) / float64(totalSamples)
}

return metric{
metricType: s.mtype,
name: s.name,
stags: s.tags,
rate: float64(s.storedSamples) / float64(atomic.LoadInt64(&s.totalSamples)),
rate: rate,
fvalues: s.data[:s.storedSamples],
}
}

type histogramMetric = bufferedMetric

func newHistogramMetric(name string, value float64, stringTags string, maxSamples int64) *histogramMetric {
func newHistogramMetric(name string, value float64, stringTags string, maxSamples int64, rate float64) *histogramMetric {
return &histogramMetric{
data: newData(value, maxSamples),
totalSamples: 1,
Expand All @@ -204,12 +220,13 @@ func newHistogramMetric(name string, value float64, stringTags string, maxSample
tags: stringTags,
mtype: histogramAggregated,
maxSamples: maxSamples,
specifiedRate: rate,
}
}

type distributionMetric = bufferedMetric

func newDistributionMetric(name string, value float64, stringTags string, maxSamples int64) *distributionMetric {
func newDistributionMetric(name string, value float64, stringTags string, maxSamples int64, rate float64) *distributionMetric {
return &distributionMetric{
data: newData(value, maxSamples),
totalSamples: 1,
Expand All @@ -218,12 +235,13 @@ func newDistributionMetric(name string, value float64, stringTags string, maxSam
tags: stringTags,
mtype: distributionAggregated,
maxSamples: maxSamples,
specifiedRate: rate,
}
}

type timingMetric = bufferedMetric

func newTimingMetric(name string, value float64, stringTags string, maxSamples int64) *timingMetric {
func newTimingMetric(name string, value float64, stringTags string, maxSamples int64, rate float64) *timingMetric {
return &timingMetric{
data: newData(value, maxSamples),
totalSamples: 1,
Expand All @@ -232,6 +250,7 @@ func newTimingMetric(name string, value float64, stringTags string, maxSamples i
tags: stringTags,
mtype: timingAggregated,
maxSamples: maxSamples,
specifiedRate: rate,
}
}

Expand Down
18 changes: 9 additions & 9 deletions statsd/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -132,15 +132,15 @@ func TestFlushUnsafeSetMetricSample(t *testing.T) {
}

func TestNewHistogramMetric(t *testing.T) {
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0)
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0, 1.0)
assert.Equal(t, s.data, []float64{1.0})
assert.Equal(t, s.name, "test")
assert.Equal(t, s.tags, "tag1,tag2")
assert.Equal(t, s.mtype, histogramAggregated)
}

func TestHistogramMetricSample(t *testing.T) {
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0)
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0, 1.0)
s.sample(123.45)
assert.Equal(t, s.data, []float64{1.0, 123.45})
assert.Equal(t, s.name, "test")
Expand All @@ -149,7 +149,7 @@ func TestHistogramMetricSample(t *testing.T) {
}

func TestFlushUnsafeHistogramMetricSample(t *testing.T) {
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0)
s := newHistogramMetric("test", 1.0, "tag1,tag2", 0, 1.0)
m := s.flushUnsafe()

assert.Equal(t, m.metricType, histogramAggregated)
Expand All @@ -170,15 +170,15 @@ func TestFlushUnsafeHistogramMetricSample(t *testing.T) {
}

func TestNewDistributionMetric(t *testing.T) {
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0)
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0, 1.0)
assert.Equal(t, s.data, []float64{1.0})
assert.Equal(t, s.name, "test")
assert.Equal(t, s.tags, "tag1,tag2")
assert.Equal(t, s.mtype, distributionAggregated)
}

func TestDistributionMetricSample(t *testing.T) {
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0)
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0, 1.0)
s.sample(123.45)
assert.Equal(t, s.data, []float64{1.0, 123.45})
assert.Equal(t, s.name, "test")
Expand All @@ -187,7 +187,7 @@ func TestDistributionMetricSample(t *testing.T) {
}

func TestFlushUnsafeDistributionMetricSample(t *testing.T) {
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0)
s := newDistributionMetric("test", 1.0, "tag1,tag2", 0, 1.0)
m := s.flushUnsafe()

assert.Equal(t, m.metricType, distributionAggregated)
Expand All @@ -208,15 +208,15 @@ func TestFlushUnsafeDistributionMetricSample(t *testing.T) {
}

func TestNewTimingMetric(t *testing.T) {
s := newTimingMetric("test", 1.0, "tag1,tag2", 0)
s := newTimingMetric("test", 1.0, "tag1,tag2", 0, 1.0)
assert.Equal(t, s.data, []float64{1.0})
assert.Equal(t, s.name, "test")
assert.Equal(t, s.tags, "tag1,tag2")
assert.Equal(t, s.mtype, timingAggregated)
}

func TestTimingMetricSample(t *testing.T) {
s := newTimingMetric("test", 1.0, "tag1,tag2", 0)
s := newTimingMetric("test", 1.0, "tag1,tag2", 0, 1.0)
s.sample(123.45)
assert.Equal(t, s.data, []float64{1.0, 123.45})
assert.Equal(t, s.name, "test")
Expand All @@ -225,7 +225,7 @@ func TestTimingMetricSample(t *testing.T) {
}

func TestFlushUnsafeTimingMetricSample(t *testing.T) {
s := newTimingMetric("test", 1.0, "tag1,tag2", 0)
s := newTimingMetric("test", 1.0, "tag1,tag2", 0, 1.0)
m := s.flushUnsafe()

assert.Equal(t, m.metricType, timingAggregated)
Expand Down
2 changes: 2 additions & 0 deletions statsd/options.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,6 +312,8 @@ func WithoutClientSideAggregation() Option {

// WithExtendedClientSideAggregation enables client side aggregation for all types. This feature is only compatible with
// Agent's version >=6.25.0 && <7.0.0 or Agent's versions >=7.25.0.
// When enabled, the use of `rate` with distribution is discouraged and `WithMaxSamplesPerContext()` should be used.
// If `rate` is used with different values across calls, the resulting rate is not guaranteed to be correct.
func WithExtendedClientSideAggregation() Option {
return func(o *Options) error {
o.aggregation = true
Expand Down

0 comments on commit 54ec306

Please sign in to comment.