diff --git a/service/matching/tasklist/task_list_manager.go b/service/matching/tasklist/task_list_manager.go index ba968a1d563..854adc50e4e 100644 --- a/service/matching/tasklist/task_list_manager.go +++ b/service/matching/tasklist/task_list_manager.go @@ -35,6 +35,7 @@ import ( "golang.org/x/exp/maps" "golang.org/x/sync/errgroup" + "math/rand/v2" "github.com/uber/cadence/client/history" "github.com/uber/cadence/client/matching" @@ -63,6 +64,8 @@ const ( returnEmptyTaskTimeBudget = time.Second noIsolationTimeout = time.Duration(0) minimumIsolationDuration = time.Millisecond * 50 + // sampling rate for emitting misconfigured partition metrics + misconfiguredPartitionEmitRate = 0.05 ) var ( @@ -643,7 +646,9 @@ func (c *taskListManagerImpl) GetTask( } func (c *taskListManagerImpl) getTask(ctx context.Context, maxDispatchPerSecond *float64) (*InternalTask, error) { - c.emitMisconfiguredPartitionMetrics() + if rand.Float32() < misconfiguredPartitionEmitRate { + c.emitMisconfiguredPartitionMetrics() + } // We need to set a shorter timeout than the original ctx; otherwise, by the time ctx deadline is // reached, instead of emptyTask, context timeout error is returned to the frontend by the rpc stack, // which counts against our SLO. By shortening the timeout by a very small amount, the emptyTask can be