From 5cc9d33b7715f8b95c653f7f4d0daa4e4ac864d1 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Thu, 14 Nov 2024 14:16:01 -0800 Subject: [PATCH 1/3] sql: add sql.distsql.select.distributed.count metrics Add a new planFlag and counter to EngineMetrics. Note that the planFlag is set if *any* part of a query uses distributed execution, including pre- or post-queries. This seemed both simpler and more likely to be useful than only counting main queries that use distributed execution. Release note (ops change): Add two new metrics, `sql.distsql.select.distributed.count` and `sql.distsql.select.distributed.count.internal`. These metrics count the number of SELECT statements that actually execute with full or partial distribution. These metrics differ from `sql.distsql.select.count` and `sql.distsql.select.count.internal` in that the latter count the number of SELECT statements that are *planned* with full or partial distribution, but might not necessarily execute with full or partial distribution, depending on the location of data. --- docs/generated/metrics/metrics.html | 6 ++++-- .../opentelemetry/cockroachdb_metrics.go | 4 +++- pkg/sql/conn_executor.go | 9 +++++---- pkg/sql/distsql_running.go | 3 +++ pkg/sql/exec_util.go | 8 +++++++- pkg/sql/executor_statement_metrics.go | 8 +++++++- pkg/sql/plan.go | 15 ++++++++++----- 7 files changed, 39 insertions(+), 14 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 0328d9a2b05c..4259c3d096d3 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1651,8 +1651,10 @@ APPLICATIONsql.distsql.queries.activeNumber of SQL queries currently activeQueriesGAUGECOUNTAVGNONE APPLICATIONsql.distsql.queries.spilledNumber of queries that have spilled to diskQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.queries.totalNumber of SQL queries executedQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.distsql.select.countNumber of DistSQL SELECT statementsSQL StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.distsql.select.count.internalNumber of DistSQL SELECT statements (internal queries)SQL Internal StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.select.countNumber of SELECT statements planned to be distributedSQL StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.select.count.internalNumber of SELECT statements planned to be distributed (internal queries)SQL Internal StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.select.distributed.countNumber of SELECT statements that were distributedSQL StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.select.distributed.count.internalNumber of SELECT statements that were distributed (internal queries)SQL Internal StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.service.latencyLatency of DistSQL request executionLatencyHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.service.latency.internalLatency of DistSQL request execution (internal queries)SQL Internal StatementsHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.vec.openfdsCurrent number of open file descriptors used by vectorized external storageFilesGAUGECOUNTAVGNONE diff --git a/pkg/roachprod/opentelemetry/cockroachdb_metrics.go b/pkg/roachprod/opentelemetry/cockroachdb_metrics.go index bb73178d9fb4..3e66315d58f1 100644 --- a/pkg/roachprod/opentelemetry/cockroachdb_metrics.go +++ b/pkg/roachprod/opentelemetry/cockroachdb_metrics.go @@ -1587,7 +1587,9 @@ var cockroachdbMetrics = map[string]string{ "sql_distsql_queries_spilled": "sql.distsql.queries.spilled", "sql_distsql_queries_total": "sql.distsql.queries.total", "sql_distsql_select_count": "sql.distsql.select.count", - "sql_distsql_select_count_internal": "sql.distsql.select.internal", + "sql_distsql_select_count_internal": "sql.distsql.select.count.internal", + "sql_distsql_select_distributed_count": "sql.distsql.select.distributed.count", + "sql_distsql_select_distributed_count_internal": "sql.distsql.select.distributed.count.internal", "sql_distsql_service_latency": "sql.distsql.service.latency", "sql_distsql_service_latency_bucket": "sql.distsql.service.latency.bucket", "sql_distsql_service_latency_count": "sql.distsql.service.latency.count", diff --git a/pkg/sql/conn_executor.go b/pkg/sql/conn_executor.go index 191cfaa1e037..93987171751c 100644 --- a/pkg/sql/conn_executor.go +++ b/pkg/sql/conn_executor.go @@ -505,10 +505,11 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server { func makeMetrics(internal bool) Metrics { return Metrics{ EngineMetrics: EngineMetrics{ - DistSQLSelectCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelect, internal)), - SQLOptFallbackCount: metric.NewCounter(getMetricMeta(MetaSQLOptFallback, internal)), - SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)), - SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), + DistSQLSelectCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelect, internal)), + DistSQLSelectDistributedCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelectDistributed, internal)), + SQLOptFallbackCount: metric.NewCounter(getMetricMeta(MetaSQLOptFallback, internal)), + SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)), + SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)), // TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor. DistSQLExecLatency: metric.NewHistogram(metric.HistogramOptions{ Mode: metric.HistogramModePreferHdrLatency, diff --git a/pkg/sql/distsql_running.go b/pkg/sql/distsql_running.go index 1f36f29d3c92..732dfdf20451 100644 --- a/pkg/sql/distsql_running.go +++ b/pkg/sql/distsql_running.go @@ -858,6 +858,9 @@ func (dsp *DistSQLPlanner) Run( dsp.cancelFlowsCoordinator.addFlowsToCancel(flows) } }() + if planCtx.planner != nil { + planCtx.planner.curPlan.flags.Set(planFlagDistributedExecution) + } } // Currently, we get the statement only if there is a planner available in diff --git a/pkg/sql/exec_util.go b/pkg/sql/exec_util.go index ce7d26be8e1a..582612949cc6 100644 --- a/pkg/sql/exec_util.go +++ b/pkg/sql/exec_util.go @@ -780,7 +780,13 @@ var ( } MetaDistSQLSelect = metric.Metadata{ Name: "sql.distsql.select.count", - Help: "Number of DistSQL SELECT statements", + Help: "Number of SELECT statements planned to be distributed", + Measurement: "SQL Statements", + Unit: metric.Unit_COUNT, + } + MetaDistSQLSelectDistributed = metric.Metadata{ + Name: "sql.distsql.select.distributed.count", + Help: "Number of SELECT statements that were distributed", Measurement: "SQL Statements", Unit: metric.Unit_COUNT, } diff --git a/pkg/sql/executor_statement_metrics.go b/pkg/sql/executor_statement_metrics.go index d2d7d302ea81..ad782e4c63f7 100644 --- a/pkg/sql/executor_statement_metrics.go +++ b/pkg/sql/executor_statement_metrics.go @@ -20,8 +20,11 @@ import ( // EngineMetrics groups a set of SQL metrics. type EngineMetrics struct { - // The subset of SELECTs that are processed through DistSQL. + // The subset of SELECTs that are requested to be processed through DistSQL. DistSQLSelectCount *metric.Counter + // The subset of SELECTs that were executed by DistSQL with full or partial + // distribution. + DistSQLSelectDistributedCount *metric.Counter // The subset of queries which we attempted and failed to plan with the // cost-based optimizer. SQLOptFallbackCount *metric.Counter @@ -143,6 +146,9 @@ func (ex *connExecutor) recordStatementSummary( if flags.IsDistributed() { if _, ok := stmt.AST.(*tree.Select); ok { m.DistSQLSelectCount.Inc(1) + if flags.IsSet(planFlagDistributedExecution) { + m.DistSQLSelectDistributedCount.Inc(1) + } } if shouldIncludeInLatencyMetrics { m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds()) diff --git a/pkg/sql/plan.go b/pkg/sql/plan.go index 375f4284c8f9..928c6811f914 100644 --- a/pkg/sql/plan.go +++ b/pkg/sql/plan.go @@ -579,15 +579,16 @@ const ( // did not find one. planFlagOptCacheMiss - // planFlagFullyDistributed is set if the query execution is is fully - // distributed. + // planFlagFullyDistributed is set if the query is planned to use full + // distribution. planFlagFullyDistributed - // planFlagPartiallyDistributed is set if the query execution is is partially - // distributed (see physicalplan.PartiallyDistributedPlan). + // planFlagPartiallyDistributed is set if the query is planned to use partial + // distribution (see physicalplan.PartiallyDistributedPlan). planFlagPartiallyDistributed - // planFlagNotDistributed is set if the query execution is not distributed. + // planFlagNotDistributed is set if the query is planned to not use + // distribution. planFlagNotDistributed // planFlagImplicitTxn marks that the plan was run inside of an implicit @@ -645,6 +646,10 @@ const ( // planFlagOptimized is set if optimization was performed during the // current execution of the query. planFlagOptimized + + // planFlagDistributedExecution is set if execution of any part of the plan + // was distributed. + planFlagDistributedExecution ) // IsSet returns true if the receiver has all of the given flags set. From 416a28cee2cce71d732c5526d38d7493e3d6b4aa Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Thu, 14 Nov 2024 16:01:58 -0800 Subject: [PATCH 2/3] sql: add sql.distsql.distributed.count metric Release note (ops change): Add new metric `sql.distsql.distributed.count` which counts the number of invocations of the DistSQL engine with full or partial distribution. (This is in contrast to `sql.distsql.queries.total` which counts the total number of invocations of the DistSQL engine.) --- docs/generated/metrics/metrics.html | 1 + pkg/roachprod/opentelemetry/cockroachdb_metrics.go | 1 + pkg/sql/distsql_running.go | 2 +- pkg/sql/execinfra/metrics.go | 13 ++++++++++++- 4 files changed, 15 insertions(+), 2 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 4259c3d096d3..1bce24c71d8f 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1644,6 +1644,7 @@ APPLICATIONsql.distsql.cumulative_contention_nanosCumulative contention across all queries (in nanoseconds)NanosecondsCOUNTERNANOSECONDSAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.dist_query_rerun_locally.countTotal number of cases when distributed query error resulted in a local rerunQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.dist_query_rerun_locally.failure_countTotal number of cases when the local rerun of a distributed query resulted in an errorQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.distributed.countNumber of SQL queries executed with full or partial distributionQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.exec.latencyLatency of DistSQL statement executionLatencyHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.exec.latency.internalLatency of DistSQL statement execution (internal queries)SQL Internal StatementsHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.flows.activeNumber of distributed SQL flows currently activeFlowsGAUGECOUNTAVGNONE diff --git a/pkg/roachprod/opentelemetry/cockroachdb_metrics.go b/pkg/roachprod/opentelemetry/cockroachdb_metrics.go index 3e66315d58f1..f5c22edcd4b7 100644 --- a/pkg/roachprod/opentelemetry/cockroachdb_metrics.go +++ b/pkg/roachprod/opentelemetry/cockroachdb_metrics.go @@ -1571,6 +1571,7 @@ var cockroachdbMetrics = map[string]string{ "sql_disk_distsql_spilled_bytes_written": "sql.disk.distsql.spilled.bytes.written", "sql_distsql_contended_queries_count": "sql.distsql.contended.queries", "sql_distsql_cumulative_contention_nanos": "sql.distsql.cumulative_contention_nanos", + "sql_distsql_distributed_count": "sql.distsql.distributed.count", "sql_distsql_dist_query_rerun_locally_count": "sql.distsql.dist_query_rerun_locally.count", "sql_distsql_dist_query_rerun_locally_failure_count": "sql.distsql.dist_query_rerun_locally.failure_count", "sql_distsql_exec_latency": "sql.distsql.exec.latency", diff --git a/pkg/sql/distsql_running.go b/pkg/sql/distsql_running.go index 732dfdf20451..94ce760bd5f9 100644 --- a/pkg/sql/distsql_running.go +++ b/pkg/sql/distsql_running.go @@ -830,7 +830,7 @@ func (dsp *DistSQLPlanner) Run( log.VEvent(ctx, 2, "running DistSQL plan") - dsp.distSQLSrv.ServerConfig.Metrics.QueryStart() + dsp.distSQLSrv.ServerConfig.Metrics.QueryStart(len(flows) > 1 /* distributed */) defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop() recv.outputTypes = plan.GetResultTypes() diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index 50a62916fe82..9edf23e40235 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -16,6 +16,7 @@ import ( type DistSQLMetrics struct { QueriesActive *metric.Gauge QueriesTotal *metric.Counter + DistributedCount *metric.Counter ContendedQueriesCount *metric.Counter CumulativeContentionNanos *metric.Counter FlowsActive *metric.Gauge @@ -50,6 +51,12 @@ var ( Measurement: "Queries", Unit: metric.Unit_COUNT, } + metaDistributedCount = metric.Metadata{ + Name: "sql.distsql.distributed.count", + Help: "Number of SQL queries executed with full or partial distribution", + Measurement: "Queries", + Unit: metric.Unit_COUNT, + } metaContendedQueriesCount = metric.Metadata{ Name: "sql.distsql.contended_queries.count", Help: "Number of SQL queries that experienced contention", @@ -145,6 +152,7 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { return DistSQLMetrics{ QueriesActive: metric.NewGauge(metaQueriesActive), QueriesTotal: metric.NewCounter(metaQueriesTotal), + DistributedCount: metric.NewCounter(metaDistributedCount), ContendedQueriesCount: metric.NewCounter(metaContendedQueriesCount), CumulativeContentionNanos: metric.NewCounter(metaCumulativeContentionNanos), FlowsActive: metric.NewGauge(metaFlowsActive), @@ -174,9 +182,12 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { } // QueryStart registers the start of a new DistSQL query. -func (m *DistSQLMetrics) QueryStart() { +func (m *DistSQLMetrics) QueryStart(distributed bool) { m.QueriesActive.Inc(1) m.QueriesTotal.Inc(1) + if distributed { + m.DistributedCount.Inc(1) + } } // QueryStop registers the end of a DistSQL query. From 4f4dce18f11686fe03873ed8dc30aa0ff0e58258 Mon Sep 17 00:00:00 2001 From: Michael Erickson Date: Thu, 14 Nov 2024 16:20:23 -0800 Subject: [PATCH 3/3] sql: clarify scope of a few sql.distsql metrics Release note (ops change): Add some clarification that the following metrics count invocations of the DistSQL engine and not SQL queries (which could each result in multiple invocations of the DistSQL engine): - `sql.distsql.queries.active` - `sql.distsql.queries.total` - `sql.distsql.distributed.count` --- docs/generated/metrics/metrics.html | 6 +++--- pkg/sql/distsql_running.go | 4 ++-- pkg/sql/execinfra/metrics.go | 20 ++++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index 1bce24c71d8f..ed7fb69a953a 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -1644,14 +1644,14 @@ APPLICATIONsql.distsql.cumulative_contention_nanosCumulative contention across all queries (in nanoseconds)NanosecondsCOUNTERNANOSECONDSAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.dist_query_rerun_locally.countTotal number of cases when distributed query error resulted in a local rerunQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.dist_query_rerun_locally.failure_countTotal number of cases when the local rerun of a distributed query resulted in an errorQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.distsql.distributed.countNumber of SQL queries executed with full or partial distributionQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.distributed.countNumber of invocations of the DistSQL engine executed with full or partial distributionDistSQL runsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.exec.latencyLatency of DistSQL statement executionLatencyHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.exec.latency.internalLatency of DistSQL statement execution (internal queries)SQL Internal StatementsHISTOGRAMNANOSECONDSAVGNONE APPLICATIONsql.distsql.flows.activeNumber of distributed SQL flows currently activeFlowsGAUGECOUNTAVGNONE APPLICATIONsql.distsql.flows.totalNumber of distributed SQL flows executedFlowsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.distsql.queries.activeNumber of SQL queries currently activeQueriesGAUGECOUNTAVGNONE +APPLICATIONsql.distsql.queries.activeNumber of invocations of the DistSQL engine currently activeDistSQL runsGAUGECOUNTAVGNONE APPLICATIONsql.distsql.queries.spilledNumber of queries that have spilled to diskQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONsql.distsql.queries.totalNumber of SQL queries executedQueriesCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONsql.distsql.queries.totalNumber of invocations of the DistSQL engine executedDistSQL runsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.select.countNumber of SELECT statements planned to be distributedSQL StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.select.count.internalNumber of SELECT statements planned to be distributed (internal queries)SQL Internal StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONsql.distsql.select.distributed.countNumber of SELECT statements that were distributedSQL StatementsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/sql/distsql_running.go b/pkg/sql/distsql_running.go index 94ce760bd5f9..b2fbebd3b88b 100644 --- a/pkg/sql/distsql_running.go +++ b/pkg/sql/distsql_running.go @@ -830,8 +830,8 @@ func (dsp *DistSQLPlanner) Run( log.VEvent(ctx, 2, "running DistSQL plan") - dsp.distSQLSrv.ServerConfig.Metrics.QueryStart(len(flows) > 1 /* distributed */) - defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop() + dsp.distSQLSrv.ServerConfig.Metrics.RunStart(len(flows) > 1 /* distributed */) + defer dsp.distSQLSrv.ServerConfig.Metrics.RunStop() recv.outputTypes = plan.GetResultTypes() if execinfra.IncludeRUEstimateInExplainAnalyze.Get(&dsp.st.SV) && diff --git a/pkg/sql/execinfra/metrics.go b/pkg/sql/execinfra/metrics.go index 9edf23e40235..930c70706aa8 100644 --- a/pkg/sql/execinfra/metrics.go +++ b/pkg/sql/execinfra/metrics.go @@ -41,20 +41,20 @@ var _ metric.Struct = DistSQLMetrics{} var ( metaQueriesActive = metric.Metadata{ Name: "sql.distsql.queries.active", - Help: "Number of SQL queries currently active", - Measurement: "Queries", + Help: "Number of invocations of the DistSQL engine currently active", + Measurement: "DistSQL runs", Unit: metric.Unit_COUNT, } metaQueriesTotal = metric.Metadata{ Name: "sql.distsql.queries.total", - Help: "Number of SQL queries executed", - Measurement: "Queries", + Help: "Number of invocations of the DistSQL engine executed", + Measurement: "DistSQL runs", Unit: metric.Unit_COUNT, } metaDistributedCount = metric.Metadata{ Name: "sql.distsql.distributed.count", - Help: "Number of SQL queries executed with full or partial distribution", - Measurement: "Queries", + Help: "Number of invocations of the DistSQL engine executed with full or partial distribution", + Measurement: "DistSQL runs", Unit: metric.Unit_COUNT, } metaContendedQueriesCount = metric.Metadata{ @@ -181,8 +181,8 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics { } } -// QueryStart registers the start of a new DistSQL query. -func (m *DistSQLMetrics) QueryStart(distributed bool) { +// RunStart registers the start of an invocation of the DistSQL engine. +func (m *DistSQLMetrics) RunStart(distributed bool) { m.QueriesActive.Inc(1) m.QueriesTotal.Inc(1) if distributed { @@ -190,8 +190,8 @@ func (m *DistSQLMetrics) QueryStart(distributed bool) { } } -// QueryStop registers the end of a DistSQL query. -func (m *DistSQLMetrics) QueryStop() { +// RunStop registers the end of an invocation of the DistSQL engine. +func (m *DistSQLMetrics) RunStop() { m.QueriesActive.Dec(1) }