Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

sql: add metrics measuring distributed execution #135236

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 7 additions & 4 deletions docs/generated/metrics/metrics.html
Original file line number Diff line number Diff line change
Expand Up @@ -1644,15 +1644,18 @@
<tr><td>APPLICATION</td><td>sql.distsql.cumulative_contention_nanos</td><td>Cumulative contention across all queries (in nanoseconds)</td><td>Nanoseconds</td><td>COUNTER</td><td>NANOSECONDS</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.dist_query_rerun_locally.count</td><td>Total number of cases when distributed query error resulted in a local rerun</td><td>Queries</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.dist_query_rerun_locally.failure_count</td><td>Total number of cases when the local rerun of a distributed query resulted in an error</td><td>Queries</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.distributed.count</td><td>Number of invocations of the DistSQL engine executed with full or partial distribution</td><td>DistSQL runs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.exec.latency</td><td>Latency of DistSQL statement execution</td><td>Latency</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.exec.latency.internal</td><td>Latency of DistSQL statement execution (internal queries)</td><td>SQL Internal Statements</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.flows.active</td><td>Number of distributed SQL flows currently active</td><td>Flows</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.flows.total</td><td>Number of distributed SQL flows executed</td><td>Flows</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.queries.active</td><td>Number of SQL queries currently active</td><td>Queries</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.queries.active</td><td>Number of invocations of the DistSQL engine currently active</td><td>DistSQL runs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.queries.spilled</td><td>Number of queries that have spilled to disk</td><td>Queries</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.queries.total</td><td>Number of SQL queries executed</td><td>Queries</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.count</td><td>Number of DistSQL SELECT statements</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.count.internal</td><td>Number of DistSQL SELECT statements (internal queries)</td><td>SQL Internal Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.queries.total</td><td>Number of invocations of the DistSQL engine executed</td><td>DistSQL runs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.count</td><td>Number of SELECT statements planned to be distributed</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.count.internal</td><td>Number of SELECT statements planned to be distributed (internal queries)</td><td>SQL Internal Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.distributed.count</td><td>Number of SELECT statements that were distributed</td><td>SQL Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.select.distributed.count.internal</td><td>Number of SELECT statements that were distributed (internal queries)</td><td>SQL Internal Statements</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.service.latency</td><td>Latency of DistSQL request execution</td><td>Latency</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.service.latency.internal</td><td>Latency of DistSQL request execution (internal queries)</td><td>SQL Internal Statements</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>sql.distsql.vec.openfds</td><td>Current number of open file descriptors used by vectorized external storage</td><td>Files</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
Expand Down
5 changes: 4 additions & 1 deletion pkg/roachprod/opentelemetry/cockroachdb_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -1571,6 +1571,7 @@ var cockroachdbMetrics = map[string]string{
"sql_disk_distsql_spilled_bytes_written": "sql.disk.distsql.spilled.bytes.written",
"sql_distsql_contended_queries_count": "sql.distsql.contended.queries",
"sql_distsql_cumulative_contention_nanos": "sql.distsql.cumulative_contention_nanos",
"sql_distsql_distributed_count": "sql.distsql.distributed.count",
"sql_distsql_dist_query_rerun_locally_count": "sql.distsql.dist_query_rerun_locally.count",
"sql_distsql_dist_query_rerun_locally_failure_count": "sql.distsql.dist_query_rerun_locally.failure_count",
"sql_distsql_exec_latency": "sql.distsql.exec.latency",
Expand All @@ -1587,7 +1588,9 @@ var cockroachdbMetrics = map[string]string{
"sql_distsql_queries_spilled": "sql.distsql.queries.spilled",
"sql_distsql_queries_total": "sql.distsql.queries.total",
"sql_distsql_select_count": "sql.distsql.select.count",
"sql_distsql_select_count_internal": "sql.distsql.select.internal",
"sql_distsql_select_count_internal": "sql.distsql.select.count.internal",
"sql_distsql_select_distributed_count": "sql.distsql.select.distributed.count",
"sql_distsql_select_distributed_count_internal": "sql.distsql.select.distributed.count.internal",
"sql_distsql_service_latency": "sql.distsql.service.latency",
"sql_distsql_service_latency_bucket": "sql.distsql.service.latency.bucket",
"sql_distsql_service_latency_count": "sql.distsql.service.latency.count",
Expand Down
9 changes: 5 additions & 4 deletions pkg/sql/conn_executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -505,10 +505,11 @@ func NewServer(cfg *ExecutorConfig, pool *mon.BytesMonitor) *Server {
func makeMetrics(internal bool) Metrics {
return Metrics{
EngineMetrics: EngineMetrics{
DistSQLSelectCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelect, internal)),
SQLOptFallbackCount: metric.NewCounter(getMetricMeta(MetaSQLOptFallback, internal)),
SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)),
SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)),
DistSQLSelectCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelect, internal)),
DistSQLSelectDistributedCount: metric.NewCounter(getMetricMeta(MetaDistSQLSelectDistributed, internal)),
SQLOptFallbackCount: metric.NewCounter(getMetricMeta(MetaSQLOptFallback, internal)),
SQLOptPlanCacheHits: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheHits, internal)),
SQLOptPlanCacheMisses: metric.NewCounter(getMetricMeta(MetaSQLOptPlanCacheMisses, internal)),
// TODO(mrtracy): See HistogramWindowInterval in server/config.go for the 6x factor.
DistSQLExecLatency: metric.NewHistogram(metric.HistogramOptions{
Mode: metric.HistogramModePreferHdrLatency,
Expand Down
7 changes: 5 additions & 2 deletions pkg/sql/distsql_running.go
Original file line number Diff line number Diff line change
Expand Up @@ -830,8 +830,8 @@ func (dsp *DistSQLPlanner) Run(

log.VEvent(ctx, 2, "running DistSQL plan")

dsp.distSQLSrv.ServerConfig.Metrics.QueryStart()
defer dsp.distSQLSrv.ServerConfig.Metrics.QueryStop()
dsp.distSQLSrv.ServerConfig.Metrics.RunStart(len(flows) > 1 /* distributed */)
defer dsp.distSQLSrv.ServerConfig.Metrics.RunStop()

recv.outputTypes = plan.GetResultTypes()
if execinfra.IncludeRUEstimateInExplainAnalyze.Get(&dsp.st.SV) &&
Expand All @@ -858,6 +858,9 @@ func (dsp *DistSQLPlanner) Run(
dsp.cancelFlowsCoordinator.addFlowsToCancel(flows)
}
}()
if planCtx.planner != nil {
planCtx.planner.curPlan.flags.Set(planFlagDistributedExecution)
}
}

// Currently, we get the statement only if there is a planner available in
Expand Down
8 changes: 7 additions & 1 deletion pkg/sql/exec_util.go
Original file line number Diff line number Diff line change
Expand Up @@ -780,7 +780,13 @@ var (
}
MetaDistSQLSelect = metric.Metadata{
Name: "sql.distsql.select.count",
Help: "Number of DistSQL SELECT statements",
Help: "Number of SELECT statements planned to be distributed",
Measurement: "SQL Statements",
Unit: metric.Unit_COUNT,
}
MetaDistSQLSelectDistributed = metric.Metadata{
Name: "sql.distsql.select.distributed.count",
Help: "Number of SELECT statements that were distributed",
Measurement: "SQL Statements",
Unit: metric.Unit_COUNT,
}
Expand Down
27 changes: 19 additions & 8 deletions pkg/sql/execinfra/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ import (
type DistSQLMetrics struct {
QueriesActive *metric.Gauge
QueriesTotal *metric.Counter
DistributedCount *metric.Counter
ContendedQueriesCount *metric.Counter
CumulativeContentionNanos *metric.Counter
FlowsActive *metric.Gauge
Expand All @@ -40,14 +41,20 @@ var _ metric.Struct = DistSQLMetrics{}
var (
metaQueriesActive = metric.Metadata{
Name: "sql.distsql.queries.active",
Help: "Number of SQL queries currently active",
Measurement: "Queries",
Help: "Number of invocations of the DistSQL engine currently active",
Measurement: "DistSQL runs",
Unit: metric.Unit_COUNT,
}
metaQueriesTotal = metric.Metadata{
Name: "sql.distsql.queries.total",
Help: "Number of SQL queries executed",
Measurement: "Queries",
Help: "Number of invocations of the DistSQL engine executed",
Measurement: "DistSQL runs",
Unit: metric.Unit_COUNT,
}
metaDistributedCount = metric.Metadata{
Name: "sql.distsql.distributed.count",
Help: "Number of invocations of the DistSQL engine executed with full or partial distribution",
Measurement: "DistSQL runs",
Unit: metric.Unit_COUNT,
}
metaContendedQueriesCount = metric.Metadata{
Expand Down Expand Up @@ -145,6 +152,7 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics {
return DistSQLMetrics{
QueriesActive: metric.NewGauge(metaQueriesActive),
QueriesTotal: metric.NewCounter(metaQueriesTotal),
DistributedCount: metric.NewCounter(metaDistributedCount),
ContendedQueriesCount: metric.NewCounter(metaContendedQueriesCount),
CumulativeContentionNanos: metric.NewCounter(metaCumulativeContentionNanos),
FlowsActive: metric.NewGauge(metaFlowsActive),
Expand Down Expand Up @@ -173,14 +181,17 @@ func MakeDistSQLMetrics(histogramWindow time.Duration) DistSQLMetrics {
}
}

// QueryStart registers the start of a new DistSQL query.
func (m *DistSQLMetrics) QueryStart() {
// RunStart registers the start of an invocation of the DistSQL engine.
func (m *DistSQLMetrics) RunStart(distributed bool) {
m.QueriesActive.Inc(1)
m.QueriesTotal.Inc(1)
if distributed {
m.DistributedCount.Inc(1)
}
}

// QueryStop registers the end of a DistSQL query.
func (m *DistSQLMetrics) QueryStop() {
// RunStop registers the end of an invocation of the DistSQL engine.
func (m *DistSQLMetrics) RunStop() {
m.QueriesActive.Dec(1)
}

Expand Down
8 changes: 7 additions & 1 deletion pkg/sql/executor_statement_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,11 @@ import (

// EngineMetrics groups a set of SQL metrics.
type EngineMetrics struct {
// The subset of SELECTs that are processed through DistSQL.
// The subset of SELECTs that are requested to be processed through DistSQL.
DistSQLSelectCount *metric.Counter
// The subset of SELECTs that were executed by DistSQL with full or partial
// distribution.
DistSQLSelectDistributedCount *metric.Counter
// The subset of queries which we attempted and failed to plan with the
// cost-based optimizer.
SQLOptFallbackCount *metric.Counter
Expand Down Expand Up @@ -143,6 +146,9 @@ func (ex *connExecutor) recordStatementSummary(
if flags.IsDistributed() {
if _, ok := stmt.AST.(*tree.Select); ok {
m.DistSQLSelectCount.Inc(1)
if flags.IsSet(planFlagDistributedExecution) {
m.DistSQLSelectDistributedCount.Inc(1)
}
}
if shouldIncludeInLatencyMetrics {
m.DistSQLExecLatency.RecordValue(runLatRaw.Nanoseconds())
Expand Down
15 changes: 10 additions & 5 deletions pkg/sql/plan.go
Original file line number Diff line number Diff line change
Expand Up @@ -579,15 +579,16 @@ const (
// did not find one.
planFlagOptCacheMiss

// planFlagFullyDistributed is set if the query execution is is fully
// distributed.
// planFlagFullyDistributed is set if the query is planned to use full
// distribution.
planFlagFullyDistributed

// planFlagPartiallyDistributed is set if the query execution is is partially
// distributed (see physicalplan.PartiallyDistributedPlan).
// planFlagPartiallyDistributed is set if the query is planned to use partial
// distribution (see physicalplan.PartiallyDistributedPlan).
planFlagPartiallyDistributed

// planFlagNotDistributed is set if the query execution is not distributed.
// planFlagNotDistributed is set if the query is planned to not use
// distribution.
planFlagNotDistributed

// planFlagImplicitTxn marks that the plan was run inside of an implicit
Expand Down Expand Up @@ -645,6 +646,10 @@ const (
// planFlagOptimized is set if optimization was performed during the
// current execution of the query.
planFlagOptimized

// planFlagDistributedExecution is set if execution of any part of the plan
// was distributed.
planFlagDistributedExecution
)

// IsSet returns true if the receiver has all of the given flags set.
Expand Down