From a27819a2cd24b63ac42f31840b96289400dccf08 Mon Sep 17 00:00:00 2001 From: shralex Date: Wed, 27 Dec 2023 09:26:18 -0800 Subject: [PATCH] kv: log slow requests on replica level in addition to range level Previously, slow requests were only logged at the range level, but the logs did not indicate which replica is slow. Moreover, the SlowRPC metric attempted to represent the number of requests currently being retried, however it was done on the range level and therefore missed a second level of replica-level retries being done underneath. This PR adds logging on the replica level, removes a confusing log line, and changes the metric to count the number of slow requests in a simpler manner. Epic: https://cockroachlabs.atlassian.net/browse/CRDB-33510 Fixes: https://github.com/cockroachdb/cockroach/issues/114431 --- docs/generated/metrics/metrics.html | 3 +- pkg/kv/kvclient/kvcoord/dist_sender.go | 60 +++++++++++++++++++-- pkg/kv/kvclient/kvcoord/dist_sender_test.go | 14 ++++- 3 files changed, 71 insertions(+), 6 deletions(-) diff --git a/docs/generated/metrics/metrics.html b/docs/generated/metrics/metrics.html index e986a54fd8c8..68a9c27d3e3b 100644 --- a/docs/generated/metrics/metrics.html +++ b/docs/generated/metrics/metrics.html @@ -924,6 +924,7 @@ APPLICATIONdistsender.rpc.transferlease.sentNumber of TransferLease requests processed.

This counts the requests in batches handed to DistSender, not the RPCs
sent to individual Ranges as a result.RPCsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONdistsender.rpc.truncatelog.sentNumber of TruncateLog requests processed.

This counts the requests in batches handed to DistSender, not the RPCs
sent to individual Ranges as a result.RPCsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONdistsender.rpc.writebatch.sentNumber of WriteBatch requests processed.

This counts the requests in batches handed to DistSender, not the RPCs
sent to individual Ranges as a result.RPCsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE +APPLICATIONdistsender.slow.replicarpcsNumber of slow replica-bound RPCs.

Note that this is not a good signal for KV health. The remote side of the
RPCs tracked here may experience contention, so an end user can easily
cause values for this metric to be emitted by leaving a transaction open
for a long time and contending with it using a second transaction.RequestsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONjobs.adopt_iterationsnumber of job-adopt iterations performed by the registryiterationsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONjobs.auto_config_env_runner.currently_idleNumber of auto_config_env_runner jobs currently considered Idle and can be freely shut downjobsGAUGECOUNTAVGNONE APPLICATIONjobs.auto_config_env_runner.currently_pausedNumber of auto_config_env_runner jobs currently considered PausedjobsGAUGECOUNTAVGNONE @@ -1244,7 +1245,7 @@ APPLICATIONphysical_replication.resolved_events_ingestedResolved events ingested by all replication jobsEventsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE APPLICATIONphysical_replication.runningNumber of currently running replication streamsReplication StreamsGAUGECOUNTAVGNONE APPLICATIONphysical_replication.sst_bytesSST bytes (compressed) sent to KV by all replication jobsBytesCOUNTERBYTESAVGNON_NEGATIVE_DERIVATIVE -APPLICATIONrequests.slow.distsenderNumber of replica-bound RPCs currently stuck or retrying for a long time.

Note that this is not a good signal for KV health. The remote side of the
RPCs tracked here may experience contention, so an end user can easily
cause values for this metric to be emitted by leaving a transaction open
for a long time and contending with it using a second transaction.RequestsGAUGECOUNTAVGNONE +APPLICATIONrequests.slow.distsenderNumber of range-bound RPCs currently stuck or retrying for a long time.

Note that this is not a good signal for KV health. The remote side of the
RPCs tracked here may experience contention, so an end user can easily
cause values for this metric to be emitted by leaving a transaction open
for a long time and contending with it using a second transaction.RequestsGAUGECOUNTAVGNONE APPLICATIONround-trip-latencyDistribution of round-trip latencies with other nodes.

This only reflects successful heartbeats and measures gRPC overhead as well as
possible head-of-line blocking. Elevated values in this metric may hint at
network issues and/or saturation, but they are no proof of them. CPU overload
can similarly elevate this metric. The operator should look towards OS-level
metrics such as packet loss, retransmits, etc, to conclusively diagnose network
issues. Heartbeats are not very frequent (~seconds), so they may not capture
rare or short-lived degradations.
Round-trip timeHISTOGRAMNANOSECONDSAVGNONE APPLICATIONrpc.connection.avg_round_trip_latencySum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.

Dividing this Gauge by rpc.connection.healthy gives an approximation of average
latency, but the top-level round-trip-latency histogram is more useful. Instead,
users should consult the label families of this metric if they are available
(which requires prometheus and the cluster setting 'server.child_metrics.enabled');
these provide per-peer moving averages.

This metric does not track failed connection. A failed connection's contribution
is reset to zero.
LatencyGAUGENANOSECONDSAVGNONE APPLICATIONrpc.connection.failuresCounter of failed connections.

This includes both the event in which a healthy connection terminates as well as
unsuccessful reconnection attempts.

Connections that are terminated as part of local node shutdown are excluded.
Decommissioned peers are excluded.
ConnectionsCOUNTERCOUNTAVGNON_NEGATIVE_DERIVATIVE diff --git a/pkg/kv/kvclient/kvcoord/dist_sender.go b/pkg/kv/kvclient/kvcoord/dist_sender.go index 3473fbdd2fea..3c804b3bf9f5 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender.go @@ -34,6 +34,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/settings" "github.com/cockroachdb/cockroach/pkg/settings/cluster" "github.com/cockroachdb/cockroach/pkg/util" + "github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb" "github.com/cockroachdb/cockroach/pkg/util/grpcutil" "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" @@ -159,7 +160,18 @@ var ( } metaDistSenderSlowRPCs = metric.Metadata{ Name: "requests.slow.distsender", - Help: `Number of replica-bound RPCs currently stuck or retrying for a long time. + Help: `Number of range-bound RPCs currently stuck or retrying for a long time. + +Note that this is not a good signal for KV health. The remote side of the +RPCs tracked here may experience contention, so an end user can easily +cause values for this metric to be emitted by leaving a transaction open +for a long time and contending with it using a second transaction.`, + Measurement: "Requests", + Unit: metric.Unit_COUNT, + } + metaDistSenderSlowReplicaRPCs = metric.Metadata{ + Name: "distsender.slow.replicarpcs", + Help: `Number of slow replica-bound RPCs. Note that this is not a good signal for KV health. The remote side of the RPCs tracked here may experience contention, so an end user can easily @@ -310,6 +322,7 @@ type DistSenderMetrics struct { InLeaseTransferBackoffs *metric.Counter RangeLookups *metric.Counter SlowRPCs *metric.Gauge + SlowReplicaRPCs *metric.Counter MethodCounts [kvpb.NumMethods]*metric.Counter ErrCounts [kvpb.NumErrors]*metric.Counter DistSenderRangeFeedMetrics @@ -342,6 +355,7 @@ func makeDistSenderMetrics() DistSenderMetrics { InLeaseTransferBackoffs: metric.NewCounter(metaDistSenderInLeaseTransferBackoffsCount), RangeLookups: metric.NewCounter(metaDistSenderRangeLookups), SlowRPCs: metric.NewGauge(metaDistSenderSlowRPCs), + SlowReplicaRPCs: metric.NewCounter(metaDistSenderSlowReplicaRPCs), DistSenderRangeFeedMetrics: makeDistSenderRangeFeedMetrics(), } for i := range m.MethodCounts { @@ -1814,6 +1828,22 @@ func slowRangeRPCReturnWarningStr(s *redact.StringBuilder, dur time.Duration, at s.Printf("slow RPC finished after %.2fs (%d attempts)", dur.Seconds(), attempts) } +func slowReplicaRPCWarningStr( + s *redact.StringBuilder, + ba *kvpb.BatchRequest, + dur time.Duration, + attempts int64, + err error, + br *kvpb.BatchResponse, +) { + resp := interface{}(err) + if resp == nil { + resp = br + } + s.Printf("have been waiting %.2fs (%d attempts) for RPC %s to replica %s; resp: %s", + dur.Seconds(), attempts, ba, ba.Replica, resp) +} + // sendPartialBatch sends the supplied batch to the range specified by the // routing token. // @@ -1907,8 +1937,7 @@ func (ds *DistSender) sendPartialBatch( prevTok = routingTok reply, err = ds.sendToReplicas(ctx, ba, routingTok, withCommit) - const slowDistSenderThreshold = time.Minute - if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() { + if dur := timeutil.Since(tBegin); dur > slowDistSenderRangeThreshold && !tBegin.IsZero() { { var s redact.StringBuilder slowRangeRPCWarningStr(&s, ba, dur, attempts, routingTok.Desc(), err, reply) @@ -2189,6 +2218,15 @@ func noMoreReplicasErr(ambiguousErr, lastAttemptErr error) error { return newSendError(errors.Wrap(lastAttemptErr, "sending to all replicas failed; last error")) } +// slowDistSenderRangeThreshold is a latency threshold for logging slow +// requests to a range, potentially involving RPCs to multiple replicas +// of the range. +const slowDistSenderRangeThreshold = time.Minute + +// slowDistSenderReplicaThreshold is a latency threshold for logging a slow RPC +// to a single replica. +const slowDistSenderReplicaThreshold = 10 * time.Second + // defaultSendClosedTimestampPolicy is used when the closed timestamp policy // is not known by the range cache. This choice prevents sending batch requests // to only voters when a perfectly good non-voter may exist in the local @@ -2325,7 +2363,8 @@ func (ds *DistSender) sendToReplicas( // per-replica state and may succeed on other replicas. var ambiguousError error var br *kvpb.BatchResponse - for first := true; ; first = false { + attempts := int64(0) + for first := true; ; first, attempts = false, attempts+1 { if !first { ds.metrics.NextReplicaErrCount.Inc(1) } @@ -2411,7 +2450,20 @@ func (ds *DistSender) sendToReplicas( comparisonResult := ds.getLocalityComparison(ctx, ds.nodeIDGetter(), ba.Replica.NodeID) ds.metrics.updateCrossLocalityMetricsOnReplicaAddressedBatchRequest(comparisonResult, int64(ba.Size())) + tBegin := timeutil.Now() // for slow log message br, err = transport.SendNext(ctx, ba) + if dur := timeutil.Since(tBegin); dur > slowDistSenderReplicaThreshold { + var s redact.StringBuilder + slowReplicaRPCWarningStr(&s, ba, dur, attempts, err, br) + if admissionpb.WorkPriority(ba.AdmissionHeader.Priority) >= admissionpb.NormalPri { + // Note that these RPC may or may not have succeeded. Errors are counted separately below. + ds.metrics.SlowReplicaRPCs.Inc(1) + log.Warningf(ctx, "slow replica RPC: %v", &s) + } else { + log.Eventf(ctx, "slow replica RPC: %v", &s) + } + } + ds.metrics.updateCrossLocalityMetricsOnReplicaAddressedBatchResponse(comparisonResult, int64(br.Size())) ds.maybeIncrementErrCounters(br, err) diff --git a/pkg/kv/kvclient/kvcoord/dist_sender_test.go b/pkg/kv/kvclient/kvcoord/dist_sender_test.go index 68b5fa67f476..0a61afbee02a 100644 --- a/pkg/kv/kvclient/kvcoord/dist_sender_test.go +++ b/pkg/kv/kvclient/kvcoord/dist_sender_test.go @@ -4546,6 +4546,11 @@ func TestDistSenderSlowLogMessage(t *testing.T) { get.KeyLockingStrength = lock.Shared get.KeyLockingDurability = lock.Unreplicated ba.Add(get) + ba.Replica = roachpb.ReplicaDescriptor{ + ReplicaID: 1, + NodeID: 2, + StoreID: 3, + } br := &kvpb.BatchResponse{} br.Error = kvpb.NewError(errors.New("boom")) desc := &roachpb.RangeDescriptor{RangeID: 9, StartKey: roachpb.RKey("x"), EndKey: roachpb.RKey("z")} @@ -4557,7 +4562,6 @@ func TestDistSenderSlowLogMessage(t *testing.T) { act := s.RedactableString() require.EqualValues(t, exp, act) } - { exp := `slow RPC finished after 8.16s (120 attempts)` var s redact.StringBuilder @@ -4565,6 +4569,14 @@ func TestDistSenderSlowLogMessage(t *testing.T) { act := s.RedactableString() require.EqualValues(t, exp, act) } + { + exp := `have been waiting 8.16s (120 attempts) for RPC Get(Shared,Unreplicated) [‹"a"›] to` + + ` replica (n2,s3):1; resp: ‹(err: boom)›` + var s redact.StringBuilder + slowReplicaRPCWarningStr(&s, ba, dur, attempts, nil /* err */, br) + act := s.RedactableString() + require.EqualValues(t, exp, act) + } } // TestErrorIndexOnRangeSplit verifies that in case divideAndSendBatchToRanges