kv: log slow requests on replica level in addition to range level
Previously, slow requests were only logged at the range level, but the
logs did not indicate which replica was slow. Moreover, the SlowRPCs
metric attempted to represent the number of requests currently being
retried, but it was tracked at the range level and therefore missed the
second level of replica-level retries happening underneath.

This PR adds logging at the replica level, removes a confusing log
line, and changes the metric to count the number of slow requests in a
simpler manner.

Epic: https://cockroachlabs.atlassian.net/browse/CRDB-33510
Fixes: cockroachdb#114431
shralex authored and wenyihu6 committed Feb 21, 2024
1 parent ac0ac53 commit a27819a
Showing 3 changed files with 71 additions and 6 deletions.
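
For orientation before the file diffs: the change adds a second, tighter timing check around each individual replica RPC, alongside the existing per-range check. The sketch below is a minimal, self-contained illustration of that two-level pattern; the function names, replica strings, and send callback are simplified stand-ins rather than the real DistSender types, and only the two threshold values mirror constants from the diff.

```go
package main

import (
	"fmt"
	"time"
)

// Thresholds mirroring the constants added in this commit: a one-minute bar
// for the whole range-level operation (which may span several replicas) and a
// tighter ten-second bar for each individual replica RPC.
const (
	slowRangeThreshold   = time.Minute
	slowReplicaThreshold = 10 * time.Second
)

// sendToReplicas is a toy stand-in for DistSender.sendToReplicas: it tries
// each replica in turn and times every attempt separately, so a slow attempt
// can be attributed to a specific replica.
func sendToReplicas(replicas []string, send func(string) error) error {
	var attempts int64
	var err error
	for _, r := range replicas {
		attempts++
		tBegin := time.Now()
		err = send(r)
		if dur := time.Since(tBegin); dur > slowReplicaThreshold {
			// Replica-level slow log: names the replica that was slow.
			fmt.Printf("have been waiting %.2fs (%d attempts) for RPC to replica %s\n",
				dur.Seconds(), attempts, r)
		}
		if err == nil {
			return nil
		}
	}
	return err
}

// sendPartialBatch is a toy stand-in for DistSender.sendPartialBatch: it times
// the range-level operation as a whole, which may include many replica
// attempts but cannot by itself say which replica was responsible.
func sendPartialBatch(replicas []string, send func(string) error) error {
	tBegin := time.Now()
	err := sendToReplicas(replicas, send)
	if dur := time.Since(tBegin); dur > slowRangeThreshold {
		fmt.Printf("slow range RPC finished after %.2fs\n", dur.Seconds())
	}
	return err
}

func main() {
	// All replicas answer quickly here, so neither threshold fires.
	if err := sendPartialBatch([]string{"(n1,s1):1", "(n2,s2):2"}, func(string) error {
		time.Sleep(time.Millisecond)
		return nil
	}); err != nil {
		fmt.Println("error:", err)
	}
}
```
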
3 changes: 2 additions & 1 deletion docs/generated/metrics/metrics.html
@@ -924,6 +924,7 @@
<tr><td>APPLICATION</td><td>distsender.rpc.transferlease.sent</td><td>Number of TransferLease requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.truncatelog.sent</td><td>Number of TruncateLog requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.rpc.writebatch.sent</td><td>Number of WriteBatch requests processed.<br/><br/>This counts the requests in batches handed to DistSender, not the RPCs<br/>sent to individual Ranges as a result.</td><td>RPCs</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>distsender.slow.replicarpcs</td><td>Number of slow replica-bound RPCs.<br/><br/>Note that this is not a good signal for KV health. The remote side of the<br/>RPCs tracked here may experience contention, so an end user can easily<br/>cause values for this metric to be emitted by leaving a transaction open<br/>for a long time and contending with it using a second transaction.</td><td>Requests</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.adopt_iterations</td><td>number of job-adopt iterations performed by the registry</td><td>iterations</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>jobs.auto_config_env_runner.currently_idle</td><td>Number of auto_config_env_runner jobs currently considered Idle and can be freely shut down</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>jobs.auto_config_env_runner.currently_paused</td><td>Number of auto_config_env_runner jobs currently considered Paused</td><td>jobs</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
@@ -1244,7 +1245,7 @@
<tr><td>APPLICATION</td><td>physical_replication.resolved_events_ingested</td><td>Resolved events ingested by all replication jobs</td><td>Events</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.running</td><td>Number of currently running replication streams</td><td>Replication Streams</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>physical_replication.sst_bytes</td><td>SST bytes (compressed) sent to KV by all replication jobs</td><td>Bytes</td><td>COUNTER</td><td>BYTES</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
<tr><td>APPLICATION</td><td>requests.slow.distsender</td><td>Number of replica-bound RPCs currently stuck or retrying for a long time.<br/><br/>Note that this is not a good signal for KV health. The remote side of the<br/>RPCs tracked here may experience contention, so an end user can easily<br/>cause values for this metric to be emitted by leaving a transaction open<br/>for a long time and contending with it using a second transaction.</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>requests.slow.distsender</td><td>Number of range-bound RPCs currently stuck or retrying for a long time.<br/><br/>Note that this is not a good signal for KV health. The remote side of the<br/>RPCs tracked here may experience contention, so an end user can easily<br/>cause values for this metric to be emitted by leaving a transaction open<br/>for a long time and contending with it using a second transaction.</td><td>Requests</td><td>GAUGE</td><td>COUNT</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>round-trip-latency</td><td>Distribution of round-trip latencies with other nodes.<br/><br/>This only reflects successful heartbeats and measures gRPC overhead as well as<br/>possible head-of-line blocking. Elevated values in this metric may hint at<br/>network issues and/or saturation, but they are no proof of them. CPU overload<br/>can similarly elevate this metric. The operator should look towards OS-level<br/>metrics such as packet loss, retransmits, etc, to conclusively diagnose network<br/>issues. Heartbeats are not very frequent (~seconds), so they may not capture<br/>rare or short-lived degradations.<br/></td><td>Round-trip time</td><td>HISTOGRAM</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>rpc.connection.avg_round_trip_latency</td><td>Sum of exponentially weighted moving average of round-trip latencies, as measured through a gRPC RPC.<br/><br/>Dividing this Gauge by rpc.connection.healthy gives an approximation of average<br/>latency, but the top-level round-trip-latency histogram is more useful. Instead,<br/>users should consult the label families of this metric if they are available<br/>(which requires prometheus and the cluster setting &#39;server.child_metrics.enabled&#39;);<br/>these provide per-peer moving averages.<br/><br/>This metric does not track failed connection. A failed connection&#39;s contribution<br/>is reset to zero.<br/></td><td>Latency</td><td>GAUGE</td><td>NANOSECONDS</td><td>AVG</td><td>NONE</td></tr>
<tr><td>APPLICATION</td><td>rpc.connection.failures</td><td>Counter of failed connections.<br/><br/>This includes both the event in which a healthy connection terminates as well as<br/>unsuccessful reconnection attempts.<br/><br/>Connections that are terminated as part of local node shutdown are excluded.<br/>Decommissioned peers are excluded.<br/></td><td>Connections</td><td>COUNTER</td><td>COUNT</td><td>AVG</td><td>NON_NEGATIVE_DERIVATIVE</td></tr>
60 changes: 56 additions & 4 deletions pkg/kv/kvclient/kvcoord/dist_sender.go
@@ -34,6 +34,7 @@ import (
"github.com/cockroachdb/cockroach/pkg/settings"
"github.com/cockroachdb/cockroach/pkg/settings/cluster"
"github.com/cockroachdb/cockroach/pkg/util"
"github.com/cockroachdb/cockroach/pkg/util/admission/admissionpb"
"github.com/cockroachdb/cockroach/pkg/util/grpcutil"
"github.com/cockroachdb/cockroach/pkg/util/hlc"
"github.com/cockroachdb/cockroach/pkg/util/log"
@@ -159,7 +160,18 @@
}
metaDistSenderSlowRPCs = metric.Metadata{
Name: "requests.slow.distsender",
Help: `Number of replica-bound RPCs currently stuck or retrying for a long time.
Help: `Number of range-bound RPCs currently stuck or retrying for a long time.
Note that this is not a good signal for KV health. The remote side of the
RPCs tracked here may experience contention, so an end user can easily
cause values for this metric to be emitted by leaving a transaction open
for a long time and contending with it using a second transaction.`,
Measurement: "Requests",
Unit: metric.Unit_COUNT,
}
metaDistSenderSlowReplicaRPCs = metric.Metadata{
Name: "distsender.slow.replicarpcs",
Help: `Number of slow replica-bound RPCs.
Note that this is not a good signal for KV health. The remote side of the
RPCs tracked here may experience contention, so an end user can easily
@@ -310,6 +322,7 @@ type DistSenderMetrics struct {
InLeaseTransferBackoffs *metric.Counter
RangeLookups *metric.Counter
SlowRPCs *metric.Gauge
SlowReplicaRPCs *metric.Counter
MethodCounts [kvpb.NumMethods]*metric.Counter
ErrCounts [kvpb.NumErrors]*metric.Counter
DistSenderRangeFeedMetrics
@@ -342,6 +355,7 @@ func makeDistSenderMetrics() DistSenderMetrics {
InLeaseTransferBackoffs: metric.NewCounter(metaDistSenderInLeaseTransferBackoffsCount),
RangeLookups: metric.NewCounter(metaDistSenderRangeLookups),
SlowRPCs: metric.NewGauge(metaDistSenderSlowRPCs),
SlowReplicaRPCs: metric.NewCounter(metaDistSenderSlowReplicaRPCs),
DistSenderRangeFeedMetrics: makeDistSenderRangeFeedMetrics(),
}
for i := range m.MethodCounts {
@@ -1814,6 +1828,22 @@ func slowRangeRPCReturnWarningStr(s *redact.StringBuilder, dur time.Duration, at
s.Printf("slow RPC finished after %.2fs (%d attempts)", dur.Seconds(), attempts)
}

func slowReplicaRPCWarningStr(
s *redact.StringBuilder,
ba *kvpb.BatchRequest,
dur time.Duration,
attempts int64,
err error,
br *kvpb.BatchResponse,
) {
resp := interface{}(err)
if resp == nil {
resp = br
}
s.Printf("have been waiting %.2fs (%d attempts) for RPC %s to replica %s; resp: %s",
dur.Seconds(), attempts, ba, ba.Replica, resp)
}

// sendPartialBatch sends the supplied batch to the range specified by the
// routing token.
//
@@ -1907,8 +1937,7 @@ func (ds *DistSender) sendPartialBatch(
prevTok = routingTok
reply, err = ds.sendToReplicas(ctx, ba, routingTok, withCommit)

const slowDistSenderThreshold = time.Minute
if dur := timeutil.Since(tBegin); dur > slowDistSenderThreshold && !tBegin.IsZero() {
if dur := timeutil.Since(tBegin); dur > slowDistSenderRangeThreshold && !tBegin.IsZero() {
{
var s redact.StringBuilder
slowRangeRPCWarningStr(&s, ba, dur, attempts, routingTok.Desc(), err, reply)
@@ -2189,6 +2218,15 @@ func noMoreReplicasErr(ambiguousErr, lastAttemptErr error) error {
return newSendError(errors.Wrap(lastAttemptErr, "sending to all replicas failed; last error"))
}

// slowDistSenderRangeThreshold is a latency threshold for logging slow
// requests to a range, potentially involving RPCs to multiple replicas
// of the range.
const slowDistSenderRangeThreshold = time.Minute

// slowDistSenderReplicaThreshold is a latency threshold for logging a slow RPC
// to a single replica.
const slowDistSenderReplicaThreshold = 10 * time.Second

// defaultSendClosedTimestampPolicy is used when the closed timestamp policy
// is not known by the range cache. This choice prevents sending batch requests
// to only voters when a perfectly good non-voter may exist in the local
@@ -2325,7 +2363,8 @@ func (ds *DistSender) sendToReplicas(
// per-replica state and may succeed on other replicas.
var ambiguousError error
var br *kvpb.BatchResponse
for first := true; ; first = false {
attempts := int64(0)
for first := true; ; first, attempts = false, attempts+1 {
if !first {
ds.metrics.NextReplicaErrCount.Inc(1)
}
@@ -2411,7 +2450,20 @@
comparisonResult := ds.getLocalityComparison(ctx, ds.nodeIDGetter(), ba.Replica.NodeID)
ds.metrics.updateCrossLocalityMetricsOnReplicaAddressedBatchRequest(comparisonResult, int64(ba.Size()))

tBegin := timeutil.Now() // for slow log message
br, err = transport.SendNext(ctx, ba)
if dur := timeutil.Since(tBegin); dur > slowDistSenderReplicaThreshold {
var s redact.StringBuilder
slowReplicaRPCWarningStr(&s, ba, dur, attempts, err, br)
if admissionpb.WorkPriority(ba.AdmissionHeader.Priority) >= admissionpb.NormalPri {
// Note that this RPC may or may not have succeeded. Errors are counted separately below.
ds.metrics.SlowReplicaRPCs.Inc(1)
log.Warningf(ctx, "slow replica RPC: %v", &s)
} else {
log.Eventf(ctx, "slow replica RPC: %v", &s)
}
}

ds.metrics.updateCrossLocalityMetricsOnReplicaAddressedBatchResponse(comparisonResult, int64(br.Size()))
ds.maybeIncrementErrCounters(br, err)

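One design choice in the sendToReplicas hunk above that is easy to miss: the new warning and the distsender.slow.replicarpcs counter are gated on admission priority, so background (low-priority) work that is expected to wait only records the event in the trace. Below is a hedged sketch of that gating; workPriority, normalPri, and the print calls are simplified stand-ins for admissionpb.WorkPriority and the log package, not the real APIs.

```go
package main

import (
	"fmt"
	"time"
)

// workPriority is a stand-in for admissionpb.WorkPriority; only its ordering
// relative to normalPri matters for the gating decision.
type workPriority int

const (
	lowPri    workPriority = -50
	normalPri workPriority = 0
)

// slowReplicaRPCs is a stand-in for the ds.metrics.SlowReplicaRPCs counter.
var slowReplicaRPCs int64

// reportSlowReplicaRPC mimics the branch added to sendToReplicas: foreground
// work bumps the counter and warns; background work only leaves a trace event.
func reportSlowReplicaRPC(pri workPriority, dur time.Duration, replica string) {
	msg := fmt.Sprintf("have been waiting %.2fs for RPC to replica %s", dur.Seconds(), replica)
	if pri >= normalPri {
		slowReplicaRPCs++
		fmt.Println("warning:", msg)
	} else {
		fmt.Println("trace event:", msg)
	}
}

func main() {
	reportSlowReplicaRPC(normalPri, 12*time.Second, "(n2,s3):1")
	reportSlowReplicaRPC(lowPri, 12*time.Second, "(n2,s3):1")
	fmt.Println("slow replica RPCs counted:", slowReplicaRPCs)
}
```
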
14 changes: 13 additions & 1 deletion pkg/kv/kvclient/kvcoord/dist_sender_test.go
@@ -4546,6 +4546,11 @@ func TestDistSenderSlowLogMessage(t *testing.T) {
get.KeyLockingStrength = lock.Shared
get.KeyLockingDurability = lock.Unreplicated
ba.Add(get)
ba.Replica = roachpb.ReplicaDescriptor{
ReplicaID: 1,
NodeID: 2,
StoreID: 3,
}
br := &kvpb.BatchResponse{}
br.Error = kvpb.NewError(errors.New("boom"))
desc := &roachpb.RangeDescriptor{RangeID: 9, StartKey: roachpb.RKey("x"), EndKey: roachpb.RKey("z")}
@@ -4557,14 +4562,21 @@
act := s.RedactableString()
require.EqualValues(t, exp, act)
}

{
exp := `slow RPC finished after 8.16s (120 attempts)`
var s redact.StringBuilder
slowRangeRPCReturnWarningStr(&s, dur, attempts)
act := s.RedactableString()
require.EqualValues(t, exp, act)
}
{
exp := `have been waiting 8.16s (120 attempts) for RPC Get(Shared,Unreplicated) [‹"a"›] to` +
` replica (n2,s3):1; resp: ‹(err: boom)›`
var s redact.StringBuilder
slowReplicaRPCWarningStr(&s, ba, dur, attempts, nil /* err */, br)
act := s.RedactableString()
require.EqualValues(t, exp, act)
}
}

// TestErrorIndexOnRangeSplit verifies that in case divideAndSendBatchToRanges
