From a17c07f4a29b7ecf80f0248bca681376ab788f0c Mon Sep 17 00:00:00 2001
From: Arul Ajmani
Date: Wed, 13 Nov 2024 21:42:34 +0000
Subject: [PATCH] kvserver: log most lease applications

We now log every lease application instead of hiding this behind a
verbosity setting. To keep the logs from getting too spammy, we
rate-limit this to at most once per second.

Informs #134640

Release note: None
---
 pkg/kv/kvserver/replica_proposal.go | 151 ++++++++++++++++------------
 1 file changed, 89 insertions(+), 62 deletions(-)

diff --git a/pkg/kv/kvserver/replica_proposal.go b/pkg/kv/kvserver/replica_proposal.go
index c0261d33b8eb..34ef0730f5c0 100644
--- a/pkg/kv/kvserver/replica_proposal.go
+++ b/pkg/kv/kvserver/replica_proposal.go
@@ -312,6 +312,8 @@ const (
 	allowLeaseJump = true
 )
 
+var leaseAcquisitionLoggerEvery = log.Every(1 * time.Second)
+
 // leasePostApplyLocked updates the Replica's internal state to reflect the
 // application of a new Range lease. The method is idempotent, so it can be
 // called repeatedly for the same lease safely. However, the method will panic
@@ -386,17 +388,6 @@ func (r *Replica) leasePostApplyLocked(
 	// timestamp cache.
 	leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID ||
 		prevLease.Sequence != newLease.Sequence
-	if iAmTheLeaseHolder {
-		// Log lease acquisitions loudly when verbose logging is enabled or when the
-		// new leaseholder is draining, in which case it should be shedding leases.
-		// Otherwise, log a trace event.
-		if log.V(1) || (leaseChangingHands && r.store.IsDraining()) {
-			log.Infof(ctx, "new range lease %s following %s", newLease, prevLease)
-		} else {
-			log.Eventf(ctx, "new range lease %s following %s", newLease, prevLease)
-		}
-	}
-
 	if leaseChangingHands && iAmTheLeaseHolder {
 		// When taking over the lease, we need to check whether a merge is in
 		// progress, as only the old leaseholder would have been explicitly notified
@@ -483,9 +474,9 @@ func (r *Replica) leasePostApplyLocked(
 		r.gossipFirstRangeLocked(ctx)
 	}
 
-	// Log the lease acquisition, if appropriate.
-	if leaseChangingHands && iAmTheLeaseHolder {
-		r.maybeLogLeaseAcquisition(ctx, now, prevLease, newLease)
+	// Log the lease, if appropriate.
+	if iAmTheLeaseHolder {
+		r.maybeLogLease(ctx, now, prevLease, newLease)
 	}
 
 	st := r.leaseStatusAtRLocked(ctx, now)
@@ -598,58 +589,94 @@ func (r *Replica) leasePostApplyLocked(
 	}
 }
 
-// maybeLogLeaseAcquisition is called on the new leaseholder when the lease
-// changes hands, to log the lease acquisition if appropriate.
-func (r *Replica) maybeLogLeaseAcquisition(
+// maybeLogLease is called on the new leaseholder to log the lease
+// if appropriate.
+func (r *Replica) maybeLogLease(
 	ctx context.Context, now hlc.ClockTimestamp, prevLease, newLease *roachpb.Lease,
 ) {
-	// Log acquisition of meta and liveness range leases. These are critical to
-	// cluster health, so it's useful to know their location over time.
-	if r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
-		if r.ownsValidLeaseRLocked(ctx, now) {
-			log.Health.Infof(ctx, "acquired system range lease: %s [acquisition-type=%s]",
-				newLease, newLease.AcquisitionType)
-		} else {
-			log.Health.Warningf(ctx, "applied system range lease after it expired: %s [acquisition-type=%s]",
-				newLease, newLease.AcquisitionType)
-		}
+	leaseChangingHands := prevLease.Replica.StoreID != newLease.Replica.StoreID ||
+		prevLease.Sequence != newLease.Sequence
+
+	// TODO(arul): consider pulling all this leasing-related logging out into
+	// a separate log channel.
+
+	extension := newLease.Type() == roachpb.LeaseExpiration && !leaseChangingHands
+	promotion := prevLease.Type() == roachpb.LeaseExpiration &&
+		newLease.Type() != roachpb.LeaseExpiration && !leaseChangingHands
+	if r.store.IsDraining() && leaseChangingHands {
+		// The new leaseholder is on a draining node, which should be shedding
+		// leases; indicate this in the log line.
+		log.Health.Infof(ctx, "new range lease %s on draining node following %s", newLease, prevLease)
+	} else if log.V(1) {
+		// Log every lease acquisition if verbose logging is enabled.
+		log.Health.Infof(ctx, "new range lease %s following %s", newLease, prevLease)
+	} else if !extension && leaseAcquisitionLoggerEvery.ShouldLog() {
+		// We log lease applications at most once per leaseAcquisitionLoggerEvery
+		// interval to prevent the logs from getting too spammy. Moreover, to keep
+		// these logs useful, we don't log extensions of expiration-based leases,
+		// as those are fairly frequent.
+		log.Health.Infof(ctx, "new range lease %s following %s", newLease, prevLease)
+	} else if promotion {
+		// The lease is being promoted. This likely won't be caught by the
+		// leaseAcquisitionLoggerEvery check above, as we attempt to promote the
+		// moment a lease transfer is applied. Log it here. Since we log every
+		// lease transfer anyway, this shouldn't be much chattier.
+		log.Health.Infof(ctx, "new range lease %s promoted from %s", newLease, prevLease)
+	} else {
+		// If none of the above is true, just log as a trace event.
+		log.Eventf(ctx, "new range lease %s following %s", newLease, prevLease)
 	}
 
-	const slowLeaseApplyWarnThreshold = time.Second
-	newLeaseAppDelay := time.Duration(now.WallTime - newLease.ProposedTS.WallTime)
-	if newLeaseAppDelay > slowLeaseApplyWarnThreshold {
-		// If we hold the lease now and the lease was proposed "earlier", there
-		// must have been replication lag, and possibly reads and/or writes were
-		// delayed.
-		//
-		// We see this most commonly with lease transfers targeting a behind replica,
-		// or, in the worst case, a snapshot. We are constantly improving our
-		// heuristics for avoiding that[^1] but if it does happen it's good to know
-		// from the logs.
-		//
-		// In the case of a lease transfer, the two timestamps compared below are from
-		// different clocks, so there could be skew. We just pretend this is not the
-		// case, which is good enough here.
-		//
-		// [^1]: https://github.com/cockroachdb/cockroach/pull/82758
-		log.Health.Warningf(ctx,
-			"applied lease after ~%.2fs replication lag, client traffic may have "+
-				"been delayed [lease=%v prev=%v acquisition-type=%s]",
-			newLeaseAppDelay.Seconds(), newLease, prevLease, newLease.AcquisitionType)
-	} else if prevLease.Type() == roachpb.LeaseExpiration &&
-		newLease.Type() != roachpb.LeaseExpiration &&
-		prevLease.Expiration != nil && // nil when there is no previous lease
-		prevLease.Expiration.LessEq(newLease.Start.ToTimestamp()) {
-		// If the previous lease is expiration-based, but the new lease is not and
-		// starts at or after its expiration, it is likely that a lease transfer
-		// (which is expiration-based) went to a follower that then couldn't upgrade
-		// it to an epoch lease (for example, didn't apply it in time for it to
-		// actually serve any traffic). The result was likely an outage which
-		// resolves right now, so log to point this out.
-		log.Health.Warningf(ctx,
-			"lease expired before epoch/leader lease upgrade, client traffic may "+
-				"have been delayed [lease=%v prev=%v acquisition-type=%s]",
-			newLease, prevLease, newLease.AcquisitionType)
+	if leaseChangingHands {
+		// Log acquisition of meta and liveness range leases. These are critical to
+		// cluster health, so it's useful to know their location over time.
+		if r.descRLocked().StartKey.Less(roachpb.RKey(keys.NodeLivenessKeyMax)) {
+			if r.ownsValidLeaseRLocked(ctx, now) {
+				log.Health.Infof(ctx, "acquired system range lease: %s [acquisition-type=%s]",
+					newLease, newLease.AcquisitionType)
+			} else {
+				log.Health.Warningf(ctx, "applied system range lease after it expired: %s [acquisition-type=%s]",
+					newLease, newLease.AcquisitionType)
+			}
+		}
+
+		// Log slow lease applications in the Health log.
+		const slowLeaseApplyWarnThreshold = time.Second
+		newLeaseAppDelay := time.Duration(now.WallTime - newLease.ProposedTS.WallTime)
+		if newLeaseAppDelay > slowLeaseApplyWarnThreshold {
+			// If we hold the lease now and the lease was proposed "earlier", there
+			// must have been replication lag, and possibly reads and/or writes were
+			// delayed.
+			//
+			// We see this most commonly with lease transfers targeting a behind replica,
+			// or, in the worst case, a snapshot. We are constantly improving our
+			// heuristics for avoiding that[^1] but if it does happen it's good to know
+			// from the logs.
+			//
+			// In the case of a lease transfer, the two timestamps compared below are from
+			// different clocks, so there could be skew. We just pretend this is not the
+			// case, which is good enough here.
+			//
+			// [^1]: https://github.com/cockroachdb/cockroach/pull/82758
+			log.Health.Warningf(ctx,
+				"applied lease after ~%.2fs replication lag, client traffic may have "+
+					"been delayed [lease=%v prev=%v acquisition-type=%s]",
+				newLeaseAppDelay.Seconds(), newLease, prevLease, newLease.AcquisitionType)
+		} else if prevLease.Type() == roachpb.LeaseExpiration &&
+			newLease.Type() != roachpb.LeaseExpiration &&
+			prevLease.Expiration != nil && // nil when there is no previous lease
+			prevLease.Expiration.LessEq(newLease.Start.ToTimestamp()) {
+			// If the previous lease is expiration-based, but the new lease is not and
+			// starts at or after its expiration, it is likely that a lease transfer
+			// (which is expiration-based) went to a follower that then couldn't upgrade
+			// it to an epoch lease (for example, didn't apply it in time for it to
+			// actually serve any traffic). The result was likely an outage which
+			// resolves right now, so log to point this out.
+			log.Health.Warningf(ctx,
+				"lease expired before epoch/leader lease upgrade, client traffic may "+
+					"have been delayed [lease=%v prev=%v acquisition-type=%s]",
+				newLease, prevLease, newLease.AcquisitionType)
+		}
 	}
 }
 
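Background on the rate-limiting used above: the patch gates the per-application log line behind a limiter built with log.Every, whose ShouldLog method lets at most one event through per configured interval. The sketch below is a hypothetical, simplified standalone analogue of that pattern in plain Go; it is not the CockroachDB pkg/util/log implementation, and the everyLogger type, shouldLog method, and leaseApplyEvery variable are illustrative names only.

// Hypothetical, simplified standalone analogue of the log.Every /
// ShouldLog rate-limiting pattern used by this patch. NOT the
// CockroachDB pkg/util/log implementation; names are illustrative.
package main

import (
	"log"
	"sync/atomic"
	"time"
)

// everyLogger lets an event through at most once per interval, so a hot
// code path (such as lease application) cannot flood the logs.
type everyLogger struct {
	interval  time.Duration
	lastNanos atomic.Int64 // wall-clock time of the last allowed event
}

// shouldLog reports whether the interval has elapsed since the last
// allowed event; if so, it records the current time and returns true.
func (e *everyLogger) shouldLog() bool {
	now := time.Now().UnixNano()
	last := e.lastNanos.Load()
	if now-last < e.interval.Nanoseconds() {
		return false
	}
	// CompareAndSwap ensures only one of several concurrent callers wins.
	return e.lastNanos.CompareAndSwap(last, now)
}

// Mirrors the spirit of leaseAcquisitionLoggerEvery in the patch.
var leaseApplyEvery = everyLogger{interval: 1 * time.Second}

func main() {
	// Simulate a burst of lease applications; only roughly one line per
	// second actually reaches the log.
	for i := 0; i < 5; i++ {
		if leaseApplyEvery.shouldLog() {
			log.Printf("new range lease applied (iteration %d)", i)
		}
		time.Sleep(300 * time.Millisecond)
	}
}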