From f003a9b0d8e43d56c4b4d7cf90afe6477525aed0 Mon Sep 17 00:00:00 2001 From: Mira Radeva Date: Thu, 14 Nov 2024 10:13:55 -0500 Subject: [PATCH] kv: add a backoff to the retry loop in db.Txn In rare cases (e.g. #77376), two transactions can get repeatedly deadlocked while trying to write to the same key(s): one aborts the other, but before it can proceed, the other transaction has restarted and acquired a lock on the key again. This can result in the max transaction retries being exceeded without either transaction succeeding. This commit adds a backoff to the transaction retry loop in `db.Txn`, which will hopefully help one transaction slow down and let the other one commit. Fixes: #77376 Release note: None --- pkg/kv/txn.go | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/pkg/kv/txn.go b/pkg/kv/txn.go index ee0082a3c941..ff45b8cc1549 100644 --- a/pkg/kv/txn.go +++ b/pkg/kv/txn.go @@ -11,6 +11,7 @@ import ( "math" "time" + "github.com/cockroachdb/cockroach/pkg/base" "github.com/cockroachdb/cockroach/pkg/kv/kvpb" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/closedts" "github.com/cockroachdb/cockroach/pkg/kv/kvserver/concurrency/isolation" @@ -23,6 +24,7 @@ import ( "github.com/cockroachdb/cockroach/pkg/util/hlc" "github.com/cockroachdb/cockroach/pkg/util/log" "github.com/cockroachdb/cockroach/pkg/util/protoutil" + "github.com/cockroachdb/cockroach/pkg/util/retry" "github.com/cockroachdb/cockroach/pkg/util/syncutil" "github.com/cockroachdb/cockroach/pkg/util/timeutil" "github.com/cockroachdb/cockroach/pkg/util/uuid" @@ -1045,7 +1047,7 @@ func (e *AutoCommitError) Error() string { func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error) (err error) { // Run fn in a retry loop until we encounter a success or // error condition this loop isn't capable of handling. 
- for attempt := 1; ; attempt++ { + for r := retry.Start(base.DefaultRetryOptions()); r.Next(); { if err := ctx.Err(); err != nil { return errors.Wrap(err, "txn exec") } @@ -1115,7 +1117,8 @@ func (txn *Txn) exec(ctx context.Context, fn func(context.Context, *Txn) error) // txn.db.ctx.Settings == nil is only expected in tests. maxRetries = int(MaxInternalTxnAutoRetries.Get(&txn.db.ctx.Settings.SV)) } - if attempt > maxRetries { + attempt := r.CurrentAttempt() + if attempt >= maxRetries { // If the retries limit has been exceeded, rollback and return an error. rollbackErr := txn.Rollback(ctx) // NOTE: we don't errors.Wrap the most recent retry error because we want