From f8c2feaa41f5c78ab19e0fae3a5fd2d0775690db Mon Sep 17 00:00:00 2001 From: Faizan Qazi Date: Fri, 8 Nov 2024 13:43:31 +0000 Subject: [PATCH] scjob: support handling permanent job errors Previously, when a descriptor was detect as a dropped the error was marked as permanent for schema change jobs. However, the declarative schema change job framework did not properly handle these errors. As a result, even if the erorr was tagged we would keep attempting rollbacks. To address this, this patch makes the OnFailOrCancel logic detect permanent errors and allows them to cause the schema change to hit the terminal state. Fixes: #131405 Release note: None --- pkg/sql/schemachanger/schemachanger_test.go | 33 +++++++++++++++++++++ pkg/sql/schemachanger/scjob/job.go | 9 +++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/pkg/sql/schemachanger/schemachanger_test.go b/pkg/sql/schemachanger/schemachanger_test.go index 7a8135e783cf..a87c06fc58ba 100644 --- a/pkg/sql/schemachanger/schemachanger_test.go +++ b/pkg/sql/schemachanger/schemachanger_test.go @@ -922,3 +922,36 @@ func isPQErrWithCode(err error, codes ...pgcode.Code) bool { } return false } + +func TestSchemaChangerFailsOnMissingDesc(t *testing.T) { + defer leaktest.AfterTest(t)() + defer log.Scope(t).Close(t) + + ctx := context.Background() + var params base.TestServerArgs + params.Knobs = base.TestingKnobs{ + SQLDeclarativeSchemaChanger: &scexec.TestingKnobs{ + AfterStage: func(p scplan.Plan, stageIdx int) error { + if p.Params.ExecutionPhase != scop.PostCommitPhase || stageIdx > 1 { + return nil + } + + return catalog.ErrDescriptorNotFound + }, + }, + JobsTestingKnobs: jobs.NewTestingKnobsWithShortIntervals(), + } + + s, sqlDB, _ := serverutils.StartServer(t, params) + defer s.Stopper().Stop(ctx) + + tdb := sqlutils.MakeSQLRunner(sqlDB) + tdb.Exec(t, `SET use_declarative_schema_changer = 'off'`) + tdb.Exec(t, `CREATE DATABASE db`) + tdb.Exec(t, `CREATE TABLE db.t (a INT PRIMARY KEY)`) + tdb.Exec(t, `SET use_declarative_schema_changer = 'unsafe'`) + tdb.ExpectErr(t, "descriptor not found", `ALTER TABLE db.t ADD COLUMN b INT NOT NULL DEFAULT (123)`) + // Validate the job has hit a terminal state. + tdb.CheckQueryResults(t, "SELECT status FROM crdb_internal.jobs WHERE statement LIKE '%ADD COLUMN%'", + [][]string{{"failed"}}) +} diff --git a/pkg/sql/schemachanger/scjob/job.go b/pkg/sql/schemachanger/scjob/job.go index 0cd04b014aab..e0bfbb540315 100644 --- a/pkg/sql/schemachanger/scjob/job.go +++ b/pkg/sql/schemachanger/scjob/job.go @@ -62,6 +62,13 @@ func (n *newSchemaChangeResumer) OnFailOrCancel( ) error { execCtx := execCtxI.(sql.JobExecContext) execCfg := execCtx.ExecCfg() + // Permanent error has been hit, so there is no rollback + // from here. Only if the status is reverting will these be + // treated as fatal. + if jobs.IsPermanentJobError(err) && n.job.Status() == jobs.StatusReverting { + log.Warningf(ctx, "schema change will not rollback; permanent error detected: %v", err) + return nil + } n.rollbackCause = err // Clean up any protected timestamps as a last resort, in case the job @@ -148,7 +155,7 @@ func (n *newSchemaChangeResumer) run(ctx context.Context, execCtxI interface{}) // permanent job error, so that non-cancelable jobs don't get retried. If a // descriptor has gone missing, it isn't likely to come back. if errors.IsAny(err, catalog.ErrDescriptorNotFound, catalog.ErrDescriptorDropped, catalog.ErrReferencedDescriptorNotFound) { - err = jobs.MarkAsPermanentJobError(err) + return jobs.MarkAsPermanentJobError(err) } return err }