From d6ff944a36857468014d7cd1a797b9aef220ed46 Mon Sep 17 00:00:00 2001 From: Sandra Romanchenko <53295797+sandraromanchenko@users.noreply.github.com> Date: Mon, 2 Sep 2024 13:55:08 +0300 Subject: [PATCH 01/45] Release 2.6.0 (#1004) * PKG-157 pbm: add marketing message to postinstall --------- Co-authored-by: Oleksandr Miroshnychenko --- packaging/debian/postinst | 10 ++++++++++ packaging/rpm/mongodb-backup.spec | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/packaging/debian/postinst b/packaging/debian/postinst index b89f8ad41..0598396ba 100644 --- a/packaging/debian/postinst +++ b/packaging/debian/postinst @@ -2,4 +2,14 @@ #DEBHELPER# chown mongod:mongod /etc/pbm-storage.conf +cat << EOF +** Join Percona Squad! ** + +Participate in monthly SWAG raffles, get early access to new product features, +invite-only ”ask me anything” sessions with database performance experts. + +Interested? Fill in the form at https://squad.percona.com/mongodb + +EOF + exit 0 diff --git a/packaging/rpm/mongodb-backup.spec b/packaging/rpm/mongodb-backup.spec index 28e7d117a..a8b1cdf38 100644 --- a/packaging/rpm/mongodb-backup.spec +++ b/packaging/rpm/mongodb-backup.spec @@ -110,6 +110,16 @@ fi fi %endif +cat << EOF +** Join Percona Squad! ** + +Participate in monthly SWAG raffles, get early access to new product features, +invite-only ”ask me anything” sessions with database performance experts. + +Interested? Fill in the form at https://squad.percona.com/mongodb + +EOF + %postun -n percona-backup-mongodb case "$1" in From d18cc8320e61b3fcbde1f1af16e92a3e0d50e00d Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Thu, 5 Sep 2024 09:32:49 +0200 Subject: [PATCH 02/45] PBM-1338: Restore Start/Finish time for `describe-restore` command (#1005) * Add restore start and finish time info * Fix help text for oplog-replay command * Remove StartPITR info from describe-restore cmd * Add finish time only for successful restore --- cmd/pbm/main.go | 2 +- cmd/pbm/restore.go | 13 ++++++------- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cmd/pbm/main.go b/cmd/pbm/main.go index a71143d35..30f53f839 100644 --- a/cmd/pbm/main.go +++ b/cmd/pbm/main.go @@ -268,7 +268,7 @@ func main() { replayCmd.Flag("start", fmt.Sprintf("Replay oplog from the time. Set in format %s", datetimeFormat)). Required(). StringVar(&replayOpts.start) - replayCmd.Flag("end", "Replay oplog to the time. Set in format %s"). + replayCmd.Flag("end", fmt.Sprintf("Replay oplog to the time. Set in format %s", datetimeFormat)). Required(). StringVar(&replayOpts.end) replayCmd.Flag("wait", "Wait for the restore to finish."). 
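Note: the describe-restore changes in the next diff fill optional output fields via util.Ref instead of the temporary-variable pattern (s := ...; res.StartTime = &s) seen in the removed lines. Assuming util.Ref is a plain generic address-of helper (its definition is not part of this series), a minimal sketch:

    // Ref returns a pointer to a copy of v; convenient for filling
    // optional pointer fields such as StartTime and FinishTime below.
    func Ref[T any](v T) *T {
    	return &v
    }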
diff --git a/cmd/pbm/restore.go b/cmd/pbm/restore.go index 3db18c5b7..d74523aa8 100644 --- a/cmd/pbm/restore.go +++ b/cmd/pbm/restore.go @@ -532,6 +532,7 @@ type describeRestoreResult struct { Namespaces []string `json:"namespaces,omitempty" yaml:"namespaces,omitempty"` StartTS *int64 `json:"start_ts,omitempty" yaml:"-"` StartTime *string `json:"start,omitempty" yaml:"start,omitempty"` + FinishTime *string `json:"finish,omitempty" yaml:"finish,omitempty"` PITR *int64 `json:"ts_to_restore,omitempty" yaml:"-"` PITRTime *string `json:"time_to_restore,omitempty" yaml:"time_to_restore,omitempty"` LastTransitionTS int64 `json:"last_transition_ts" yaml:"-"` @@ -618,18 +619,16 @@ func describeRestore(ctx context.Context, conn connect.Client, o descrRestoreOpt res.OPID = meta.OPID res.LastTransitionTS = meta.LastTransitionTS res.LastTransitionTime = time.Unix(res.LastTransitionTS, 0).UTC().Format(time.RFC3339) + res.StartTime = util.Ref(time.Unix(meta.StartTS, 0).UTC().Format(time.RFC3339)) + if meta.Status == defs.StatusDone { + res.FinishTime = util.Ref(time.Unix(meta.LastTransitionTS, 0).UTC().Format(time.RFC3339)) + } if meta.Status == defs.StatusError { res.Error = &meta.Error } - if meta.StartPITR != 0 { - res.StartTS = &meta.StartPITR - s := time.Unix(meta.StartPITR, 0).UTC().Format(time.RFC3339) - res.StartTime = &s - } if meta.PITR != 0 { res.PITR = &meta.PITR - s := time.Unix(meta.PITR, 0).UTC().Format(time.RFC3339) - res.PITRTime = &s + res.PITRTime = util.Ref(time.Unix(meta.PITR, 0).UTC().Format(time.RFC3339)) } for _, rs := range meta.Replsets { From 3ca1f951e3b12c7669eeb1a0f0a52a1fa390914d Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 22 Aug 2024 11:07:28 +0200 Subject: [PATCH 03/45] [PBM-1316] skip in-progress backups when deleting many backups --- pbm/backup/delete.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/pbm/backup/delete.go b/pbm/backup/delete.go index 1c47498a6..0fa538f2b 100644 --- a/pbm/backup/delete.go +++ b/pbm/backup/delete.go @@ -497,6 +497,11 @@ func listBackupsBefore(ctx context.Context, conn connect.Client, ts primitive.Ti f := bson.D{ {"store.profile", nil}, {"last_write_ts", bson.M{"$lt": ts}}, + {"status", bson.M{"$in": bson.A{ + defs.StatusDone, + defs.StatusCancelled, + defs.StatusError, + }}}, } o := options.Find().SetSort(bson.D{{"last_write_ts", 1}}) cur, err := conn.BcpCollection().Find(ctx, f, o) From 65a3f12bc6fcf0edcf0863db511e36791794f63f Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 28 Aug 2024 18:34:10 +0200 Subject: [PATCH 04/45] [PBM-1316] add check for another op in CLI --- cmd/pbm/backup.go | 11 ++------- cmd/pbm/common.go | 48 ++++++++++++++++++++++++++++++++++++++ cmd/pbm/config.go | 13 ++++++++++- cmd/pbm/delete.go | 24 ++++++++++++++++--- cmd/pbm/main.go | 58 ++--------------------------------------------- cmd/pbm/oplog.go | 12 +++++++--- cmd/pbm/profile.go | 10 ++++++++ cmd/pbm/restore.go | 17 ++++++++++---- 8 files changed, 116 insertions(+), 77 deletions(-) diff --git a/cmd/pbm/backup.go b/cmd/pbm/backup.go index ec16fe545..714f7f78e 100644 --- a/cmd/pbm/backup.go +++ b/cmd/pbm/backup.go @@ -102,15 +102,8 @@ func runBackup( return nil, errors.Wrap(err, "backup pre-check") } - if err := checkConcurrentOp(ctx, conn); err != nil { - // PITR slicing can be run along with the backup start - agents will resolve it.
- var e *concurentOpError - if !errors.As(err, &e) { - return nil, err - } - if e.op.Type != ctrl.CmdPITR { - return nil, err - } + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err } cfg, err := config.GetProfiledConfig(ctx, conn, b.profile) diff --git a/cmd/pbm/common.go b/cmd/pbm/common.go index 77fb6eb86..eab604027 100644 --- a/cmd/pbm/common.go +++ b/cmd/pbm/common.go @@ -2,11 +2,15 @@ package main import ( "context" + "encoding/json" + "fmt" "time" "github.com/percona/percona-backup-mongodb/pbm/connect" "github.com/percona/percona-backup-mongodb/pbm/ctrl" + "github.com/percona/percona-backup-mongodb/pbm/defs" "github.com/percona/percona-backup-mongodb/pbm/errors" + "github.com/percona/percona-backup-mongodb/sdk" ) var errWaitTimeout = errors.New("Operation is in progress. Check pbm status and logs") @@ -16,3 +20,47 @@ func sendCmd(ctx context.Context, conn connect.Client, cmd ctrl.Cmd) error { _, err := conn.CmdStreamCollection().InsertOne(ctx, cmd) return err } + +func checkForAnotherOperation(ctx context.Context, pbm *sdk.Client) error { + locks, err := pbm.OpLocks(ctx) + if err != nil { + return errors.Wrap(err, "get operation lock") + } + if len(locks) == 0 { + return nil + } + + ts, err := sdk.ClusterTime(ctx, pbm) + if err != nil { + return errors.Wrap(err, "get cluster time") + } + + for _, l := range locks { + if l.Heartbeat.T+defs.StaleFrameSec >= ts.T { + return &concurrentOpError{l} + } + } + + return nil +} + +type concurrentOpError struct{ sdk.OpLock } + +func (e *concurrentOpError) Error() string { + return fmt.Sprintf("another operation in progress, %s/%s [%s/%s]", + e.Cmd, e.OpID, e.Replset, e.Node) +} + +func (e *concurrentOpError) MarshalJSON() ([]byte, error) { + s := map[string]any{ + "error": "another operation in progress", + "operation": map[string]any{ + "type": e.Cmd, + "opid": e.OpID, + "replset": e.Replset, + "node": e.Node, + }, + } + + return json.Marshal(s) +} diff --git a/cmd/pbm/config.go b/cmd/pbm/config.go index 64fa9c0f5..72c604b2d 100644 --- a/cmd/pbm/config.go +++ b/cmd/pbm/config.go @@ -47,7 +47,18 @@ func (c confVals) String() string { return s } -func runConfig(ctx context.Context, conn connect.Client, pbm *sdk.Client, c *configOpts) (fmt.Stringer, error) { +func runConfig( + ctx context.Context, + conn connect.Client, + pbm *sdk.Client, + c *configOpts, +) (fmt.Stringer, error) { + if len(c.set) != 0 || c.rsync || c.file != "" { + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err + } + } + switch { case len(c.set) > 0: var o confVals diff --git a/cmd/pbm/delete.go b/cmd/pbm/delete.go index 93656cf54..0990dea4e 100644 --- a/cmd/pbm/delete.go +++ b/cmd/pbm/delete.go @@ -44,6 +44,12 @@ func deleteBackup( if d.bcpType != "" && d.olderThan == "" { return nil, errors.New("cannot use --type without --older-then") } + if !d.dryRun { + err := checkForAnotherOperation(ctx, pbm) + if err != nil { + return nil, err + } + } var cid sdk.CommandID var err error @@ -60,7 +66,7 @@ func deleteBackup( } if d.dryRun { - return &outMsg{"running an agent"}, nil + return &outMsg{""}, nil } return waitForDelete(ctx, conn, pbm, cid) @@ -170,6 +176,12 @@ func deletePITR( if d.olderThan != "" && d.all { return nil, errors.New("cannot use --older-then and --all at the same command") } + if !d.dryRun { + err := checkForAnotherOperation(ctx, pbm) + if err != nil { + return nil, err + } + } var until primitive.Timestamp if d.all { @@ -200,7 +212,7 @@ func deletePITR( printDeleteInfoTo(os.Stdout, nil, chunks) if d.dryRun 
{ - return &outMsg{"running an agent"}, nil + return &outMsg{""}, nil } if !d.yes { q := "Are you sure you want to delete chunks?" @@ -255,6 +267,12 @@ func doCleanup(ctx context.Context, conn connect.Client, pbm *sdk.Client, d *cle realTime := n.Format(time.RFC3339) return nil, errors.Errorf("--older-than %q is after now %q", providedTime, realTime) } + if !d.dryRun { + err := checkForAnotherOperation(ctx, pbm) + if err != nil { + return nil, err + } + } info, err := pbm.CleanupReport(ctx, ts) if err != nil { @@ -267,7 +285,7 @@ func doCleanup(ctx context.Context, conn connect.Client, pbm *sdk.Client, d *cle printDeleteInfoTo(os.Stdout, info.Backups, info.Chunks) if d.dryRun { - return &outMsg{"running an agent"}, nil + return &outMsg{""}, nil } if !d.yes { if err := askConfirmation("Are you sure you want to delete?"); err != nil { diff --git a/cmd/pbm/main.go b/cmd/pbm/main.go index 30f53f839..c7819eafa 100644 --- a/cmd/pbm/main.go +++ b/cmd/pbm/main.go @@ -16,10 +16,8 @@ import ( "github.com/percona/percona-backup-mongodb/pbm/connect" "github.com/percona/percona-backup-mongodb/pbm/defs" "github.com/percona/percona-backup-mongodb/pbm/errors" - "github.com/percona/percona-backup-mongodb/pbm/lock" "github.com/percona/percona-backup-mongodb/pbm/log" "github.com/percona/percona-backup-mongodb/pbm/oplog" - "github.com/percona/percona-backup-mongodb/pbm/topo" "github.com/percona/percona-backup-mongodb/pbm/version" "github.com/percona/percona-backup-mongodb/sdk" ) @@ -516,9 +514,9 @@ func main() { case descBcpCmd.FullCommand(): out, err = describeBackup(ctx, pbm, &descBcp) case restoreCmd.FullCommand(): - out, err = runRestore(ctx, conn, &restore, pbmOutF) + out, err = runRestore(ctx, conn, pbm, &restore, pbmOutF) case replayCmd.FullCommand(): - out, err = replayOplog(ctx, conn, replayOpts, pbmOutF) + out, err = replayOplog(ctx, conn, pbm, replayOpts, pbmOutF) case listCmd.FullCommand(): out, err = runList(ctx, conn, pbm, &list) case deleteBcpCmd.FullCommand(): @@ -775,55 +773,3 @@ func parseDateT(v string) (time.Time, error) { return time.Time{}, errInvalidFormat } - -type concurentOpError struct { - op *lock.LockHeader -} - -func (e *concurentOpError) Error() string { - return fmt.Sprintf("another operation in progress, %s/%s [%s/%s]", e.op.Type, e.op.OPID, e.op.Replset, e.op.Node) -} - -func (e *concurentOpError) As(err any) bool { - if err == nil { - return false - } - - er, ok := err.(*concurentOpError) - if !ok { - return false - } - - er.op = e.op - return true -} - -func (e *concurentOpError) MarshalJSON() ([]byte, error) { - s := make(map[string]interface{}) - s["error"] = "another operation in progress" - s["operation"] = e.op - return json.Marshal(s) -} - -func checkConcurrentOp(ctx context.Context, conn connect.Client) error { - locks, err := lock.GetLocks(ctx, conn, &lock.LockHeader{}) - if err != nil { - return errors.Wrap(err, "get locks") - } - - ts, err := topo.GetClusterTime(ctx, conn) - if err != nil { - return errors.Wrap(err, "read cluster time") - } - - // Stop if there is some live operation. - // But in case of stale lock just move on - // and leave it for agents to deal with. 
- for _, l := range locks { - if l.Heartbeat.T+defs.StaleFrameSec >= ts.T { - return &concurentOpError{&l.LockHeader} - } - } - - return nil -} diff --git a/cmd/pbm/oplog.go b/cmd/pbm/oplog.go index c72be995d..2b25d0206 100644 --- a/cmd/pbm/oplog.go +++ b/cmd/pbm/oplog.go @@ -10,6 +10,7 @@ import ( "github.com/percona/percona-backup-mongodb/pbm/defs" "github.com/percona/percona-backup-mongodb/pbm/errors" "github.com/percona/percona-backup-mongodb/pbm/restore" + "github.com/percona/percona-backup-mongodb/sdk" ) type replayOptions struct { @@ -40,7 +41,13 @@ func (r oplogReplayResult) String() string { return fmt.Sprintf("Oplog replay %q has started", r.Name) } -func replayOplog(ctx context.Context, conn connect.Client, o replayOptions, outf outFormat) (fmt.Stringer, error) { +func replayOplog( + ctx context.Context, + conn connect.Client, + pbm *sdk.Client, + o replayOptions, + outf outFormat, +) (fmt.Stringer, error) { rsMap, err := parseRSNamesMapping(o.rsMap) if err != nil { return nil, errors.Wrap(err, "cannot parse replset mapping") @@ -55,8 +62,7 @@ func replayOplog(ctx context.Context, conn connect.Client, o replayOptions, outf return nil, errors.Wrap(err, "parse end time") } - err = checkConcurrentOp(ctx, conn) - if err != nil { + if err := checkForAnotherOperation(ctx, pbm); err != nil { return nil, err } diff --git a/cmd/pbm/profile.go b/cmd/pbm/profile.go index f6617cc8e..4390cc7cb 100644 --- a/cmd/pbm/profile.go +++ b/cmd/pbm/profile.go @@ -100,6 +100,9 @@ func handleAddConfigProfile( if opts.name == "" { return nil, errors.New("argument `profile-name` should not be empty") } + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err + } _, err := pbm.GetConfig(ctx) if err != nil { @@ -174,6 +177,9 @@ func handleRemoveConfigProfile( if opts.name == "" { return nil, errors.New("argument `profile-name` should not be empty") } + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err + } _, err := pbm.GetConfigProfile(ctx, opts.name) if err != nil { @@ -220,6 +226,10 @@ func handleSyncConfigProfile( return nil, errors.New("ambiguous: <profile-name> and --all are provided") } + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err + } + var err error var cid sdk.CommandID diff --git a/cmd/pbm/restore.go b/cmd/pbm/restore.go index d74523aa8..3adc1b05d 100644 --- a/cmd/pbm/restore.go +++ b/cmd/pbm/restore.go @@ -25,6 +25,7 @@ import ( "github.com/percona/percona-backup-mongodb/pbm/storage" "github.com/percona/percona-backup-mongodb/pbm/topo" "github.com/percona/percona-backup-mongodb/pbm/util" + "github.com/percona/percona-backup-mongodb/sdk" ) type restoreOpts struct { @@ -97,7 +98,13 @@ func (r externRestoreRet) String() string { r.Name, r.Name) } -func runRestore(ctx context.Context, conn connect.Client, o *restoreOpts, outf outFormat) (fmt.Stringer, error) { +func runRestore( + ctx context.Context, + conn connect.Client, + pbm *sdk.Client, + o *restoreOpts, + outf outFormat, +) (fmt.Stringer, error) { nss, err := parseCLINSOption(o.ns) if err != nil { return nil, errors.Wrap(err, "parse --ns option") @@ -115,6 +122,10 @@ func runRestore(ctx context.Context, conn connect.Client, o *restoreOpts, outf o return nil, errors.New("either a backup name or point in time should be set, non both together!") } + if err := checkForAnotherOperation(ctx, pbm); err != nil { + return nil, err + } + clusterTime, err := topo.GetClusterTime(ctx, conn) if err != nil { return nil, errors.Wrap(err, "read cluster time") @@ -318,10 +329,6 @@ func doRestore(
if err != nil { return nil, err } - err = checkConcurrentOp(ctx, conn) - if err != nil { - return nil, err - } name := time.Now().UTC().Format(time.RFC3339Nano) From e36eb5f83c2b1f84cadf08e3ed57cd004aca9b40 Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Mon, 9 Sep 2024 10:35:59 +0200 Subject: [PATCH 05/45] PBM-1382: Fix priority status for Delayed member (#1006) * Fix priority status for Delayed member * Fix priority calculation for Delayed member --- cmd/pbm/status.go | 3 +- pbm/prio/priority.go | 11 ++-- pbm/prio/priority_test.go | 102 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+), 4 deletions(-) diff --git a/cmd/pbm/status.go b/cmd/pbm/status.go index f81a4f397..f21495294 100644 --- a/cmd/pbm/status.go +++ b/cmd/pbm/status.go @@ -179,7 +179,8 @@ func (n node) String() string { } var s string - if len(n.PrioBcp) == 0 || len(n.PrioPITR) == 0 { + if len(n.PrioBcp) == 0 || len(n.PrioPITR) == 0 || + n.Role == cli.RoleDelayed { s = fmt.Sprintf("%s [%s]: pbm-agent [%s]", n.Host, role, ver) } else { s = fmt.Sprintf("%s [%s], Bkp Prio: [%s], PITR Prio: [%s]: pbm-agent [%s]", diff --git a/pbm/prio/priority.go b/pbm/prio/priority.go index e78432512..ad29321d6 100644 --- a/pbm/prio/priority.go +++ b/pbm/prio/priority.go @@ -9,9 +9,10 @@ import ( ) const ( - defaultScore = 1.0 - scoreForPrimary = defaultScore / 2 - scoreForHidden = defaultScore * 2 + defaultScore = 1.0 + scoreForPrimary = defaultScore / 2 + scoreForHidden = defaultScore * 2 + scoreForExcluded = 0 ) // NodesPriority groups nodes by priority according to @@ -107,6 +108,8 @@ func CalcPriorityForAgent( func CalcPriorityForNode(node *topo.NodeInfo) float64 { if node.IsPrimary { return scoreForPrimary + } else if node.IsDelayed() { + return scoreForExcluded } else if node.Hidden { return scoreForHidden } @@ -122,6 +125,8 @@ func implicitPrioCalc(a *topo.AgentStat, rule map[string]float64) float64 { return defaultScore * coeff } else if a.State == defs.NodeStatePrimary { return scoreForPrimary + } else if a.DelaySecs > 0 { + return scoreForExcluded } else if a.Hidden { return scoreForHidden } diff --git a/pbm/prio/priority_test.go b/pbm/prio/priority_test.go index 605847870..e1d9ad78a 100644 --- a/pbm/prio/priority_test.go +++ b/pbm/prio/priority_test.go @@ -412,6 +412,108 @@ func TestCalcPriorityForNode(t *testing.T) { t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForHidden, p) } }) + + t.Run("for delayed (slaveDelayed)", func(t *testing.T) { + nodeInfo := &topo.NodeInfo{ + Secondary: true, + SecondaryDelayOld: 5, + } + + p := CalcPriorityForNode(nodeInfo) + if p != scoreForExcluded { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForExcluded, p) + } + }) + + t.Run("for delayed", func(t *testing.T) { + nodeInfo := &topo.NodeInfo{ + Secondary: true, + SecondaryDelaySecs: 1, + } + + p := CalcPriorityForNode(nodeInfo) + if p != scoreForExcluded { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForExcluded, p) + } + }) + + t.Run("for hidden & delayed", func(t *testing.T) { + nodeInfo := &topo.NodeInfo{ + Secondary: true, + SecondaryDelaySecs: 3600, + Hidden: true, + } + + p := CalcPriorityForNode(nodeInfo) + if p != scoreForExcluded { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForExcluded, p) + } + }) +} + +func TestImplicitPrioCalc(t *testing.T) { + t.Run("for primary", func(t *testing.T) { + agentStat := &topo.AgentStat{ + State: defs.NodeStatePrimary, + } + + p := implicitPrioCalc(agentStat, nil) + if p != scoreForPrimary { + t.Errorf("wrong 
priority for primary: want=%v, got=%v", scoreForPrimary, p) + } + }) + + t.Run("for secondary", func(t *testing.T) { + agentStat := &topo.AgentStat{ + State: defs.NodeStateSecondary, + } + + p := implicitPrioCalc(agentStat, nil) + + if p != defaultScore { + t.Errorf("wrong priority for secondary: want=%v, got=%v", defaultScore, p) + } + }) + + t.Run("for hidden", func(t *testing.T) { + agentStat := &topo.AgentStat{ + State: defs.NodeStateSecondary, + Hidden: true, + } + + p := implicitPrioCalc(agentStat, nil) + + if p != scoreForHidden { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForHidden, p) + } + }) + + t.Run("for delayed", func(t *testing.T) { + agentStat := &topo.AgentStat{ + State: defs.NodeStateSecondary, + DelaySecs: 3600, + } + + p := implicitPrioCalc(agentStat, nil) + + if p != scoreForExcluded { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForExcluded, p) + } + }) + + t.Run("for hidden & delayed", func(t *testing.T) { + agentStat := &topo.AgentStat{ + State: defs.NodeStateSecondary, + DelaySecs: 3600, + Hidden: true, + } + + p := implicitPrioCalc(agentStat, nil) + + if p != scoreForExcluded { + t.Errorf("wrong priority for hidden: want=%v, got=%v", scoreForExcluded, p) + } + }) } func newP(rs, node string) topo.AgentStat { From 6edc99251e351850dafbd40b3849891988435ff2 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 28 Aug 2024 15:16:04 +0200 Subject: [PATCH 06/45] [PBM-1312] allow to control max number of parallel collections --- cmd/pbm-agent/agent.go | 11 ++++++++--- cmd/pbm-agent/backup.go | 2 +- cmd/pbm-agent/main.go | 3 ++- cmd/pbm-agent/oplog.go | 4 +++- cmd/pbm-agent/restore.go | 11 +++++++++-- cmd/pbm/backup.go | 18 ++++++++++++++++++ cmd/pbm/main.go | 4 ++++ cmd/pbm/restore.go | 22 +++++++++++++++------- pbm/archive/archive.go | 8 +++++++- pbm/backup/backup.go | 12 ++++++------ pbm/backup/logical.go | 12 +++++++++++- pbm/ctrl/cmd.go | 3 +++ pbm/restore/logical.go | 15 +++++++++++++-- pbm/snapshot/backup.go | 12 ++++++------ pbm/snapshot/dump.go | 3 ++- pbm/snapshot/restore.go | 1 - 16 files changed, 108 insertions(+), 33 deletions(-) diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go index bbddc64f8..4cfed0747 100644 --- a/cmd/pbm-agent/agent.go +++ b/cmd/pbm-agent/agent.go @@ -34,7 +34,7 @@ type Agent struct { brief topo.NodeBrief - dumpConns int + numParallelColls int closeCMD chan struct{} pauseHB int32 @@ -44,7 +44,12 @@ type Agent struct { monStopSig chan struct{} } -func newAgent(ctx context.Context, leadConn connect.Client, uri string, dumpConns int) (*Agent, error) { +func newAgent( + ctx context.Context, + leadConn connect.Client, + uri string, + numParallelColls int, +) (*Agent, error) { nodeConn, err := connect.MongoConnect(ctx, uri, connect.Direct(true)) if err != nil { return nil, err @@ -72,7 +77,7 @@ func newAgent(ctx context.Context, leadConn connect.Client, uri string, dumpConn ConfigSvr: info.IsConfigSrv(), Version: mongoVersion, }, - dumpConns: dumpConns, + numParallelColls: numParallelColls, } return a, nil } diff --git a/cmd/pbm-agent/backup.go b/cmd/pbm-agent/backup.go index e685ad466..9b384e7a3 100644 --- a/cmd/pbm-agent/backup.go +++ b/cmd/pbm-agent/backup.go @@ -114,7 +114,7 @@ func (a *Agent) Backup(ctx context.Context, cmd *ctrl.BackupCmd, opid ctrl.OPID, case defs.LogicalBackup: fallthrough default: - bcp = backup.New(a.leadConn, a.nodeConn, a.brief, a.dumpConns) + bcp = backup.New(a.leadConn, a.nodeConn, a.brief, a.numParallelColls) } bcp.SetConfig(cfg) diff --git a/cmd/pbm-agent/main.go 
b/cmd/pbm-agent/main.go index 648060098..34239775f 100644 --- a/cmd/pbm-agent/main.go +++ b/cmd/pbm-agent/main.go @@ -32,7 +32,8 @@ func main() { Envar("PBM_MONGODB_URI"). Required(). String() - dumpConns = pbmAgentCmd.Flag("dump-parallel-collections", "Number of collections to dump in parallel"). + dumpConns = pbmAgentCmd. + Flag("dump-parallel-collections", "Number of collections to dump in parallel"). Envar("PBM_DUMP_PARALLEL_COLLECTIONS"). Default(strconv.Itoa(runtime.NumCPU() / 2)). Int() diff --git a/cmd/pbm-agent/oplog.go b/cmd/pbm-agent/oplog.go index cb1bb1d03..69641d442 100644 --- a/cmd/pbm-agent/oplog.go +++ b/cmd/pbm-agent/oplog.go @@ -71,7 +71,9 @@ func (a *Agent) OplogReplay(ctx context.Context, r *ctrl.ReplayCmd, opID ctrl.OP }() l.Info("oplog replay started") - if err := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap).ReplayOplog(ctx, r, opID, l); err != nil { + rr := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap, 0) + err = rr.ReplayOplog(ctx, r, opID, l) + if err != nil { if errors.Is(err, restore.ErrNoDataForShard) { l.Info("no oplog for the shard, skipping") } else { diff --git a/cmd/pbm-agent/restore.go b/cmd/pbm-agent/restore.go index 872681265..1d76153a0 100644 --- a/cmd/pbm-agent/restore.go +++ b/cmd/pbm-agent/restore.go @@ -113,10 +113,17 @@ func (a *Agent) Restore(ctx context.Context, r *ctrl.RestoreCmd, opid ctrl.OPID, l.Info("This node is not the primary. Check pbm agent on the primary for restore progress") return } + + var numParallelColls int + if r.NumParallelColls != nil && *r.NumParallelColls > 0 { + numParallelColls = int(*r.NumParallelColls) + } + + rr := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap, numParallelColls) if r.OplogTS.IsZero() { - err = restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap).Snapshot(ctx, r, opid, bcp) + err = rr.Snapshot(ctx, r, opid, bcp) } else { - err = restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap).PITR(ctx, r, opid, bcp) + err = rr.PITR(ctx, r, opid, bcp) } case defs.PhysicalBackup, defs.IncrementalBackup, defs.ExternalBackup: if lck != nil { diff --git a/cmd/pbm/backup.go b/cmd/pbm/backup.go index 714f7f78e..814fa83ab 100644 --- a/cmd/pbm/backup.go +++ b/cmd/pbm/backup.go @@ -38,6 +38,8 @@ type backupOpts struct { wait bool waitTime time.Duration externList bool + + numParallelColls int32 } type backupOut struct { @@ -87,6 +89,10 @@ func runBackup( b *backupOpts, outf outFormat, ) (fmt.Stringer, error) { + numParallelColls, err := parseCLINumParallelCollsOption(b.numParallelColls) + if err != nil { + return nil, errors.Wrap(err, "parse --num-parallel-collections option") + } nss, err := parseCLINSOption(b.ns) if err != nil { return nil, errors.Wrap(err, "parse --ns option") @@ -136,6 +142,7 @@ func runBackup( Namespaces: nss, Compression: compression, CompressionLevel: level, + NumParallelColls: numParallelColls, Filelist: b.externList, Profile: b.profile, }, @@ -662,3 +669,14 @@ func (incompatibleMongodVersionError) Is(err error) bool { func (e incompatibleMongodVersionError) Unwrap() error { return errIncompatible } + +func parseCLINumParallelCollsOption(value int32) (*int32, error) { + if value < 0 { + return nil, errors.New("value cannot be negative") + } + if value == 0 { + return nil, nil //nolint:nilnil + } + + return &value, nil +} diff --git a/cmd/pbm/main.go b/cmd/pbm/main.go index c7819eafa..ceec59a2e 100644 --- a/cmd/pbm/main.go +++ b/cmd/pbm/main.go @@ -196,6 +196,8 @@ func main() { backupCmd.Flag("profile", "Config profile name").StringVar(&backupOptions.profile) 
backupCmd.Flag("compression-level", "Compression level (specific to the compression type)"). IntsVar(&backupOptions.compressionLevel) + backupCmd.Flag("num-parallel-collections", "Number of parallel collections"). + Int32Var(&backupOptions.numParallelColls) backupCmd.Flag("ns", `Namespaces to backup (e.g. "db.*", "db.collection"). If not set, backup all ("*.*")`). StringVar(&backupOptions.ns) backupCmd.Flag("wait", "Wait for the backup to finish"). @@ -239,6 +241,8 @@ func main() { restoreCmd.Flag("base-snapshot", "Override setting: Name of older snapshot that PITR will be based on during restore."). StringVar(&restore.pitrBase) + restoreCmd.Flag("num-parallel-collections", "Number of parallel collections"). + Int32Var(&restore.numParallelColls) restoreCmd.Flag("ns", `Namespaces to restore (e.g. "db1.*,db2.collection2"). If not set, restore all ("*.*")`). StringVar(&restore.ns) restoreCmd.Flag("with-users-and-roles", "Includes users and roles for selected database (--ns flag)"). diff --git a/cmd/pbm/restore.go b/cmd/pbm/restore.go index 3adc1b05d..9c8030f7d 100644 --- a/cmd/pbm/restore.go +++ b/cmd/pbm/restore.go @@ -40,6 +40,8 @@ type restoreOpts struct { rsMap string conf string ts string + + numParallelColls int32 } type restoreRet struct { @@ -105,6 +107,10 @@ func runRestore( o *restoreOpts, outf outFormat, ) (fmt.Stringer, error) { + numParallelColls, err := parseCLINumParallelCollsOption(o.numParallelColls) + if err != nil { + return nil, errors.Wrap(err, "parse --num-parallel-collections option") + } nss, err := parseCLINSOption(o.ns) if err != nil { return nil, errors.Wrap(err, "parse --ns option") @@ -132,7 +138,7 @@ func runRestore( } tdiff := time.Now().Unix() - int64(clusterTime.T) - m, err := doRestore(ctx, conn, o, nss, rsMap, outf) + m, err := doRestore(ctx, conn, o, numParallelColls, nss, rsMap, outf) if err != nil { return nil, err } @@ -321,6 +327,7 @@ func doRestore( ctx context.Context, conn connect.Client, o *restoreOpts, + numParallelColls *int32, nss []string, rsMapping map[string]string, outf outFormat, @@ -335,12 +342,13 @@ func doRestore( cmd := ctrl.Cmd{ Cmd: ctrl.CmdRestore, Restore: &ctrl.RestoreCmd{ - Name: name, - BackupName: bcp, - Namespaces: nss, - UsersAndRoles: o.usersAndRoles, - RSMap: rsMapping, - External: o.extern, + Name: name, + BackupName: bcp, + NumParallelColls: numParallelColls, + Namespaces: nss, + UsersAndRoles: o.usersAndRoles, + RSMap: rsMapping, + External: o.extern, }, } if o.pitr != "" { diff --git a/pbm/archive/archive.go b/pbm/archive/archive.go index 3556e34d1..5fa7fdcec 100644 --- a/pbm/archive/archive.go +++ b/pbm/archive/archive.go @@ -82,12 +82,18 @@ func Decompose(r io.Reader, newWriter NewWriter, nsFilter NSFilterFn, docFilter return errors.Wrap(err, "metadata") } -func Compose(w io.Writer, nsFilter NSFilterFn, newReader NewReader) error { +func Compose(w io.Writer, newReader NewReader, nsFilter NSFilterFn, concurrency int) error { meta, err := readMetadata(newReader) if err != nil { return errors.Wrap(err, "metadata") } + if concurrency > 0 { + // mongorestore uses this field as a number of + // concurrent collections to restore at a moment + meta.Header.ConcurrentCollections = int32(concurrency) + } + nss := make([]*Namespace, 0, len(meta.Namespaces)) for _, ns := range meta.Namespaces { if nsFilter(NSify(ns.Database, ns.Collection)) { diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index e366a34ec..8f20a5970 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -32,17 +32,17 @@ type Backup struct { typ 
defs.BackupType incrBase bool timeouts *config.BackupTimeouts - dumpConns int + numParallelColls int oplogSlicerInterval time.Duration } func New(leadConn connect.Client, conn *mongo.Client, brief topo.NodeBrief, dumpConns int) *Backup { return &Backup{ - leadConn: leadConn, - nodeConn: conn, - brief: brief, - typ: defs.LogicalBackup, - dumpConns: dumpConns, + leadConn: leadConn, + nodeConn: conn, + brief: brief, + typ: defs.LogicalBackup, + numParallelColls: dumpConns, } } diff --git a/pbm/backup/logical.go b/pbm/backup/logical.go index ab0cb512f..aad0ff1ce 100644 --- a/pbm/backup/logical.go +++ b/pbm/backup/logical.go @@ -135,7 +135,17 @@ func (b *Backup) doLogical( if len(nssSize) == 0 { dump = snapshot.DummyBackup{} } else { - dump, err = snapshot.NewBackup(b.brief.URI, b.dumpConns, db, coll) + numParallelColls := b.numParallelColls + if bcp.NumParallelColls != nil { + if *bcp.NumParallelColls > 0 { + numParallelColls = int(*bcp.NumParallelColls) + } else { + l.Warning("invalid value of NumParallelCollections (%v). fallback to %v", + numParallelColls, b.numParallelColls) + } + } + + dump, err = snapshot.NewBackup(b.brief.URI, numParallelColls, db, coll) if err != nil { return errors.Wrap(err, "init mongodump options") } diff --git a/pbm/ctrl/cmd.go b/pbm/ctrl/cmd.go index 23542ed0e..d43e7f51f 100644 --- a/pbm/ctrl/cmd.go +++ b/pbm/ctrl/cmd.go @@ -133,6 +133,7 @@ type BackupCmd struct { Namespaces []string `bson:"nss,omitempty"` Compression compress.CompressionType `bson:"compression"` CompressionLevel *int `bson:"level,omitempty"` + NumParallelColls *int32 `bson:"numParallelColls,omitempty"` Filelist bool `bson:"filelist,omitempty"` Profile string `bson:"profile,omitempty"` } @@ -154,6 +155,8 @@ type RestoreCmd struct { UsersAndRoles bool `bson:"usersAndRoles,omitempty"` RSMap map[string]string `bson:"rsMap,omitempty"` + NumParallelColls *int32 `bson:"numParallelColls,omitempty"` + OplogTS primitive.Timestamp `bson:"oplogTS,omitempty"` External bool `bson:"external"` diff --git a/pbm/restore/logical.go b/pbm/restore/logical.go index f46b89ea2..db9c6ea35 100644 --- a/pbm/restore/logical.go +++ b/pbm/restore/logical.go @@ -46,6 +46,8 @@ type Restore struct { nodeInfo *topo.NodeInfo bcpStg storage.Storage oplogStg storage.Storage + + numParallelColls int // Shards to participate in restore. Num of shards in bcp could // be less than in the cluster and this is ok. 
Only these shards // would be expected to run restore (distributed transactions sync, @@ -76,7 +78,13 @@ type oplogRange struct { type restoreUsersAndRolesOption bool // New creates a new restore object -func New(leadConn connect.Client, nodeConn *mongo.Client, brief topo.NodeBrief, rsMap map[string]string) *Restore { +func New( + leadConn connect.Client, + nodeConn *mongo.Client, + brief topo.NodeBrief, + rsMap map[string]string, + numParallelColls int, +) *Restore { if rsMap == nil { rsMap = make(map[string]string) } @@ -87,6 +95,8 @@ func New(leadConn connect.Client, nodeConn *mongo.Client, brief topo.NodeBrief, brief: brief, rsMap: rsMap, + numParallelColls: numParallelColls, + indexCatalog: idx.NewIndexCatalog(), } } @@ -805,7 +815,8 @@ func (r *Restore) RunSnapshot( return rdr, nil }, bcp.Compression, - util.MakeSelectedPred(nss)) + util.MakeSelectedPred(nss), + r.numParallelColls) } if err != nil { return err diff --git a/pbm/snapshot/backup.go b/pbm/snapshot/backup.go index b0cd2cd7e..fc816da39 100644 --- a/pbm/snapshot/backup.go +++ b/pbm/snapshot/backup.go @@ -20,11 +20,7 @@ type backuper struct { pm *progress.BarWriter } -func NewBackup(curi string, conns int, d, c string) (*backuper, error) { - if conns <= 0 { - conns = 1 - } - +func NewBackup(curi string, maxParallelColls int, d, c string) (*backuper, error) { var err error opts := options.New("pbm-agent:dump", version.Current().Version, "", "", false, @@ -49,6 +45,10 @@ func NewBackup(curi string, conns int, d, c string) (*backuper, error) { } } + if maxParallelColls < 1 { + maxParallelColls = 1 + } + backup := &backuper{} backup.pm = progress.NewBarWriter(&progressWriter{}, time.Second*60, 24, false) @@ -59,7 +59,7 @@ func NewBackup(curi string, conns int, d, c string) (*backuper, error) { // instead of creating a file. This is not clear at plain sight, // you nee to look the code to discover it. 
Archive: "-", - NumParallelCollections: conns, + NumParallelCollections: maxParallelColls, }, InputOptions: &mongodump.InputOptions{}, SessionProvider: &db.SessionProvider{}, diff --git a/pbm/snapshot/dump.go b/pbm/snapshot/dump.go index 3abdeaf31..0cc3ecedc 100644 --- a/pbm/snapshot/dump.go +++ b/pbm/snapshot/dump.go @@ -98,6 +98,7 @@ func DownloadDump( download DownloadFunc, compression compress.CompressionType, match archive.NSFilterFn, + numParallelColls int, ) (io.ReadCloser, error) { pr, pw := io.Pipe() @@ -120,7 +121,7 @@ func DownloadDump( return r, errors.Wrapf(err, "create decompressor: %q", ns) } - err := archive.Compose(pw, match, newReader) + err := archive.Compose(pw, newReader, match, numParallelColls) pw.CloseWithError(errors.Wrap(err, "compose")) }() diff --git a/pbm/snapshot/restore.go b/pbm/snapshot/restore.go index b4f56c589..cdccfc26a 100644 --- a/pbm/snapshot/restore.go +++ b/pbm/snapshot/restore.go @@ -87,7 +87,6 @@ func NewRestore(uri string, cfg *config.Config) (io.ReaderFrom, error) { BypassDocumentValidation: true, Drop: true, NumInsertionWorkers: numInsertionWorkers, - NumParallelCollections: 1, PreserveUUID: preserveUUID, StopOnError: true, WriteConcern: "majority", From 5bb76dacbaf1856074415230d7964c428141795e Mon Sep 17 00:00:00 2001 From: radoslawszulgo Date: Mon, 9 Sep 2024 15:45:16 +0200 Subject: [PATCH 07/45] Update squad.go --- cmd/pbm-agent/squad.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/pbm-agent/squad.go b/cmd/pbm-agent/squad.go index 3a217f2e9..3a2177f44 100644 --- a/cmd/pbm-agent/squad.go +++ b/cmd/pbm-agent/squad.go @@ -14,7 +14,7 @@ const perconaSquadNotice = ` /(( ,((( *### ____) | (_| | |_| | (_| | (_| | //// ((( #### |_____/ \__, |\__,_|\__,_|\__,_| /// (((( #### | | - /////////////(((((((((((((((((######## |_| Join @ percona.com/squad + /////////////(((((((((((((((((######## |_| Join @ squad.percona.com/mongodb ** Join Percona Squad! ** Participate in monthly SWAG raffles, get early access to new product features, From de1369b8d76b396ff0b63713765ebc1e93934f87 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 11 Sep 2024 14:09:17 +0200 Subject: [PATCH 08/45] [PBM-1356] split requirements and warnings on agent start --- cmd/pbm-agent/agent.go | 14 ++++++-------- cmd/pbm-agent/main.go | 2 ++ 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go index 4cfed0747..ce989caaf 100644 --- a/cmd/pbm-agent/agent.go +++ b/cmd/pbm-agent/agent.go @@ -109,16 +109,16 @@ func (a *Agent) CanStart(ctx context.Context) error { return ErrDelayedNode } - ver, err := version.GetMongoVersion(ctx, a.leadConn.MongoClient()) - if err != nil { - return errors.Wrap(err, "get mongo version") - } - if err := version.FeatureSupport(ver).PBMSupport(); err != nil { + return nil +} + +func (a *Agent) showIncompatibilityWarning(ctx context.Context) { + if err := version.FeatureSupport(a.brief.Version).PBMSupport(); err != nil { log.FromContext(ctx). Warning("", "", "", primitive.Timestamp{}, "WARNING: %v", err) } - if ver.IsShardedTimeseriesSupported() { + if a.brief.Version.IsShardedTimeseriesSupported() { tss, err := topo.ListShardedTimeseries(ctx, a.leadConn) if err != nil { log.FromContext(ctx). @@ -131,8 +131,6 @@ func (a *Agent) CanStart(ctx context.Context) error { strings.Join(tss, ", ")) } } - - return nil } // Start starts listening the commands stream. 
diff --git a/cmd/pbm-agent/main.go b/cmd/pbm-agent/main.go index 34239775f..77e66deb4 100644 --- a/cmd/pbm-agent/main.go +++ b/cmd/pbm-agent/main.go @@ -118,6 +118,8 @@ func runAgent(mongoURI string, dumpConns int) error { return errors.Wrap(err, "setup pbm collections") } + agent.showIncompatibilityWarning(ctx) + if canRunSlicer { go agent.PITR(ctx) } From 88006eb79ee81ece19be090db01f0ffea1e45332 Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Thu, 12 Sep 2024 17:58:04 +0200 Subject: [PATCH 09/45] PBM-1369: Fix never ending backup after agent is lost (#1009) * Fix endless loop when writing cluster last_write_ts * Increase polling time when writing last_write_ts on the cluster level * Add optimization to fetch all locks at once --- pbm/backup/backup.go | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index 8f20a5970..4c551150e 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -721,7 +721,37 @@ func setClusterLastWriteImpl( break } - time.Sleep(time.Second) + // before we try another time, let's check if we have lost agent + clusterTime, err := topo.GetClusterTime(ctx, conn) + if err != nil { + return errors.Wrap(err, "read cluster time") + } + + locks, err := lock.GetLocks(ctx, conn, &lock.LockHeader{ + Type: ctrl.CmdBackup, + OPID: bcp.OPID, + }) + if err != nil { + return errors.Wrap(err, "get locks") + } + + for _, replset := range bcp.Replsets { + var lck *lock.LockData + for _, l := range locks { + if l.Replset == replset.Name { + lck = &l + break + } + } + if lck == nil { + continue + } + if lck.Heartbeat.T+defs.StaleFrameSec < clusterTime.T { + return errors.Errorf("lost shard %s, last beat ts: %d", replset.Name, lck.Heartbeat.T) + } + } + + time.Sleep(10 * time.Second) } lw := bcp.Replsets[0].LastWriteTS From 07eb18d3f01cf6cf8520e51f038791ac5e0c2375 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 12 Sep 2024 13:05:43 +0200 Subject: [PATCH 10/45] PBM-823: improve delete backup files --- cmd/pbm-agent/delete.go | 2 +- pbm/backup/backup.go | 2 +- pbm/backup/delete.go | 6 +- pbm/backup/storage.go | 168 +++++++++++----------------------------- 4 files changed, 52 insertions(+), 126 deletions(-) diff --git a/cmd/pbm-agent/delete.go b/cmd/pbm-agent/delete.go index 7831e003e..c69e682c7 100644 --- a/cmd/pbm-agent/delete.go +++ b/cmd/pbm-agent/delete.go @@ -290,7 +290,7 @@ func (a *Agent) Cleanup(ctx context.Context, d *ctrl.CleanupCmd, opid ctrl.OPID, bcp := &cr.Backups[i] eg.Go(func() error { - err := backup.DeleteBackupFiles(bcp, stg) + err := backup.DeleteBackupFiles(stg, bcp.Name) return errors.Wrapf(err, "delete backup files %q", bcp.Name) }) } diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index 4c551150e..f9ed69a7d 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -306,7 +306,7 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l return } - if err := DeleteBackupFiles(bcpm, stg); err != nil { + if err := DeleteBackupFiles(stg, bcp.Name); err != nil { l.Error("Failed to delete leftover files for canceled backup %q", bcpm.Name) } }() diff --git a/pbm/backup/delete.go b/pbm/backup/delete.go index 0fa538f2b..bd5723852 100644 --- a/pbm/backup/delete.go +++ b/pbm/backup/delete.go @@ -79,7 +79,7 @@ func deleteBackupImpl(ctx context.Context, conn connect.Client, bcp *BackupMeta) return errors.Wrap(err, "get storage") } - err = DeleteBackupFiles(bcp, stg) + err = DeleteBackupFiles(stg, bcp.Name) if err != nil { return
errors.Wrap(err, "delete files from storage") } @@ -116,7 +116,7 @@ func deleteIncremetalChainImpl(ctx context.Context, conn connect.Client, bcp *Ba for i := len(all) - 1; i >= 0; i-- { bcp := all[i] - err = DeleteBackupFiles(bcp, stg) + err = DeleteBackupFiles(stg, bcp.Name) if err != nil { return errors.Wrap(err, "delete files from storage") } @@ -333,7 +333,7 @@ func DeleteBackupBefore( for i := range backups { bcp := &backups[i] - err := DeleteBackupFiles(bcp, stg) + err := DeleteBackupFiles(stg, bcp.Name) if err != nil { return errors.Wrapf(err, "delete files from storage for %q", bcp.Name) } diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go index cccd5a9d9..95e03b487 100644 --- a/pbm/backup/storage.go +++ b/pbm/backup/storage.go @@ -3,8 +3,9 @@ package backup import ( "context" "encoding/json" - "fmt" "path" + "runtime" + "sync" "golang.org/x/sync/errgroup" @@ -14,6 +15,7 @@ import ( "github.com/percona/percona-backup-mongodb/pbm/errors" "github.com/percona/percona-backup-mongodb/pbm/log" "github.com/percona/percona-backup-mongodb/pbm/storage" + sfs "github.com/percona/percona-backup-mongodb/pbm/storage/fs" "github.com/percona/percona-backup-mongodb/pbm/util" "github.com/percona/percona-backup-mongodb/pbm/version" ) @@ -189,146 +191,70 @@ func checkFile(stg storage.Storage, filename string) error { } // DeleteBackupFiles removes backup's artifacts from storage -func DeleteBackupFiles(meta *BackupMeta, stg storage.Storage) error { - switch meta.Type { - case defs.PhysicalBackup, defs.IncrementalBackup: - return deletePhysicalBackupFiles(meta, stg) - case defs.LogicalBackup: - fallthrough - default: - var err error - if version.IsLegacyArchive(meta.PBMVersion) { - err = deleteLegacyLogicalBackupFiles(meta, stg) - } else { - err = deleteLogicalBackupFiles(meta, stg) - } +func DeleteBackupFiles(stg storage.Storage, backupName string) error { + if fs, ok := stg.(*sfs.FS); ok { + return deleteBackupFromFS(fs, backupName) + } - return err + files, err := stg.List(backupName, "") + if err != nil { + return errors.Wrap(err, "list files") } -} -// DeleteBackupFiles removes backup's artifacts from storage -func deletePhysicalBackupFiles(meta *BackupMeta, stg storage.Storage) error { - if version.HasFilelistFile(meta.PBMVersion) { - for i := range meta.Replsets { - rs := &meta.Replsets[i] - if rs.Files != nil { - // it is already fetched - continue - } + parallel := runtime.NumCPU() + fileC := make(chan string, parallel) + errC := make(chan error, parallel) - filelistPath := path.Join(meta.Name, rs.Name, FilelistName) - rdr, err := stg.SourceReader(filelistPath) - if err != nil { - if errors.Is(err, storage.ErrNotExist) { - continue - } + wg := &sync.WaitGroup{} - return errors.Wrapf(err, "open %q", filelistPath) - } - defer rdr.Close() + wg.Add(parallel) + for range parallel { + go func() { + defer wg.Done() - filelist, err := ReadFilelist(rdr) - rdr.Close() - if err != nil { - return errors.Wrapf(err, "parse filelist") + for f := range fileC { + err := stg.Delete(backupName + "/" + f) + if err != nil { + errC <- errors.Wrapf(err, "delete %s", backupName+"/"+f) + } } - - rs.Files = filelist - } + }() } - for _, r := range meta.Replsets { - for _, f := range r.Files { - fname := meta.Name + "/" + r.Name + "/" + f.Name + meta.Compression.Suffix() - if f.Len != 0 { - fname += fmt.Sprintf(".%d-%d", f.Off, f.Len) - } - err := stg.Delete(fname) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrapf(err, "delete %s", fname) - } - } - for _, f := range r.Journal { - 
fname := meta.Name + "/" + r.Name + "/" + f.Name + meta.Compression.Suffix() - if f.Len != 0 { - fname += fmt.Sprintf(".%d-%d", f.Off, f.Len) - } - err := stg.Delete(fname) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrapf(err, "delete %s", fname) - } - } - if version.HasFilelistFile(meta.PBMVersion) { - err := stg.Delete(path.Join(meta.Name, r.Name, FilelistName)) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrapf(err, "delete %s", path.Join(meta.Name, r.Name, FilelistName)) - } + go func() { + for i := range files { + fileC <- files[i].Name } - } - - err := stg.Delete(meta.Name + defs.MetadataFileSuffix) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrap(err, "delete metadata file from storage") - } + close(fileC) - return nil -} + wg.Wait() + close(errC) + }() -// deleteLogicalBackupFiles removes backup's artifacts from storage -func deleteLogicalBackupFiles(meta *BackupMeta, stg storage.Storage) error { - if stg.Type() == storage.Filesystem { - return deleteLogicalBackupFilesFromFS(stg, meta.Name) + var errs []error + for err := range errC { + errs = append(errs, err) } - prefix := meta.Name + "/" - files, err := stg.List(prefix, "") - if err != nil { - return errors.Wrapf(err, "get file list: %q", prefix) - } - - eg := errgroup.Group{} - for _, f := range files { - ns := prefix + f.Name - eg.Go(func() error { - return errors.Wrapf(stg.Delete(ns), "delete %q", ns) - }) - } - if err := eg.Wait(); err != nil { - return err - } - - bcpMF := meta.Name + defs.MetadataFileSuffix - return errors.Wrapf(stg.Delete(bcpMF), "delete %q", bcpMF) -} - -// deleteLogicalBackupFiles removes backup's artifacts from storage -func deleteLogicalBackupFilesFromFS(stg storage.Storage, bcpName string) error { - if err := stg.Delete(bcpName); err != nil { - return errors.Wrapf(err, "delete %q", bcpName) + err = stg.Delete(backupName + defs.MetadataFileSuffix) + if err != nil && !errors.Is(err, storage.ErrNotExist) { + err = errors.Wrapf(err, "delete %s", backupName+defs.MetadataFileSuffix) + errs = append(errs, err) } - bcpMetafile := bcpName + defs.MetadataFileSuffix - return errors.Wrapf(stg.Delete(bcpMetafile), "delete %q", bcpMetafile) + return errors.Join(errs...) 
} -// deleteLegacyLogicalBackupFiles removes backup's artifacts from storage -func deleteLegacyLogicalBackupFiles(meta *BackupMeta, stg storage.Storage) error { - for _, r := range meta.Replsets { - err := stg.Delete(r.OplogName) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrapf(err, "delete oplog %s", r.OplogName) - } - err = stg.Delete(r.DumpName) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrapf(err, "delete dump %s", r.DumpName) - } +func deleteBackupFromFS(stg *sfs.FS, backupName string) error { + err1 := stg.Delete(backupName) + if err1 != nil && !errors.Is(err1, storage.ErrNotExist) { + err1 = errors.Wrapf(err1, "delete %s", backupName) } - err := stg.Delete(meta.Name + defs.MetadataFileSuffix) - if err != nil && !errors.Is(err, storage.ErrNotExist) { - return errors.Wrap(err, "delete metadata file from storage") + err2 := stg.Delete(backupName + defs.MetadataFileSuffix) + if err2 != nil && !errors.Is(err2, storage.ErrNotExist) { + err2 = errors.Wrapf(err2, "delete %s", backupName+defs.MetadataFileSuffix) } - return nil + return errors.Join(err1, err2) } From d7bf446a6a1d772f92a3990a23932932abb8dc47 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 12 Sep 2024 13:06:10 +0200 Subject: [PATCH 11/45] PBM-823: delete backup files on cancel-backup --- pbm/backup/backup.go | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index f9ed69a7d..0c84f6d92 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -238,7 +238,7 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l hbstop := make(chan struct{}) defer close(hbstop) - err = BackupHB(ctx, b.leadConn, bcp.Name) + err := BackupHB(ctx, b.leadConn, bcp.Name) if err != nil { return errors.Wrap(err, "init heartbeat") } @@ -250,7 +250,6 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l for { select { case <-ctx.Done(): - err = ctx.Err() return case <-tk.C: err = BackupHB(ctx, b.leadConn, bcp.Name) @@ -299,10 +298,7 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l } defer func() { - if !inf.IsLeader() { - return - } - if !errors.Is(err, storage.ErrCancelled) && !errors.Is(err, context.Canceled) { + if err == nil || !inf.IsLeader() { return } From 1503385b380431ca3a028c6d2b0d91e709f9eb1d Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 12 Sep 2024 14:41:11 +0200 Subject: [PATCH 12/45] PBM-1389: allow to run on MongoDB 8.0 --- pbm/version/version.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pbm/version/version.go b/pbm/version/version.go index cf602c8c0..9e8f658f7 100644 --- a/pbm/version/version.go +++ b/pbm/version/version.go @@ -212,11 +212,11 @@ type FeatureSupport MongoVersion func (f FeatureSupport) PBMSupport() error { v := MongoVersion(f) - if (v.Version[0] >= 5 && v.Version[0] <= 7) && v.Version[1] == 0 { + if (v.Version[0] >= 5 && v.Version[0] <= 8) && v.Version[1] == 0 { return nil } - return errors.New("Unsupported MongoDB version. PBM works with v5.0, v6.0, v7.0") + return errors.New("Unsupported MongoDB version. 
PBM works with v5.0, v6.0, v7.0, v8.0") } func (f FeatureSupport) FullPhysicalBackup() bool { From e67c983fe1334d80ac6eee542e96c91bd5f0ff6e Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 12 Sep 2024 14:41:56 +0200 Subject: [PATCH 13/45] PBM-1389: warn about sel backup/restore with ConfigShard --- cmd/pbm-agent/agent.go | 15 ++++++++++++++- pbm/topo/cluster.go | 15 +++++++++++++++ pbm/version/version.go | 4 ++++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go index ce989caaf..b03e6ff5b 100644 --- a/cmd/pbm-agent/agent.go +++ b/cmd/pbm-agent/agent.go @@ -118,7 +118,7 @@ func (a *Agent) showIncompatibilityWarning(ctx context.Context) { Warning("", "", "", primitive.Timestamp{}, "WARNING: %v", err) } - if a.brief.Version.IsShardedTimeseriesSupported() { + if a.brief.Sharded && a.brief.Version.IsShardedTimeseriesSupported() { tss, err := topo.ListShardedTimeseries(ctx, a.leadConn) if err != nil { log.FromContext(ctx). @@ -131,6 +131,19 @@ func (a *Agent) showIncompatibilityWarning(ctx context.Context) { strings.Join(tss, ", ")) } } + + if a.brief.Sharded && a.brief.Version.IsConfigShardSupported() { + hasConfigShard, err := topo.HasConfigShard(ctx, a.leadConn) + if err != nil { + log.FromContext(ctx). + Error("", "", "", primitive.Timestamp{}, + "failed to check for Config Shard: %v", err) + } else if hasConfigShard { + log.FromContext(ctx). + Warning("", "", "", primitive.Timestamp{}, + "WARNING: selective backup and restore is not supported with Config Shard") + } + } } // Start starts listening the commands stream. diff --git a/pbm/topo/cluster.go b/pbm/topo/cluster.go index 1c16a88b3..71e6f7713 100644 --- a/pbm/topo/cluster.go +++ b/pbm/topo/cluster.go @@ -171,6 +171,21 @@ func getShardMapImpl(ctx context.Context, m *mongo.Client) (map[ReplsetName]Shar return shards, nil } +// HasConfigShard return true if configsvr is listened in shards list +func HasConfigShard(ctx context.Context, conn connect.Client) (bool, error) { + err := conn.MongoClient().Database("config").Collection("shards"). + FindOne(ctx, bson.D{{"_id", "config"}}). 
+ Err() + if err == nil { + return true, nil // OK: config shard is found + } + if errors.Is(err, mongo.ErrNoDocuments) { + return false, nil // OK: config shard is not found + } + + return false, errors.Wrap(err, "query") +} + type BalancerMode string const ( diff --git a/pbm/version/version.go b/pbm/version/version.go index 9e8f658f7..a7389748b 100644 --- a/pbm/version/version.go +++ b/pbm/version/version.go @@ -193,6 +193,10 @@ func (v MongoVersion) IsShardedTimeseriesSupported() bool { return v.Version[0] >= 6 // sharded timeseries introduced in 5.1 } +func (v MongoVersion) IsConfigShardSupported() bool { + return v.Version[0] >= 8 +} + func GetMongoVersion(ctx context.Context, m *mongo.Client) (MongoVersion, error) { res := m.Database("admin").RunCommand(ctx, bson.D{{"buildInfo", 1}}) if err := res.Err(); err != nil { From 2a91e748836c12a24a56de188552341b7144a6e2 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Thu, 12 Sep 2024 14:42:22 +0200 Subject: [PATCH 14/45] PBM-1389: disallow sel backup/restore with ConfigShard --- pbm/backup/backup.go | 12 ++++++++++++ pbm/restore/logical.go | 16 +++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index 0c84f6d92..22967229d 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -166,6 +166,18 @@ func (b *Backup) Init( // //nolint:nonamedreturns func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l log.LogEvent) (err error) { + if b.brief.Sharded && + b.brief.Version.IsConfigShardSupported() && + util.IsSelective(bcp.Namespaces) { + hasConfigShard, err := topo.HasConfigShard(ctx, b.leadConn) + if err != nil { + return errors.Wrap(err, "check for Config Shard") + } + if hasConfigShard { + return errors.New("selective backup is not supported with Config Shard") + } + } + inf, err := topo.GetNodeInfoExt(ctx, b.nodeConn) if err != nil { return errors.Wrap(err, "get cluster info") diff --git a/pbm/restore/logical.go b/pbm/restore/logical.go index db9c6ea35..886a8f637 100644 --- a/pbm/restore/logical.go +++ b/pbm/restore/logical.go @@ -187,7 +187,7 @@ func (r *Restore) Snapshot( return errors.Wrap(err, "set backup name") } - err = r.checkSnapshot(ctx, bcp) + err = r.checkSnapshot(ctx, bcp, nss) if err != nil { return err } @@ -316,7 +316,7 @@ func (r *Restore) PITR( return errors.Wrap(err, "set backup name") } - err = r.checkSnapshot(ctx, bcp) + err = r.checkSnapshot(ctx, bcp, nss) if err != nil { return err } @@ -697,7 +697,7 @@ func (r *Restore) snapshotObjects(bcp *backup.BackupMeta) (string, []oplog.Oplog return rsMeta.DumpName, chunks, nil } -func (r *Restore) checkSnapshot(ctx context.Context, bcp *backup.BackupMeta) error { +func (r *Restore) checkSnapshot(ctx context.Context, bcp *backup.BackupMeta, nss []string) error { if bcp.Status != defs.StatusDone { return errors.Errorf("backup wasn't successful: status: %s, error: %s", bcp.Status, bcp.Error()) @@ -731,6 +731,16 @@ func (r *Restore) checkSnapshot(ctx context.Context, bcp *backup.BackupMeta) err bcp.MongoVersion, ver.VersionString) return nil } + + if r.brief.Sharded && ver.IsConfigShardSupported() && util.IsSelective(nss) { + hasConfigShard, err := topo.HasConfigShard(ctx, r.leadConn) + if err != nil { + return errors.Wrap(err, "check for Config Shard") + } + if hasConfigShard { + return errors.New("selective restore is not supported with Config Shard") + } + } } return nil From edc72409e71a318cd63d25f4a42044c92b753810 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 18 
Sep 2024 09:50:26 +0200 Subject: [PATCH 15/45] PBM-823: enable leftover files check for cancelled backup --- e2e-tests/pkg/tests/sharded/test_backup_cancellation.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/e2e-tests/pkg/tests/sharded/test_backup_cancellation.go b/e2e-tests/pkg/tests/sharded/test_backup_cancellation.go index 03dfc9903..0230137d5 100644 --- a/e2e-tests/pkg/tests/sharded/test_backup_cancellation.go +++ b/e2e-tests/pkg/tests/sharded/test_backup_cancellation.go @@ -28,7 +28,7 @@ func (c *Cluster) BackupCancellation(storage string) { time.Sleep(20 * time.Second) - // checkNoBackupFiles(bcpName, storage) + checkNoBackupFiles(bcpName, storage) log.Println("check backup state") m, err := c.mongopbm.GetBackupMeta(context.TODO(), bcpName) From 59969c2ed836e0ca3bc91ca5c469b336fe4957a6 Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Wed, 18 Sep 2024 10:27:12 +0200 Subject: [PATCH 16/45] PBM-1296: PBM fails on PITR restore when timeseries collection was created during oplog slicing (#1012) * Enable using recreated UUID with TimeSeries view: PBM recreates UUID for the time series collection. This fix uses the same recreated UUID when adding the view for the time series collection. --- pbm/oplog/restore.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pbm/oplog/restore.go b/pbm/oplog/restore.go index 8ff572c03..31135a2d2 100644 --- a/pbm/oplog/restore.go +++ b/pbm/oplog/restore.go @@ -87,6 +87,7 @@ var dontPreserveUUID = []string{ "admin.system.roles", "admin.system.keys", "*.system.buckets.*", // timeseries + "*.system.views", // timeseries } // OplogRestore is the oplog applyer @@ -324,7 +325,7 @@ func (o *OplogRestore) handleOp(oe db.Oplog) error { if o.cnamespase != oe.Namespace { o.preserveUUID = o.preserveUUIDopt - // if this is a create operation, the namesape would be + // if this is a create operation, the namespace would be // inside the object to create if oe.Operation == "c" { if len(oe.Object) == 0 { From ca68fd7dc1415bfe486148ffca0c5d16d94dcf5d Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 18 Sep 2024 10:09:10 +0200 Subject: [PATCH 17/45] PBM-1394: enable oplog slicer before making backup --- e2e-tests/pkg/tests/sharded/backuper.go | 2 +- e2e-tests/pkg/tests/sharded/test_oplog_replay.go | 4 ++-- e2e-tests/pkg/tests/sharded/test_pitr_basic.go | 4 ++-- e2e-tests/pkg/tests/sharded/test_timeseries.go | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/e2e-tests/pkg/tests/sharded/backuper.go b/e2e-tests/pkg/tests/sharded/backuper.go index c22f13a10..7e39967eb 100644 --- a/e2e-tests/pkg/tests/sharded/backuper.go +++ b/e2e-tests/pkg/tests/sharded/backuper.go @@ -66,9 +66,9 @@ func NewPitr(c *Cluster) *Pitr { } func (p *Pitr) Backup() { + p.c.pitrOn() bcpName := p.c.LogicalBackup() p.started <- struct{}{} - p.c.pitrOn() p.c.BackupWaitDone(context.TODO(), bcpName) p.sdone <- struct{}{} diff --git a/e2e-tests/pkg/tests/sharded/test_oplog_replay.go b/e2e-tests/pkg/tests/sharded/test_oplog_replay.go index 2f93f9892..56afd60fa 100644 --- a/e2e-tests/pkg/tests/sharded/test_oplog_replay.go +++ b/e2e-tests/pkg/tests/sharded/test_oplog_replay.go @@ -12,12 +12,12 @@ import ( ) func (c *Cluster) OplogReplay() { - bcpName := c.LogicalBackup() - c.pitrOn() log.Println("turn on PITR") defer c.pitrOff() + bcpName := c.LogicalBackup() + counters := make(map[string]shardCounter) for name, cn := range c.shards { c.bcheckClear(name, cn) diff --git a/e2e-tests/pkg/tests/sharded/test_pitr_basic.go 
b/e2e-tests/pkg/tests/sharded/test_pitr_basic.go index d427b64ba..a1904e021 100644 --- a/e2e-tests/pkg/tests/sharded/test_pitr_basic.go +++ b/e2e-tests/pkg/tests/sharded/test_pitr_basic.go @@ -15,12 +15,12 @@ import ( ) func (c *Cluster) PITRbasic() { - bcpName := c.LogicalBackup() - c.pitrOn() log.Println("turn on PITR") defer c.pitrOff() + bcpName := c.LogicalBackup() + counters := make(map[string]shardCounter) for name, cn := range c.shards { c.bcheckClear(name, cn) diff --git a/e2e-tests/pkg/tests/sharded/test_timeseries.go b/e2e-tests/pkg/tests/sharded/test_timeseries.go index b551d230a..e8c24c9b5 100644 --- a/e2e-tests/pkg/tests/sharded/test_timeseries.go +++ b/e2e-tests/pkg/tests/sharded/test_timeseries.go @@ -16,11 +16,11 @@ func (c *Cluster) Timeseries() { ts1.gen() - bcpName := c.LogicalBackup() - c.pitrOn() defer c.pitrOff() + bcpName := c.LogicalBackup() + c.BackupWaitDone(context.TODO(), bcpName) time.Sleep(time.Second) From 4e467842b7e52c2892ff69bd24dac19a231db215 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 18 Sep 2024 10:09:58 +0200 Subject: [PATCH 18/45] PBM-1394: check if delete can be run during backup --- e2e-tests/cmd/pbm-test/run.go | 4 ++-- e2e-tests/docker/pbm-agent/Dockerfile | 0 e2e-tests/pkg/tests/sharded/test_delete_backup.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) delete mode 100644 e2e-tests/docker/pbm-agent/Dockerfile diff --git a/e2e-tests/cmd/pbm-test/run.go b/e2e-tests/cmd/pbm-test/run.go index f2d517fac..6d9f7e383 100644 --- a/e2e-tests/cmd/pbm-test/run.go +++ b/e2e-tests/cmd/pbm-test/run.go @@ -70,8 +70,8 @@ func run(t *sharded.Cluster, typ testTyp) { t.SetBallastData(1e5) - runTest("Check the Running Backup can't be deleted", - t.BackupNotDeleteRunning) + runTest("Check the Cannot Run Delete During Backup", + t.CannotRunDeleteDuringBackup) runTest("Check Backup Cancellation", func() { t.BackupCancellation(storage) }) diff --git a/e2e-tests/docker/pbm-agent/Dockerfile b/e2e-tests/docker/pbm-agent/Dockerfile deleted file mode 100644 index e69de29bb..000000000 diff --git a/e2e-tests/pkg/tests/sharded/test_delete_backup.go b/e2e-tests/pkg/tests/sharded/test_delete_backup.go index 78ff7994e..84d054cbd 100644 --- a/e2e-tests/pkg/tests/sharded/test_delete_backup.go +++ b/e2e-tests/pkg/tests/sharded/test_delete_backup.go @@ -191,12 +191,12 @@ func checkArtefacts(conf string, shouldStay map[string]struct{}) { } } -func (c *Cluster) BackupNotDeleteRunning() { +func (c *Cluster) CannotRunDeleteDuringBackup() { bcpName := c.LogicalBackup() c.printBcpList() log.Println("deleting backup", bcpName) o, err := c.pbm.RunCmd("pbm", "delete-backup", "-y", bcpName) - if err == nil || !strings.Contains(err.Error(), "backup is in progress") { + if err == nil || !strings.Contains(err.Error(), "another operation in progress, Snapshot backup") { list, lerr := c.pbm.RunCmd("pbm", "list") log.Fatalf("ERROR: running backup '%s' shouldn't be deleted.\n"+ "Output: %s\nStderr:%v\nBackups list:\n%v\n%v", From d556a6dc87585a4af72174e2a57970ea9e6ab5cb Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Tue, 17 Sep 2024 14:32:46 +0200 Subject: [PATCH 19/45] PBM-1297: allow to create config.databases during oplog replay --- pbm/oplog/restore.go | 55 +++++++++++++++++++++++++++++--------------- pbm/oplog/util.go | 10 -------- 2 files changed, 37 insertions(+), 28 deletions(-) delete mode 100644 pbm/oplog/util.go diff --git a/pbm/oplog/restore.go b/pbm/oplog/restore.go index 31135a2d2..69239a0ac 100644 --- a/pbm/oplog/restore.go +++ b/pbm/oplog/restore.go @@ -14,6 
+14,7 @@ import ( "fmt" "io" "reflect" + "slices" "strings" "sync/atomic" @@ -77,8 +78,6 @@ var selectedNSSupportedCommands = map[string]struct{}{ "dropIndex": {}, "dropIndexes": {}, "collMod": {}, - "startIndexBuild": {}, - "abortIndexBuild": {}, "commitIndexBuild": {}, } @@ -258,6 +257,32 @@ func (o *OplogRestore) SetIncludeNS(nss []string) { o.includeNS = dbs } +func isOpAllowed(oe *Record) bool { + coll, ok := strings.CutPrefix(oe.Namespace, "config.") + if !ok { + return true // OK: not a "config" database. allow any ops + } + + if slices.Contains(dumprestore.ConfigCollectionsToKeep, coll) { + return true // OK: create/update/delete a doc + } + + if coll != "$cmd" || len(oe.Object) == 0 { + return false // other collection is not allowed + } + + op := oe.Object[0].Key + if op == "applyOps" { + return true // internal ops of applyOps are checked one by one later + } + if _, ok := selectedNSSupportedCommands[op]; ok { + s, _ := oe.Object[0].Value.(string) + return slices.Contains(dumprestore.ConfigCollectionsToKeep, s) + } + + return false +} + func (o *OplogRestore) isOpSelected(oe *Record) bool { if o.includeNS == nil || o.includeNS[""] != nil { return true @@ -273,11 +298,13 @@ func (o *OplogRestore) isOpSelected(oe *Record) bool { return false } - for _, el := range oe.Object { - if _, ok := selectedNSSupportedCommands[el.Key]; ok { - s, _ := el.Value.(string) - return colls[s] - } + cmd := oe.Object[0].Key + if cmd == "applyOps" { + return true // internal ops of applyOps are checked one by one later + } + if _, ok := selectedNSSupportedCommands[cmd]; ok { + s, _ := oe.Object[0].Value.(string) + return colls[s] } return false @@ -302,13 +329,7 @@ func (o *OplogRestore) handleOp(oe db.Oplog) error { return nil } - if db, coll, _ := strings.Cut(oe.Namespace, "."); db == "config" { - if !sliceContains(dumprestore.ConfigCollectionsToKeep, coll) { - return nil - } - } - - if !o.isOpSelected(&oe) { + if !isOpAllowed(&oe) || !o.isOpSelected(&oe) { return nil } @@ -631,10 +652,8 @@ func (o *OplogRestore) handleNonTxnOp(op db.Oplog) error { return nil } - if db, coll, _ := strings.Cut(op.Namespace, "."); db == "config" { - if !sliceContains(dumprestore.ConfigCollectionsToKeep, coll) { - return nil - } + if !isOpAllowed(&op) || !o.isOpSelected(&op) { + return nil } op, err := o.filterUUIDs(op) diff --git a/pbm/oplog/util.go b/pbm/oplog/util.go deleted file mode 100644 index b592c06db..000000000 --- a/pbm/oplog/util.go +++ /dev/null @@ -1,10 +0,0 @@ -package oplog - -func sliceContains[S ~[]E, E comparable](s S, v E) bool { - for i := range s { - if v == s[i] { - return true - } - } - return false -} From 36e98fffcbfee46aa473a191afe7624ba06fd8b6 Mon Sep 17 00:00:00 2001 From: Alexey Torkhov Date: Tue, 28 May 2024 12:29:33 +0300 Subject: [PATCH 20/45] Use RemoteCredsProvider from aws default stack This change allows retrieving credentials from ECS and EKS roles.
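For illustration, a minimal sketch of the resulting provider chain (hypothetical values; not part of this patch). The SDK's RemoteCredProvider resolves the ECS/EKS container credentials endpoint when AWS_CONTAINER_CREDENTIALS_RELATIVE_URI or AWS_CONTAINER_CREDENTIALS_FULL_URI is set, and falls back to the EC2 instance metadata service otherwise:

package main

import (
	"fmt"

	"github.com/aws/aws-sdk-go/aws"
	"github.com/aws/aws-sdk-go/aws/credentials"
	"github.com/aws/aws-sdk-go/aws/defaults"
)

func main() {
	cfg := aws.Config{Region: aws.String("us-east-1")} // hypothetical region

	providers := []credentials.Provider{
		// explicit keys from the storage config, if any, stay first in the chain
		&credentials.StaticProvider{Value: credentials.Value{
			AccessKeyID:     "EXAMPLE-KEY",    // hypothetical
			SecretAccessKey: "example-secret", // hypothetical
		}},
		// remote creds (ECS/EKS task role or EC2 instance role) as the fallback
		defaults.RemoteCredProvider(cfg, defaults.Handlers()),
	}

	creds := credentials.NewChainCredentials(providers)
	v, err := creds.Get() // the first provider that succeeds wins
	fmt.Println(v.ProviderName, err)
}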
--- pbm/storage/s3/s3.go | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/pbm/storage/s3/s3.go b/pbm/storage/s3/s3.go index 4ce28b5e6..1bd7cea93 100644 --- a/pbm/storage/s3/s3.go +++ b/pbm/storage/s3/s3.go @@ -18,9 +18,8 @@ import ( "github.com/aws/aws-sdk-go/aws/awserr" "github.com/aws/aws-sdk-go/aws/client" "github.com/aws/aws-sdk-go/aws/credentials" - "github.com/aws/aws-sdk-go/aws/credentials/ec2rolecreds" "github.com/aws/aws-sdk-go/aws/credentials/stscreds" - "github.com/aws/aws-sdk-go/aws/ec2metadata" + "github.com/aws/aws-sdk-go/aws/defaults" "github.com/aws/aws-sdk-go/aws/request" "github.com/aws/aws-sdk-go/aws/session" "github.com/aws/aws-sdk-go/service/s3" @@ -574,10 +573,6 @@ func (s *S3) session() (*session.Session, error) { )) } - providers = append(providers, &ec2rolecreds.EC2RoleProvider{ - Client: ec2metadata.New(awsSession), - }) - httpClient := &http.Client{} if s.opts.InsecureSkipTLSVerify { httpClient = &http.Client{ @@ -587,15 +582,21 @@ func (s *S3) session() (*session.Session, error) { } } - return session.NewSession(&aws.Config{ + cfg := &aws.Config{ Region: aws.String(s.opts.Region), Endpoint: aws.String(s.opts.EndpointURL), - Credentials: credentials.NewChainCredentials(providers), S3ForcePathStyle: s.opts.ForcePathStyle, HTTPClient: httpClient, LogLevel: aws.LogLevel(SDKLogLevel(s.opts.DebugLogLevels, nil)), Logger: awsLogger(s.log), - }) + } + + // fetch credentials from remote endpoints like EC2 or ECS roles + providers = append(providers, defaults.RemoteCredProvider(*cfg, defaults.Handlers())) + + cfg.Credentials = credentials.NewChainCredentials(providers) + + return session.NewSession(cfg) } func awsLogger(l log.LogEvent) aws.Logger { From 6e0981ce8b0f2c589a8b63e4b49f368038d8d1f5 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Mon, 23 Sep 2024 12:50:20 +0200 Subject: [PATCH 21/45] PBM-1387: return empty if no path is set (#1018) --- cmd/pbm/config.go | 3 +++ pbm/config/config.go | 2 ++ 2 files changed, 5 insertions(+) diff --git a/cmd/pbm/config.go b/cmd/pbm/config.go index 72c604b2d..722daaa36 100644 --- a/cmd/pbm/config.go +++ b/cmd/pbm/config.go @@ -84,6 +84,9 @@ func runConfig( case len(c.key) > 0: k, err := config.GetConfigVar(ctx, conn, c.key) if err != nil { + if errors.Is(err, config.ErrUnsetConfigPath) { + return confKV{c.key, ""}, nil // unset config path + } return nil, errors.Wrap(err, "unable to get config key") } return confKV{c.key, fmt.Sprint(k)}, nil diff --git a/pbm/config/config.go b/pbm/config/config.go index 550911f6f..180e204de 100644 --- a/pbm/config/config.go +++ b/pbm/config/config.go @@ -15,6 +15,7 @@ import ( "go.mongodb.org/mongo-driver/bson/primitive" "go.mongodb.org/mongo-driver/mongo" "go.mongodb.org/mongo-driver/mongo/options" + "go.mongodb.org/mongo-driver/x/bsonx/bsoncore" "gopkg.in/yaml.v2" "github.com/percona/percona-backup-mongodb/pbm/compress" @@ -32,6 +33,7 @@ var ( ErrUnkownStorageType = errors.New("unknown storage type") ErrMissedConfig = errors.New("missed config") ErrMissedConfigProfile = errors.New("missed config profile") + ErrUnsetConfigPath = bsoncore.ErrElementNotFound ) type confMap map[string]reflect.Kind From 827d30dd77771216bc1d2c77e16e5742409a368c Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 18 Sep 2024 15:58:58 +0200 Subject: [PATCH 22/45] PBM-1229: skip waitForStatus on the backup leader the leader only sets the status.
no reason to wait for known condition --- pbm/backup/backup.go | 22 ++++++++++------------ pbm/backup/logical.go | 24 +++++++++++++----------- pbm/backup/physical.go | 12 ++++++------ 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index 22967229d..3adb0450f 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -348,14 +348,12 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l } err = writeMeta(stg, bcpm) - if err != nil { - return errors.Wrap(err, "dump metadata") - } + return errors.Wrap(err, "dump metadata") + } else { + // to be sure the locks released only after the "done" status had written + err = b.waitForStatus(ctx, bcp.Name, defs.StatusDone, nil) + return errors.Wrap(err, "waiting for done") } - - // to be sure the locks released only after the "done" status had written - err = b.waitForStatus(ctx, bcp.Name, defs.StatusDone, nil) - return errors.Wrap(err, "waiting for done") } func waitForBalancerOff(ctx context.Context, conn connect.Client, t time.Duration, l log.LogEvent) topo.BalancerMode { @@ -412,11 +410,11 @@ func (b *Backup) toState( } return errors.Wrapf(err, "check cluster for backup `%s`", status) } - } - - err = b.waitForStatus(ctx, bcp, status, wait) - if err != nil { - return errors.Wrapf(err, "waiting for %s", status) + } else { + err = b.waitForStatus(ctx, bcp, status, wait) + if err != nil { + return errors.Wrapf(err, "waiting for %s", status) + } } return nil diff --git a/pbm/backup/logical.go b/pbm/backup/logical.go index aad0ff1ce..858fdbe44 100644 --- a/pbm/backup/logical.go +++ b/pbm/backup/logical.go @@ -81,16 +81,18 @@ func (b *Backup) doLogical( return errors.Wrap(err, "check cluster for backup started") } + // TODO(improve): do setClusterFirstWrite between + // all replsets status are StatusRunning and setting the global status err = b.setClusterFirstWrite(ctx, bcp.Name) if err != nil { return errors.Wrap(err, "set cluster first write ts") } - } - - // Waiting for cluster's StatusRunning to move further. - err = b.waitForStatus(ctx, bcp.Name, defs.StatusRunning, nil) - if err != nil { - return errors.Wrap(err, "waiting for running") + } else { + // Waiting for cluster's StatusRunning to move further. + err = b.waitForStatus(ctx, bcp.Name, defs.StatusRunning, nil) + if err != nil { + return errors.Wrap(err, "waiting for running") + } } stopOplogSlicer := startOplogSlicer(ctx, @@ -195,11 +197,11 @@ func (b *Backup) doLogical( if err != nil { return errors.Wrap(err, "check cluster for dump done") } - } - - err = b.waitForStatus(ctx, bcp.Name, defs.StatusDumpDone, nil) - if err != nil { - return errors.Wrap(err, "waiting for dump done") + } else { + err = b.waitForStatus(ctx, bcp.Name, defs.StatusDumpDone, nil) + if err != nil { + return errors.Wrap(err, "waiting for dump done") + } } lastSavedTS, oplogSize, err := stopOplogSlicer() diff --git a/pbm/backup/physical.go b/pbm/backup/physical.go index 3865ea934..5ddf4feb5 100644 --- a/pbm/backup/physical.go +++ b/pbm/backup/physical.go @@ -330,12 +330,12 @@ func (b *Backup) doPhysical( if err != nil { return errors.Wrap(err, "set cluster last write ts") } - } - - // Waiting for cluster's StatusRunning to move further. - err = b.waitForStatus(ctx, bcp.Name, defs.StatusRunning, nil) - if err != nil { - return errors.Wrap(err, "waiting for running") + } else { + // Waiting for cluster's StatusRunning to move further. 
+ err = b.waitForStatus(ctx, bcp.Name, defs.StatusRunning, nil) + if err != nil { + return errors.Wrap(err, "waiting for running") + } } _, lwTS, err := b.waitForFirstLastWrite(ctx, bcp.Name) From 0430086b6f31837bb9ec24e1bba60a4d469ac3fe Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 18 Sep 2024 17:58:31 +0200 Subject: [PATCH 23/45] PBM-1114: save metadata file before backup done backup status can be set to done in db meta before the meta is written to storage. if the write fails, no meta will be available on storage. storage resync will delete backup meta from db and won't see it on storage. --- pbm/backup/backup.go | 60 ++++++++++++++++++++++++++++---------------- pbm/backup/query.go | 27 +++++++++++++++++--- 2 files changed, 62 insertions(+), 25 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index 3adb0450f..cc046b993 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -337,9 +337,15 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l } if inf.IsLeader() { - err = b.reconcileStatus(ctx, bcp.Name, opid.String(), defs.StatusDone, nil) + shards, err := topo.ClusterMembers(ctx, b.leadConn.MongoClient()) if err != nil { - return errors.Wrap(err, "check cluster for backup done") + return errors.Wrap(err, "check cluster for backup done: get cluster members") + } + + err = b.convergeCluster(ctx, bcp.Name, opid.String(), shards, defs.StatusDone) + err = errors.Wrap(err, "check cluster for backup done: convergeCluster") + if err != nil { + return err } bcpm, err = NewDBManager(b.leadConn).GetBackupByName(ctx, bcp.Name) @@ -347,8 +353,23 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l return errors.Wrap(err, "get backup metadata") } + // PBM-1114: update file metadata with the same values as in database + unix := time.Now().Unix() + bcpm.Status = defs.StatusDone + bcpm.LastTransitionTS = unix + bcpm.Conditions = append(bcpm.Conditions, Condition{ + Timestamp: unix, + Status: defs.StatusDone, + }) + err = writeMeta(stg, bcpm) - return errors.Wrap(err, "dump metadata") + if err != nil { + return errors.Wrap(err, "dump metadata") + } + + err = ChangeBackupStateWithUnix(b.leadConn, bcp.Name, defs.StatusDone, unix, "") + return errors.Wrapf(err, "check cluster for backup done: update backup meta with %s", + defs.StatusDone) } else { // to be sure the locks released only after the "done" status had written err = b.waitForStatus(ctx, bcp.Name, defs.StatusDone, nil) @@ -432,14 +453,18 @@ func (b *Backup) reconcileStatus( } if timeout != nil { - return errors.Wrap( - b.convergeClusterWithTimeout(ctx, bcpName, opid, shards, status, *timeout), - "convergeClusterWithTimeout") + err = b.convergeClusterWithTimeout(ctx, bcpName, opid, shards, status, *timeout) + err = errors.Wrap(err, "convergeClusterWithTimeout") + } else { + err = b.convergeCluster(ctx, bcpName, opid, shards, status) + err = errors.Wrap(err, "convergeCluster") + } + if err != nil { + return err } - return errors.Wrap( - b.convergeCluster(ctx, bcpName, opid, shards, status), - "convergeCluster") + err = ChangeBackupState(b.leadConn, bcpName, status, "") + return errors.Wrapf(err, "update backup meta with %s", status) } // convergeCluster waits until all given shards reached `status` and updates a cluster status @@ -480,10 +505,11 @@ func (b *Backup) convergeClusterWithTimeout( status defs.Status, t time.Duration, ) error { - tk := time.NewTicker(time.Second * 1) + tk := time.NewTicker(time.Second) defer tk.Stop() - tout := 
time.After(t) + tout := time.NewTimer(t) + defer tout.Stop() for { select { @@ -495,7 +521,7 @@ func (b *Backup) convergeClusterWithTimeout( if ok { return nil } - case <-tout: + case <-tout.C: return errors.Wrap(errConvergeTimeOut, t.String()) case <-ctx.Done(): return ctx.Err() @@ -554,15 +580,7 @@ func (b *Backup) converged( } } - if shardsToFinish == 0 { - err := ChangeBackupState(b.leadConn, bcpName, status, "") - if err != nil { - return false, errors.Wrapf(err, "update backup meta with %s", status) - } - return true, nil - } - - return false, nil + return shardsToFinish == 0, nil } func (b *Backup) waitForStatus( diff --git a/pbm/backup/query.go b/pbm/backup/query.go index a69f25118..a08ab9a45 100644 --- a/pbm/backup/query.go +++ b/pbm/backup/query.go @@ -81,15 +81,34 @@ func getBackupMeta(ctx context.Context, conn connect.Client, clause bson.D) (*Ba } func ChangeBackupStateOPID(conn connect.Client, opid string, s defs.Status, msg string) error { - return changeBackupState(context.Background(), conn, bson.D{{"opid", opid}}, s, msg) + return changeBackupState(context.TODO(), + conn, bson.D{{"opid", opid}}, time.Now().UTC().Unix(), s, msg) } func ChangeBackupState(conn connect.Client, bcpName string, s defs.Status, msg string) error { - return changeBackupState(context.Background(), conn, bson.D{{"name", bcpName}}, s, msg) + return changeBackupState(context.TODO(), + conn, bson.D{{"name", bcpName}}, time.Now().UTC().Unix(), s, msg) } -func changeBackupState(ctx context.Context, conn connect.Client, clause bson.D, s defs.Status, msg string) error { - ts := time.Now().UTC().Unix() +func ChangeBackupStateWithUnix( + conn connect.Client, + bcpName string, + s defs.Status, + unix int64, + msg string, +) error { + return changeBackupState(context.TODO(), + conn, bson.D{{"name", bcpName}}, time.Now().UTC().Unix(), s, msg) +} + +func changeBackupState( + ctx context.Context, + conn connect.Client, + clause bson.D, + ts int64, + s defs.Status, + msg string, +) error { _, err := conn.BcpCollection().UpdateOne( ctx, clause, From 0d82ec46be33958bd4ca3a98ff353b5b9df6209d Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Fri, 20 Sep 2024 10:33:53 +0200 Subject: [PATCH 24/45] PBM-1114: drop unused code --- pbm/backup/storage.go | 89 ------------------------------------------- 1 file changed, 89 deletions(-) diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go index 95e03b487..670b72e6b 100644 --- a/pbm/backup/storage.go +++ b/pbm/backup/storage.go @@ -2,7 +2,6 @@ package backup import ( "context" - "encoding/json" "path" "runtime" "sync" @@ -10,101 +9,13 @@ import ( "golang.org/x/sync/errgroup" "github.com/percona/percona-backup-mongodb/pbm/archive" - "github.com/percona/percona-backup-mongodb/pbm/config" "github.com/percona/percona-backup-mongodb/pbm/defs" "github.com/percona/percona-backup-mongodb/pbm/errors" - "github.com/percona/percona-backup-mongodb/pbm/log" "github.com/percona/percona-backup-mongodb/pbm/storage" sfs "github.com/percona/percona-backup-mongodb/pbm/storage/fs" - "github.com/percona/percona-backup-mongodb/pbm/util" "github.com/percona/percona-backup-mongodb/pbm/version" ) -type StorageManager interface { - GetAllBackups(ctx context.Context) ([]BackupMeta, error) - GetBackupByName(ctx context.Context, name string) (*BackupMeta, error) -} - -type storageManagerImpl struct { - cfg *config.StorageConf - stg storage.Storage -} - -func NewStorageManager(ctx context.Context, cfg *config.StorageConf) (*storageManagerImpl, error) { - stg, err := util.StorageFromConfig(cfg, 
log.LogEventFromContext(ctx)) - if err != nil { - return nil, errors.Wrap(err, "unable to get backup store") - } - - _, err = stg.FileStat(defs.StorInitFile) - if !errors.Is(err, storage.ErrNotExist) { - return nil, err - } - - return &storageManagerImpl{cfg: cfg, stg: stg}, nil -} - -func (m *storageManagerImpl) GetAllBackups(ctx context.Context) ([]BackupMeta, error) { - l := log.LogEventFromContext(ctx) - - bcpList, err := m.stg.List("", defs.MetadataFileSuffix) - if err != nil { - return nil, errors.Wrap(err, "get a backups list from the storage") - } - l.Debug("got backups list: %v", len(bcpList)) - - var rv []BackupMeta - for _, b := range bcpList { - l.Debug("bcp: %v", b.Name) - - d, err := m.stg.SourceReader(b.Name) - if err != nil { - return nil, errors.Wrapf(err, "read meta for %v", b.Name) - } - - v := BackupMeta{} - err = json.NewDecoder(d).Decode(&v) - d.Close() - if err != nil { - return nil, errors.Wrapf(err, "unmarshal backup meta [%s]", b.Name) - } - - err = CheckBackupFiles(ctx, &v, m.stg) - if err != nil { - l.Warning("skip snapshot %s: %v", v.Name, err) - v.Status = defs.StatusError - v.Err = err.Error() - } - rv = append(rv, v) - } - - return rv, nil -} - -func (m *storageManagerImpl) GetBackupByName(ctx context.Context, name string) (*BackupMeta, error) { - l := log.LogEventFromContext(ctx) - l.Debug("get backup by name: %v", name) - - rdr, err := m.stg.SourceReader(name + defs.MetadataFileSuffix) - if err != nil { - return nil, errors.Wrapf(err, "read meta for %v", name) - } - defer rdr.Close() - - v := &BackupMeta{} - if err := json.NewDecoder(rdr).Decode(&v); err != nil { - return nil, errors.Wrapf(err, "unmarshal backup meta [%s]", name) - } - - if err := CheckBackupFiles(ctx, v, m.stg); err != nil { - l.Warning("no backup files %s: %v", v.Name, err) - v.Status = defs.StatusError - v.Err = err.Error() - } - - return v, nil -} - func CheckBackupFiles(ctx context.Context, bcp *BackupMeta, stg storage.Storage) error { // !!! TODO: Check physical files ? if bcp.Type != defs.LogicalBackup { From 3fac1ccecf0c084bda72644d5cb6543efe675ff9 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Fri, 20 Sep 2024 10:35:14 +0200 Subject: [PATCH 25/45] PBM-1114: check backup files before done --- pbm/backup/backup.go | 5 +++++ pbm/backup/storage.go | 46 +++++++++++++++++++++++++++++++++++++++---- pbm/resync/rsync.go | 15 +++----------- 3 files changed, 50 insertions(+), 16 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index cc046b993..f96d143c4 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -367,6 +367,11 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l return errors.Wrap(err, "dump metadata") } + err = CheckBackupFiles(ctx, stg, bcp.Name) + if err != nil { + return errors.Wrap(err, "check backup files") + } + err = ChangeBackupStateWithUnix(b.leadConn, bcp.Name, defs.StatusDone, unix, "") return errors.Wrapf(err, "check cluster for backup done: update backup meta with %s", defs.StatusDone) diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go index 670b72e6b..bbe4bd2eb 100644 --- a/pbm/backup/storage.go +++ b/pbm/backup/storage.go @@ -2,6 +2,7 @@ package backup import ( "context" + "encoding/json" "path" "runtime" "sync" @@ -16,12 +17,45 @@ import ( "github.com/percona/percona-backup-mongodb/pbm/version" ) -func CheckBackupFiles(ctx context.Context, bcp *BackupMeta, stg storage.Storage) error { - // !!! TODO: Check physical files ? 
- if bcp.Type != defs.LogicalBackup { - return nil +func CheckBackupFiles(ctx context.Context, stg storage.Storage, name string) error { + bcp, err := ReadMetadata(stg, name+defs.MetadataFileSuffix) + if err != nil { + return errors.Wrap(err, "read backup metadata") + } + + return CheckBackupDataFiles(ctx, stg, bcp) +} + +func ReadMetadata(stg storage.Storage, filename string) (*BackupMeta, error) { + rdr, err := stg.SourceReader(filename) + if err != nil { + return nil, errors.Wrap(err, "open") + } + defer rdr.Close() + + var meta *BackupMeta + err = json.NewDecoder(rdr).Decode(&meta) + if err != nil { + return nil, errors.Wrap(err, "decode") } + return meta, nil +} + +func CheckBackupDataFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { + switch bcp.Type { + case defs.LogicalBackup: + return checkLogicalBackupFiles(ctx, stg, bcp) + case defs.PhysicalBackup, defs.IncrementalBackup: + return checkPhysicalBackupFiles(ctx, stg, bcp) + case defs.ExternalBackup: + return nil // no files available + } + + return errors.Errorf("unknown backup type %s", bcp.Type) +} + +func checkLogicalBackupFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { legacy := version.IsLegacyArchive(bcp.PBMVersion) eg, _ := errgroup.WithContext(ctx) for _, rs := range bcp.Replsets { @@ -74,6 +108,10 @@ func CheckBackupFiles(ctx context.Context, bcp *BackupMeta, stg storage.Storage) return eg.Wait() } +func checkPhysicalBackupFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { + return nil +} + func ReadArchiveNamespaces(stg storage.Storage, metafile string) ([]*archive.Namespace, error) { r, err := stg.SourceReader(metafile) if err != nil { diff --git a/pbm/resync/rsync.go b/pbm/resync/rsync.go index 9fffc84aa..64372ee1e 100644 --- a/pbm/resync/rsync.go +++ b/pbm/resync/rsync.go @@ -2,7 +2,6 @@ package resync import ( "context" - "encoding/json" "runtime" "strings" "sync" @@ -293,21 +292,13 @@ func getAllBackupMetaFromStorage( backupMeta := make([]*backup.BackupMeta, 0, len(backupFiles)) for _, b := range backupFiles { - d, err := stg.SourceReader(b.Name) + meta, err := backup.ReadMetadata(stg, b.Name) if err != nil { - l.Error("read meta for %v", b.Name) + l.Error("read metadata of backup %s: %v", b.Name, err) continue } - var meta *backup.BackupMeta - err = json.NewDecoder(d).Decode(&meta) - d.Close() - if err != nil { - l.Error("unmarshal backup meta [%s]", b.Name) - continue - } - - err = backup.CheckBackupFiles(ctx, meta, stg) + err = backup.CheckBackupDataFiles(ctx, stg, meta) if err != nil { l.Warning("skip snapshot %s: %v", meta.Name, err) meta.Status = defs.StatusError From c91f62a895cd936a1a3494e01fab202195dbeb79 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Tue, 24 Sep 2024 13:26:09 +0200 Subject: [PATCH 26/45] PBM-1114: after review edits --- pbm/backup/backup.go | 2 +- pbm/backup/query.go | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pbm/backup/backup.go b/pbm/backup/backup.go index f96d143c4..7d289a3e5 100644 --- a/pbm/backup/backup.go +++ b/pbm/backup/backup.go @@ -372,7 +372,7 @@ func (b *Backup) Run(ctx context.Context, bcp *ctrl.BackupCmd, opid ctrl.OPID, l return errors.Wrap(err, "check backup files") } - err = ChangeBackupStateWithUnix(b.leadConn, bcp.Name, defs.StatusDone, unix, "") + err = ChangeBackupStateWithUnixTime(ctx, b.leadConn, bcp.Name, defs.StatusDone, unix, "") return errors.Wrapf(err, "check cluster for backup done: update backup meta with %s", defs.StatusDone) } else { diff --git 
a/pbm/backup/query.go b/pbm/backup/query.go index a08ab9a45..a38c1f895 100644 --- a/pbm/backup/query.go +++ b/pbm/backup/query.go @@ -90,15 +90,15 @@ func ChangeBackupState(conn connect.Client, bcpName string, s defs.Status, msg s conn, bson.D{{"name", bcpName}}, time.Now().UTC().Unix(), s, msg) } -func ChangeBackupStateWithUnix( +func ChangeBackupStateWithUnixTime( + ctx context.Context, conn connect.Client, bcpName string, s defs.Status, unix int64, msg string, ) error { - return changeBackupState(context.TODO(), - conn, bson.D{{"name", bcpName}}, time.Now().UTC().Unix(), s, msg) + return changeBackupState(ctx, conn, bson.D{{"name", bcpName}}, unix, s, msg) } func changeBackupState( From 00f395bead2204ef22d211f68a9440dc73daf76c Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Mon, 23 Sep 2024 18:00:54 +0200 Subject: [PATCH 27/45] PBM-886: sync channel set/unset/close ops --- pbm/oplog/backup.go | 10 ++++++++++ pbm/storage/storage.go | 2 +- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/pbm/oplog/backup.go b/pbm/oplog/backup.go index a8559229e..0ac359af9 100644 --- a/pbm/oplog/backup.go +++ b/pbm/oplog/backup.go @@ -4,6 +4,7 @@ import ( "context" "fmt" "io" + "sync" "time" "go.mongodb.org/mongo-driver/bson" @@ -31,6 +32,7 @@ func (t Timeline) String() string { // OplogBackup is used for reading the Mongodb oplog type OplogBackup struct { cl *mongo.Client + mu sync.Mutex stopC chan struct{} start primitive.Timestamp end primitive.Timestamp @@ -68,7 +70,10 @@ func (ot *OplogBackup) WriteTo(w io.Writer) (int64, error) { return 0, errors.Errorf("oplog TailingSpan should be set, have start: %v, end: %v", ot.start, ot.end) } + ot.mu.Lock() ot.stopC = make(chan struct{}) + ot.mu.Unlock() + ctx, cancel := context.WithCancel(context.Background()) defer cancel() @@ -79,7 +84,9 @@ func (ot *OplogBackup) WriteTo(w io.Writer) (int64, error) { cancel() } + ot.mu.Lock() ot.stopC = nil + ot.mu.Unlock() }() cur, err := ot.cl.Database("local").Collection("oplog.rs").Find(ctx, @@ -145,6 +152,9 @@ func (ot *OplogBackup) WriteTo(w io.Writer) (int64, error) { } func (ot *OplogBackup) Cancel() { + ot.mu.Lock() + defer ot.mu.Unlock() + if c := ot.stopC; c != nil { select { case _, ok := <-c: diff --git a/pbm/storage/storage.go b/pbm/storage/storage.go index 228d6f8f9..a67979e6a 100644 --- a/pbm/storage/storage.go +++ b/pbm/storage/storage.go @@ -214,7 +214,7 @@ func Upload( err := r.Close() if err != nil { - return 0, errors.Wrap(err, "cancel backup: close reader") + return 0, errors.Wrap(err, "cancel upload: close reader") } return 0, ErrCancelled case <-saveDone: From 489ac3ddaf402e828c971be9ecd90a93a35312b6 Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Thu, 26 Sep 2024 17:11:51 +0200 Subject: [PATCH 28/45] PBM-1391: Enabling PITR after physical restore causes PSMDB to crash (#1019) * Fix inconsistent data after physical restore with PITR After PITR physical restore, there was inconsistent data between Primary and Secondary nodes. The reason was that PITR oplog and dropping collections are applied in reverse order: - on Primary: [PITR oplog apply] -> [dropping PBM databases] - on each Secondary: [dropping PBM databases] -> [catch up from Primary including oplog apply] Not using DDL operations (drop in this case) for PBM's system collections fixes the problem. * Expand setupNewDB logic with all PBM's collections That ensures the collection will not be created during PITR, and by doing that we eliminate the possible problem of having different UUIDs.
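As a rough sketch of the idea (hypothetical helper, not part of the patch): emptying a collection with a CRUD operation keeps the collection and its UUID intact when secondaries later catch up via the oplog, whereas Drop is a DDL operation:

package cleanup

import (
	"context"

	"go.mongodb.org/mongo-driver/bson"
	"go.mongodb.org/mongo-driver/mongo"
)

// cleanUpCollection removes all documents but, unlike Collection.Drop,
// leaves the collection itself (and its UUID) in place.
func cleanUpCollection(ctx context.Context, c *mongo.Client, db, coll string) error {
	_, err := c.Database(db).Collection(coll).DeleteMany(ctx, bson.D{})
	return err
}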
--- cmd/pbm-agent/setup.go | 29 ++++++++++++++++++++++++++++- pbm/restore/physical.go | 8 ++++---- 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/cmd/pbm-agent/setup.go b/cmd/pbm-agent/setup.go index f3450bc7a..c2637ddbc 100644 --- a/cmd/pbm-agent/setup.go +++ b/cmd/pbm-agent/setup.go @@ -110,6 +110,14 @@ func setupNewDB(ctx context.Context, conn connect.Client) error { return errors.Wrap(err, "ensure pitr chunks index") } + err = conn.AdminCommand( + ctx, + bson.D{{"create", defs.PITRCollection}}, + ).Err() + if err != nil && !strings.Contains(err.Error(), "already exists") { + return errors.Wrap(err, "ensure pitr collection") + } + _, err = conn.BcpCollection().Indexes().CreateMany( ctx, []mongo.IndexModel{ @@ -124,6 +132,25 @@ func setupNewDB(ctx context.Context, conn connect.Client) error { }, }, ) + if err != nil && !strings.Contains(err.Error(), "already exists") { + return errors.Wrap(err, "ensure backup collection index") + } + + err = conn.AdminCommand( + ctx, + bson.D{{"create", defs.RestoresCollection}}, + ).Err() + if err != nil && !strings.Contains(err.Error(), "already exists") { + return errors.Wrap(err, "ensure restore collection") + } + + err = conn.AdminCommand( + ctx, + bson.D{{"create", defs.AgentsStatusCollection}}, + ).Err() + if err != nil && !strings.Contains(err.Error(), "already exists") { + return errors.Wrap(err, "ensure agent status collection") + } - return err + return nil } diff --git a/pbm/restore/physical.go b/pbm/restore/physical.go index 8d00016a8..9dfae5160 100644 --- a/pbm/restore/physical.go +++ b/pbm/restore/physical.go @@ -1524,13 +1524,13 @@ func (r *PhysRestore) resetRS() error { return errors.Wrap(err, "turn off pitr") } - r.dropPBMCollections(ctx, c) + r.cleanUpPBMCollections(ctx, c) } return r.shutdown(c) } -func (r *PhysRestore) dropPBMCollections(ctx context.Context, c *mongo.Client) { +func (r *PhysRestore) cleanUpPBMCollections(ctx context.Context, c *mongo.Client) { pbmCollections := []string{ defs.LockCollection, defs.LogCollection, @@ -1554,9 +1554,9 @@ func (r *PhysRestore) dropPBMCollections(ctx context.Context, c *mongo.Client) { defer wg.Done() r.log.Debug("dropping 'admin.%s'", coll) - err := c.Database(defs.DB).Collection(coll).Drop(ctx) + _, err := c.Database(defs.DB).Collection(coll).DeleteMany(ctx, bson.D{}) if err != nil { - r.log.Warning("failed to drop 'admin.%s': %v", coll, err) + r.log.Warning("failed to delete all from 'admin.%s': %v", coll, err) } }() } From d933b423aef71555ed8c537ec9342dc8fc0bfe0e Mon Sep 17 00:00:00 2001 From: Boris Ilijic Date: Fri, 27 Sep 2024 11:33:41 +0200 Subject: [PATCH 29/45] Log correlation between Backup Name and OPID (#1027) --- cmd/pbm-agent/agent.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go index b03e6ff5b..3f6b3d338 100644 --- a/cmd/pbm-agent/agent.go +++ b/cmd/pbm-agent/agent.go @@ -167,7 +167,7 @@ func (a *Agent) Start(ctx context.Context) error { return nil } - logger.Printf("got command %s", cmd) + logger.Printf("got command %s, opid: %s", cmd, cmd.OPID) ep, err := config.GetEpoch(ctx, a.leadConn) if err != nil { From e66f215c6a39e663da2fe87cdc2c6e110fad4c36 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 25 Sep 2024 23:21:15 +0200 Subject: [PATCH 30/45] PBM-1312: set default num-parallel-collections for restore to (cpu/2) --- cmd/pbm-agent/restore.go | 3 ++- pbm/restore/logical.go | 2 +- pbm/snapshot/restore.go | 6 +++++- 3 files changed, 8 insertions(+), 3 deletions(-) diff --git
a/cmd/pbm-agent/restore.go b/cmd/pbm-agent/restore.go index 1d76153a0..77510b687 100644 --- a/cmd/pbm-agent/restore.go +++ b/cmd/pbm-agent/restore.go @@ -2,6 +2,7 @@ package main import ( "context" + "runtime" "time" "github.com/percona/percona-backup-mongodb/pbm/backup" @@ -114,7 +115,7 @@ func (a *Agent) Restore(ctx context.Context, r *ctrl.RestoreCmd, opid ctrl.OPID, return } - var numParallelColls int + numParallelColls := runtime.NumCPU() / 2 if r.NumParallelColls != nil && *r.NumParallelColls > 0 { numParallelColls = int(*r.NumParallelColls) } diff --git a/pbm/restore/logical.go b/pbm/restore/logical.go index 886a8f637..a19ee84af 100644 --- a/pbm/restore/logical.go +++ b/pbm/restore/logical.go @@ -1193,7 +1193,7 @@ func (r *Restore) snapshot(ctx context.Context, input io.Reader) error { return errors.Wrap(err, "unable to get PBM config settings") } - rf, err := snapshot.NewRestore(r.brief.URI, cfg) + rf, err := snapshot.NewRestore(r.brief.URI, cfg, r.numParallelColls) if err != nil { return err } diff --git a/pbm/snapshot/restore.go b/pbm/snapshot/restore.go index cdccfc26a..04c15b36f 100644 --- a/pbm/snapshot/restore.go +++ b/pbm/snapshot/restore.go @@ -42,7 +42,7 @@ var ExcludeFromRestore = []string{ type restorer struct{ *mongorestore.MongoRestore } -func NewRestore(uri string, cfg *config.Config) (io.ReaderFrom, error) { +func NewRestore(uri string, cfg *config.Config, numParallelColls int) (io.ReaderFrom, error) { topts := options.New("mongorestore", "0.0.1", "none", @@ -76,6 +76,9 @@ func NewRestore(uri string, cfg *config.Config) (io.ReaderFrom, error) { if cfg.Restore.NumInsertionWorkers > 0 { numInsertionWorkers = cfg.Restore.NumInsertionWorkers } + if numParallelColls < 1 { + numParallelColls = 1 + } mopts := mongorestore.Options{} mopts.ToolOptions = topts @@ -87,6 +90,7 @@ func NewRestore(uri string, cfg *config.Config) (io.ReaderFrom, error) { BypassDocumentValidation: true, Drop: true, NumInsertionWorkers: numInsertionWorkers, + NumParallelCollections: numParallelColls, PreserveUUID: preserveUUID, StopOnError: true, WriteConcern: "majority", From e1873a1120f7e9fddf2421dfaf7d9071f72ee0e4 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 25 Sep 2024 23:22:20 +0200 Subject: [PATCH 31/45] PBM-1312: print the number of parallel collections in debug --- pbm/backup/logical.go | 2 ++ pbm/restore/logical.go | 2 ++ 2 files changed, 4 insertions(+) diff --git a/pbm/backup/logical.go b/pbm/backup/logical.go index 858fdbe44..591fafa00 100644 --- a/pbm/backup/logical.go +++ b/pbm/backup/logical.go @@ -147,6 +147,8 @@ func (b *Backup) doLogical( } } + l.Debug("dumping up to %d collections in parallel", numParallelColls) + dump, err = snapshot.NewBackup(b.brief.URI, numParallelColls, db, coll) if err != nil { return errors.Wrap(err, "init mongodump options") } diff --git a/pbm/restore/logical.go b/pbm/restore/logical.go index a19ee84af..fdaa584b2 100644 --- a/pbm/restore/logical.go +++ b/pbm/restore/logical.go @@ -794,6 +794,8 @@ func (r *Restore) RunSnapshot( // so we'll continue with selective restore } + r.log.Debug("restoring up to %d collections in parallel", r.numParallelColls) + rdr, err = snapshot.DownloadDump( func(ns string) (io.ReadCloser, error) { stg, err := util.StorageFromConfig(&bcp.Store.StorageConf, r.log) From 976cdc7bdbe0693e2750a7d1a292bb27aa9c66f5 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 25 Sep 2024 09:03:28 +0200 Subject: [PATCH 32/45] pass MaxProcs explicitly to mongodump/mongorestore --- pbm/snapshot/backup.go | 4 ++++ pbm/snapshot/restore.go | 3
+++ 2 files changed, 7 insertions(+) diff --git a/pbm/snapshot/backup.go b/pbm/snapshot/backup.go index fc816da39..9aa633f89 100644 --- a/pbm/snapshot/backup.go +++ b/pbm/snapshot/backup.go @@ -3,6 +3,7 @@ package snapshot import ( "io" "log" + "runtime" "time" "github.com/mongodb/mongo-tools/common/archive" @@ -45,6 +46,9 @@ func NewBackup(curi string, maxParallelColls int, d, c string) (*backuper, error } } + // mongodump calls runtime.GOMAXPROCS(MaxProcs). + opts.MaxProcs = runtime.GOMAXPROCS(0) + if maxParallelColls < 1 { maxParallelColls = 1 } diff --git a/pbm/snapshot/restore.go b/pbm/snapshot/restore.go index 04c15b36f..f719c12f2 100644 --- a/pbm/snapshot/restore.go +++ b/pbm/snapshot/restore.go @@ -2,6 +2,7 @@ package snapshot import ( "io" + "runtime" "github.com/mongodb/mongo-tools/common/options" "github.com/mongodb/mongo-tools/mongorestore" @@ -99,6 +100,8 @@ func NewRestore(uri string, cfg *config.Config, numParallelColls int) (io.Reader mopts.NSOptions = &mongorestore.NSOptions{ NSExclude: ExcludeFromRestore, } + // mongorestore calls runtime.GOMAXPROCS(MaxProcs). + mopts.MaxProcs = runtime.GOMAXPROCS(0) mr, err := mongorestore.New(mopts) if err != nil { From 976cdc7bdbe0693e2750a7d1a292bb27aa9c66f5 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Fri, 20 Sep 2024 15:14:53 +0200 Subject: [PATCH 33/45] add util/errgroup.go to collect all errors --- pbm/util/errgroup.go | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 pbm/util/errgroup.go diff --git a/pbm/util/errgroup.go b/pbm/util/errgroup.go new file mode 100644 index 000000000..b38e668c4 --- /dev/null +++ b/pbm/util/errgroup.go @@ -0,0 +1,44 @@ +package util + +import ( + "runtime" + "sync" +) + +type errorGroup struct { + errs []error + mu sync.Mutex + + wg sync.WaitGroup + sem chan struct{} +} + +func NewErrorGroup(limit int) *errorGroup { + if limit <= 0 { + limit = runtime.NumCPU() + } + return &errorGroup{sem: make(chan struct{}, limit)} +} + +func (g *errorGroup) Wait() []error { + g.wg.Wait() + return g.errs +} + +func (g *errorGroup) Go(f func() error) { + g.wg.Add(1) + go func() { + g.sem <- struct{}{} + + defer func() { + <-g.sem + g.wg.Done() + }() + + if err := f(); err != nil { + g.mu.Lock() + g.errs = append(g.errs, err) + g.mu.Unlock() + } + }() +} From 63b334ddcf6fad097ecef2722241771799bb6d65 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Fri, 20 Sep 2024 15:19:13 +0200 Subject: [PATCH 34/45] PBM-1397: collect all errors from checkLogicalBackupFiles --- pbm/backup/storage.go | 81 ++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 39 deletions(-) diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go index bbe4bd2eb..8a883a46a 100644 --- a/pbm/backup/storage.go +++ b/pbm/backup/storage.go @@ -7,13 +7,12 @@ import ( "runtime" "sync" - "golang.org/x/sync/errgroup" - "github.com/percona/percona-backup-mongodb/pbm/archive" "github.com/percona/percona-backup-mongodb/pbm/defs" "github.com/percona/percona-backup-mongodb/pbm/errors" "github.com/percona/percona-backup-mongodb/pbm/storage" sfs "github.com/percona/percona-backup-mongodb/pbm/storage/fs" + "github.com/percona/percona-backup-mongodb/pbm/util" "github.com/percona/percona-backup-mongodb/pbm/version" ) @@ -45,7 +44,7 @@ func ReadMetadata(stg storage.Storage, filename string) (*BackupMeta, error) { func CheckBackupDataFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { switch bcp.Type { case defs.LogicalBackup: - return checkLogicalBackupFiles(ctx, 
stg, bcp) + return checkLogicalBackupDataFiles(ctx, stg, bcp) case defs.PhysicalBackup, defs.IncrementalBackup: return checkPhysicalBackupFiles(ctx, stg, bcp) case defs.ExternalBackup: @@ -55,57 +54,61 @@ func CheckBackupDataFiles(ctx context.Context, stg storage.Storage, bcp *BackupM return errors.Errorf("unknown backup type %s", bcp.Type) } -func checkLogicalBackupFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { +func checkLogicalBackupDataFiles(_ context.Context, stg storage.Storage, bcp *BackupMeta) error { legacy := version.IsLegacyArchive(bcp.PBMVersion) - eg, _ := errgroup.WithContext(ctx) + + eg := util.NewErrorGroup(runtime.NumCPU() * 2) for _, rs := range bcp.Replsets { - rs := rs + eg.Go(func() error { + eg.Go(func() error { return checkFile(stg, rs.DumpName) }) - eg.Go(func() error { return checkFile(stg, rs.DumpName) }) + eg.Go(func() error { + if version.IsLegacyBackupOplog(bcp.PBMVersion) { + return checkFile(stg, rs.OplogName) + } - eg.Go(func() error { - if version.IsLegacyBackupOplog(bcp.PBMVersion) { - return checkFile(stg, rs.OplogName) + files, err := stg.List(rs.OplogName, "") + if err != nil { + return errors.Wrap(err, "list") + } + if len(files) == 0 { + return errors.Wrap(err, "no oplog files") + } + for i := range files { + if files[i].Size == 0 { + return errors.Errorf("%q is empty", path.Join(rs.OplogName, files[i].Name)) + } + } + + return nil + }) + + if legacy { + return nil } - files, err := stg.List(rs.OplogName, "") + nss, err := ReadArchiveNamespaces(stg, rs.DumpName) if err != nil { - return errors.Wrap(err, "list") - } - if len(files) == 0 { - return errors.Wrap(err, "no oplog files") + return errors.Wrapf(err, "parse metafile %q", rs.DumpName) } - for i := range files { - if files[i].Size == 0 { - return errors.Errorf("%q is empty", path.Join(rs.OplogName, files[i].Name)) - } - } - - return nil - }) - if legacy { - continue - } + for _, ns := range nss { + if ns.Size == 0 { + continue + } - nss, err := ReadArchiveNamespaces(stg, rs.DumpName) - if err != nil { - return errors.Wrapf(err, "parse metafile %q", rs.DumpName) - } + ns := archive.NSify(ns.Database, ns.Collection) + f := path.Join(bcp.Name, rs.Name, ns+bcp.Compression.Suffix()) - for _, ns := range nss { - if ns.Size == 0 { - continue + eg.Go(func() error { return checkFile(stg, f) }) } - ns := archive.NSify(ns.Database, ns.Collection) - f := path.Join(bcp.Name, rs.Name, ns+bcp.Compression.Suffix()) - - eg.Go(func() error { return checkFile(stg, f) }) - } + return nil + }) } - return eg.Wait() + errs := eg.Wait() + return errors.Join(errs...) 
} func checkPhysicalBackupFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { From 5accc6d44491e1be1d427f7705366b3612e55173 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Fri, 20 Sep 2024 15:21:42 +0200 Subject: [PATCH 35/45] PBM-1397: implement checkPhysicalBackupDataFiles() --- pbm/backup/storage.go | 63 +++++++++++++++++++++++++++++++++++++++-- pbm/backup/types.go | 10 +++++++ pbm/restore/logical.go | 1 - pbm/restore/physical.go | 5 +--- sdk/impl.go | 21 ++------------ 5 files changed, 73 insertions(+), 27 deletions(-) diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go index 8a883a46a..acd55b1b5 100644 --- a/pbm/backup/storage.go +++ b/pbm/backup/storage.go @@ -46,7 +46,7 @@ func CheckBackupDataFiles(ctx context.Context, stg storage.Storage, bcp *BackupM case defs.LogicalBackup: return checkLogicalBackupDataFiles(ctx, stg, bcp) case defs.PhysicalBackup, defs.IncrementalBackup: - return checkPhysicalBackupFiles(ctx, stg, bcp) + return checkPhysicalBackupDataFiles(ctx, stg, bcp) case defs.ExternalBackup: return nil // no files available } @@ -111,8 +111,65 @@ func checkLogicalBackupDataFiles(_ context.Context, stg storage.Storage, bcp *Ba return errors.Join(errs...) } -func checkPhysicalBackupFiles(ctx context.Context, stg storage.Storage, bcp *BackupMeta) error { - return nil +func checkPhysicalBackupDataFiles(_ context.Context, stg storage.Storage, bcp *BackupMeta) error { + eg := util.NewErrorGroup(runtime.NumCPU() * 2) + for _, rs := range bcp.Replsets { + eg.Go(func() error { + var filelist Filelist + if version.HasFilelistFile(bcp.PBMVersion) { + var err error + filelist, err = ReadFilelistForReplset(stg, bcp.Name, rs.Name) + if err != nil { + return errors.Wrapf(err, "read filelist for replset %s", rs.Name) + } + } else { + filelist = rs.Files + } + if len(filelist) == 0 { + return errors.Errorf("empty filelist for replset %s", rs.Name) + } + + for _, f := range filelist { + if f.Len <= 0 { + continue // no file expected + } + + eg.Go(func() error { + filepath := path.Join(bcp.Name, rs.Name, f.Path(bcp.Compression)) + stat, err := stg.FileStat(filepath) + if err != nil { + return errors.Wrapf(err, "file %s", filepath) + } + if stat.Size == 0 { + return errors.Errorf("empty file %s", filepath) + } + + return nil + }) + } + + return nil + }) + } + + errs := eg.Wait() + return errors.Join(errs...) 
+} + +func ReadFilelistForReplset(stg storage.Storage, bcpName, rsName string) (Filelist, error) { + pfFilepath := path.Join(bcpName, rsName, FilelistName) + rdr, err := stg.SourceReader(pfFilepath) + if err != nil { + return nil, errors.Wrapf(err, "open %q", pfFilepath) + } + defer rdr.Close() + + filelist, err := ReadFilelist(rdr) + if err != nil { + return nil, errors.Wrapf(err, "parse filelist %q", pfFilepath) + } + + return filelist, nil } func ReadArchiveNamespaces(stg storage.Storage, metafile string) ([]*archive.Namespace, error) { diff --git a/pbm/backup/types.go b/pbm/backup/types.go index 52e387872..2997b88b4 100644 --- a/pbm/backup/types.go +++ b/pbm/backup/types.go @@ -4,6 +4,7 @@ import ( "fmt" "io" "os" + "path/filepath" "go.mongodb.org/mongo-driver/bson" "go.mongodb.org/mongo-driver/bson/primitive" @@ -156,6 +157,15 @@ func (f File) String() string { return fmt.Sprintf("%s [%d:%d]", f.Name, f.Off, f.Len) } +func (f File) Path(c compress.CompressionType) string { + src := filepath.Join(f.Name + c.Suffix()) + if f.Len == 0 { + return src + } + + return fmt.Sprintf("%s.%d-%d", src, f.Off, f.Len) +} + func (f *File) WriteTo(w io.Writer) (int64, error) { fd, err := os.Open(f.Name) if err != nil { diff --git a/pbm/restore/logical.go b/pbm/restore/logical.go index fdaa584b2..94901d446 100644 --- a/pbm/restore/logical.go +++ b/pbm/restore/logical.go @@ -759,7 +759,6 @@ func (r *Restore) RunSnapshot( usersAndRolesOpt restoreUsersAndRolesOption, ) error { var rdr io.ReadCloser - var err error if version.IsLegacyArchive(bcp.PBMVersion) { sr, err := r.bcpStg.SourceReader(dump) diff --git a/pbm/restore/physical.go b/pbm/restore/physical.go index 9dfae5160..38d0d55b5 100644 --- a/pbm/restore/physical.go +++ b/pbm/restore/physical.go @@ -1089,10 +1089,7 @@ func (r *PhysRestore) copyFiles() (*s3.DownloadStat, error) { for i := len(r.files) - 1; i >= 0; i-- { set := r.files[i] for _, f := range set.Data { - src := filepath.Join(set.BcpName, setName, f.Name+set.Cmpr.Suffix()) - if f.Len != 0 { - src += fmt.Sprintf(".%d-%d", f.Off, f.Len) - } + src := filepath.Join(set.BcpName, setName, f.Path(set.Cmpr)) // cut dbpath from destination if there is any (see PBM-1058) fname := f.Name if set.dbpath != "" { diff --git a/sdk/impl.go b/sdk/impl.go index 83afcc1ff..51043a554 100644 --- a/sdk/impl.go +++ b/sdk/impl.go @@ -2,7 +2,6 @@ package sdk import ( "context" - "path" "runtime" "time" @@ -192,7 +191,7 @@ func fillFilelistForBackup(ctx context.Context, bcp *BackupMetadata) error { rs := &bcp.Replsets[i] eg.Go(func() error { - filelist, err := getFilelistForReplset(stg, bcp.Name, rs.Name) + filelist, err := backup.ReadFilelistForReplset(stg, bcp.Name, rs.Name) if err != nil { return errors.Wrapf(err, "get filelist for %q [rs: %s] backup", bcp.Name, rs.Name) } @@ -226,7 +225,7 @@ func fillFilelistForBackup(ctx context.Context, bcp *BackupMetadata) error { rs := &bcp.Replsets[i] eg.Go(func() error { - filelist, err := getFilelistForReplset(stg, bcp.Name, rs.Name) + filelist, err := backup.ReadFilelistForReplset(stg, bcp.Name, rs.Name) if err != nil { return errors.Wrapf(err, "fetch files for %q [rs: %s] backup", bcp.Name, rs.Name) } @@ -254,22 +253,6 @@ func getStorageForRead(ctx context.Context, bcp *backup.BackupMeta) (storage.Sto return stg, nil } -func getFilelistForReplset(stg storage.Storage, bcpName, rsName string) (backup.Filelist, error) { - pfFilepath := path.Join(bcpName, rsName, backup.FilelistName) - rdr, err := stg.SourceReader(pfFilepath) - if err != nil { - return nil, 
errors.Wrapf(err, "open %q", pfFilepath) - } - defer rdr.Close() - - filelist, err := backup.ReadFilelist(rdr) - if err != nil { - return nil, errors.Wrapf(err, "parse filelist %q", pfFilepath) - } - - return filelist, nil -} - func (c *Client) GetRestoreByName(ctx context.Context, name string) (*RestoreMetadata, error) { return restore.GetRestoreMeta(ctx, c.conn, name) } From 9bf8c88cfe7ecaa3ac223f806cfba803c05b206c Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Tue, 24 Sep 2024 13:03:51 +0200 Subject: [PATCH 36/45] PBM-921: ensure system.views before "creating" views/timeseries --- pbm/oplog/restore.go | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/pbm/oplog/restore.go b/pbm/oplog/restore.go index 69239a0ac..15ab8ad2f 100644 --- a/pbm/oplog/restore.go +++ b/pbm/oplog/restore.go @@ -661,6 +661,7 @@ func (o *OplogRestore) handleNonTxnOp(op db.Oplog) error { return errors.Wrap(err, "filtering UUIDs from oplog") } + dbName, collName, _ := strings.Cut(op.Namespace, ".") if op.Operation == "c" { if len(op.Object) == 0 { return errors.Errorf("empty object value for op: %v", op) @@ -671,9 +672,6 @@ func (o *OplogRestore) handleNonTxnOp(op db.Oplog) error { return errors.Errorf("unknown oplog command name %v: %v", cmdName, op) } - ns := strings.Split(op.Namespace, ".") - dbName := ns[0] - switch cmdName { case "commitIndexBuild": // commitIndexBuild was introduced in 4.4, one "commitIndexBuild" command can contain several @@ -793,6 +791,12 @@ func (o *OplogRestore) handleNonTxnOp(op db.Oplog) error { return errors.Wrap(err, "oplog: drop collection before create") } } + } else if op.Operation == "i" && collName == "system.views" { + // PBM-921: ensure the collection exists before "creating" views or timeseries + err := o.dst.Database(dbName).CreateCollection(context.TODO(), "system.views") + if err != nil { + return errors.Wrapf(err, "ensure %s.system.views collection", dbName) + } } err = o.applyOps([]interface{}{op}) From 953255a6b75eea7d6f928a30e958cce0b4538107 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Wed, 25 Sep 2024 23:00:45 +0200 Subject: [PATCH 37/45] PBM-921: ignore NamespaceExists when creating system.views --- pbm/oplog/restore.go | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/pbm/oplog/restore.go b/pbm/oplog/restore.go index 15ab8ad2f..8df6e73a2 100644 --- a/pbm/oplog/restore.go +++ b/pbm/oplog/restore.go @@ -795,7 +795,14 @@ func (o *OplogRestore) handleNonTxnOp(op db.Oplog) error { // PBM-921: ensure the collection exists before "creating" views or timeseries err := o.dst.Database(dbName).CreateCollection(context.TODO(), "system.views") if err != nil { - return errors.Wrapf(err, "ensure %s.system.views collection", dbName) + // MongoDB 5.0 and 6.0 returns NamespaceExists error. + // MongoDB 7.0 and 8.0 does not return error. 
+ // https://github.com/mongodb/mongo/blob/v6.0/src/mongo/base/error_codes.yml#L84 + const NamespaceExists = 48 + var cmdError mongo.CommandError + if !errors.As(err, &cmdError) || cmdError.Code != NamespaceExists { + return errors.Wrapf(err, "ensure %s.system.views collection", dbName) + } } } From 5098404b5e25fd7052fc138d45c1943bd6be3e88 Mon Sep 17 00:00:00 2001 From: Dmytro Zghoba Date: Mon, 23 Sep 2024 20:18:20 +0200 Subject: [PATCH 38/45] PBM-397: update agent status right after startup --- cmd/pbm-agent/agent.go | 180 ++++++++++++++++++++++++++++------------- pbm/topo/agent.go | 10 +-- 2 files changed, 124 insertions(+), 66 deletions(-) diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go index 3f6b3d338..8da36d828 100644 --- a/cmd/pbm-agent/agent.go +++ b/cmd/pbm-agent/agent.go @@ -282,8 +282,17 @@ func (a *Agent) HbStatus(ctx context.Context) { MongoVer: nodeVersion.VersionString, PerconaVer: nodeVersion.PSMDBVersion, } + + updateAgentStat(ctx, a, l, true, &hb) + err = topo.SetAgentStatus(ctx, a.leadConn, &hb) + if err != nil { + l.Error("set status: %v", err) + } + defer func() { - if err := topo.RemoveAgentStatus(ctx, a.leadConn, hb); err != nil { + l.Debug("deleting agent status") + err := topo.RemoveAgentStatus(context.Background(), a.leadConn, hb) + if err != nil { logger := logger.NewEvent("agentCheckup", "", "", primitive.Timestamp{}) logger.Error("remove agent heartbeat: %v", err) } @@ -292,74 +301,128 @@ func (a *Agent) HbStatus(ctx context.Context) { tk := time.NewTicker(defs.AgentsStatCheckRange) defer tk.Stop() + storageCheckTime := time.Now() + parallelAgentCheckTime := time.Now() + // check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647) - const checkStoreIn = int(60 / (defs.AgentsStatCheckRange / time.Second)) - cc := 0 - for range tk.C { - // don't check if on pause (e.g. physical restore) - if !a.HbIsRun() { - continue - } + const storageCheckInterval = 15 * time.Second + const parallelAgentCheckInternval = 20 * time.Second - hb.PBMStatus = a.pbmStatus(ctx) - logHbStatus("PBM connection", hb.PBMStatus, l) + for { + select { + case <-ctx.Done(): + return + case <-tk.C: + // don't check if on pause (e.g. 
physical restore) + if !a.HbIsRun() { + continue + } - hb.NodeStatus = a.nodeStatus(ctx) - logHbStatus("node connection", hb.NodeStatus, l) + now := time.Now() + if now.Sub(parallelAgentCheckTime) >= parallelAgentCheckInternval { + a.warnIfParallelAgentDetected(ctx, l, hb.Heartbeat) + parallelAgentCheckTime = now + } - cc++ - hb.StorageStatus = a.storStatus(ctx, l, cc == checkStoreIn) - logHbStatus("storage connection", hb.StorageStatus, l) - if cc == checkStoreIn { - cc = 0 + if now.Sub(storageCheckTime) >= storageCheckInterval { + updateAgentStat(ctx, a, l, true, &hb) + err = topo.SetAgentStatus(ctx, a.leadConn, &hb) + if err == nil { + storageCheckTime = now + } + } else { + updateAgentStat(ctx, a, l, false, &hb) + err = topo.SetAgentStatus(ctx, a.leadConn, &hb) + } + if err != nil { + l.Error("set status: %v", err) + } } + } +} - hb.Err = "" - hb.Hidden = false - hb.Passive = false +func updateAgentStat( + ctx context.Context, + agent *Agent, + l log.LogEvent, + checkStore bool, + hb *topo.AgentStat, +) { + hb.PBMStatus = agent.pbmStatus(ctx) + logHbStatus("PBM connection", hb.PBMStatus, l) - inf, err := topo.GetNodeInfo(ctx, a.nodeConn) - if err != nil { - l.Error("get NodeInfo: %v", err) - hb.Err += fmt.Sprintf("get NodeInfo: %v", err) + hb.NodeStatus = agent.nodeStatus(ctx) + logHbStatus("node connection", hb.NodeStatus, l) + + hb.StorageStatus = agent.storStatus(ctx, l, checkStore, hb) + logHbStatus("storage connection", hb.StorageStatus, l) + + hb.Err = "" + hb.Hidden = false + hb.Passive = false + + inf, err := topo.GetNodeInfo(ctx, agent.nodeConn) + if err != nil { + l.Error("get NodeInfo: %v", err) + hb.Err += fmt.Sprintf("get NodeInfo: %v", err) + } else { + hb.Hidden = inf.Hidden + hb.Passive = inf.Passive + hb.Arbiter = inf.ArbiterOnly + if inf.SecondaryDelayOld != 0 { + hb.DelaySecs = inf.SecondaryDelayOld } else { - hb.Hidden = inf.Hidden - hb.Passive = inf.Passive - hb.Arbiter = inf.ArbiterOnly - if inf.SecondaryDelayOld != 0 { - hb.DelaySecs = inf.SecondaryDelayOld - } else { - hb.DelaySecs = inf.SecondaryDelaySecs - } + hb.DelaySecs = inf.SecondaryDelaySecs } - if inf != nil && inf.ArbiterOnly { - hb.State = defs.NodeStateArbiter - hb.StateStr = "ARBITER" + hb.Heartbeat, err = topo.ClusterTimeFromNodeInfo(inf) + if err != nil { + hb.Err += fmt.Sprintf("get cluster time: %v", err) + } + } + + if inf != nil && inf.ArbiterOnly { + hb.State = defs.NodeStateArbiter + hb.StateStr = "ARBITER" + } else { + n, err := topo.GetNodeStatus(ctx, agent.nodeConn, agent.brief.Me) + if err != nil { + l.Error("get replSetGetStatus: %v", err) + hb.Err += fmt.Sprintf("get replSetGetStatus: %v", err) + hb.State = defs.NodeStateUnknown + hb.StateStr = "UNKNOWN" } else { - n, err := topo.GetNodeStatus(ctx, a.nodeConn, a.brief.Me) - if err != nil { - l.Error("get replSetGetStatus: %v", err) - hb.Err += fmt.Sprintf("get replSetGetStatus: %v", err) - hb.State = defs.NodeStateUnknown - hb.StateStr = "UNKNOWN" - } else { - hb.State = n.State - hb.StateStr = n.StateStr + hb.State = n.State + hb.StateStr = n.StateStr - rLag, err := topo.ReplicationLag(ctx, a.nodeConn, a.brief.Me) - if err != nil { - l.Error("get replication lag: %v", err) - hb.Err += fmt.Sprintf("get replication lag: %v", err) - } - hb.ReplicationLag = rLag + rLag, err := topo.ReplicationLag(ctx, agent.nodeConn, agent.brief.Me) + if err != nil { + l.Error("get replication lag: %v", err) + hb.Err += fmt.Sprintf("get replication lag: %v", err) } + hb.ReplicationLag = rLag } + } +} - err = topo.SetAgentStatus(ctx, a.leadConn, hb) - if 
From 901e112281a2370f1ea7e3987272b7a9c17796dc Mon Sep 17 00:00:00 2001
From: Dmytro Zghoba
Date: Mon, 23 Sep 2024 20:20:29 +0200
Subject: [PATCH 39/45] PBM-397: cancel context on SIGINT and SIGKILL to force
 agent status cleanup

---
 cmd/pbm-agent/main.go | 3 ++-
 pbm/ctrl/recv.go      | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/cmd/pbm-agent/main.go b/cmd/pbm-agent/main.go
index 77e66deb4..de4854844 100644
--- a/cmd/pbm-agent/main.go
+++ b/cmd/pbm-agent/main.go
@@ -5,6 +5,7 @@ import (
 	"fmt"
 	stdlog "log"
 	"os"
+	"os/signal"
 	"runtime"
 	"strconv"
 	"strings"
@@ -86,7 +87,7 @@ func runAgent(mongoURI string, dumpConns int) error {
 	mtLog.SetDateFormat(log.LogTimeFormat)
 	mtLog.SetVerbosity(&options.Verbosity{VLevel: mtLog.DebugLow})
 
-	ctx, cancel := context.WithCancel(context.Background())
+	ctx, cancel := signal.NotifyContext(context.Background(), os.Interrupt, os.Kill)
 	defer cancel()
 
 	leadConn, err := connect.Connect(ctx, mongoURI, "pbm-agent")
diff --git a/pbm/ctrl/recv.go b/pbm/ctrl/recv.go
index 59fc25e43..37b5242d7 100644
--- a/pbm/ctrl/recv.go
+++ b/pbm/ctrl/recv.go
@@ -54,6 +54,9 @@ func ListenCmd(ctx context.Context, m connect.Client, cl <-chan struct{}) (<-cha
 			)
 			if err != nil {
 				errc <- errors.Wrap(err, "watch the cmd stream")
+				if errors.Is(err, context.Canceled) {
+					return
+				}
 				continue
 			}
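
Patch 39 ties the agent's root context to process signals with signal.NotifyContext, so the deferred RemoveAgentStatus cleanup from the previous patch actually runs on shutdown. One caveat: on Unix, SIGKILL can be neither caught nor ignored, so the os.Kill entry is effective only on platforms that can deliver it; SIGTERM is the catchable termination signal. A minimal sketch of the mechanism, independent of PBM:

    package main

    import (
    	"context"
    	"fmt"
    	"os"
    	"os/signal"
    	"syscall"
    )

    func main() {
    	// Cancel the root context on the first SIGINT or SIGTERM; SIGKILL
    	// cannot be trapped, so it is not listed.
    	ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
    	defer stop()

    	<-ctx.Done() // block until a signal arrives
    	stop()       // restore default handling: a repeated signal kills immediately
    	fmt.Println("shutting down:", ctx.Err())
    }
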
From 5540251bf6f6c5f255e981eb46798af2574507c4 Mon Sep 17 00:00:00 2001
From: Dmytro Zghoba
Date: Mon, 30 Sep 2024 15:06:15 +0200
Subject: [PATCH 40/45] PBM-397: check for parallel agent once per minute

---
 cmd/pbm-agent/agent.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmd/pbm-agent/agent.go b/cmd/pbm-agent/agent.go
index 8da36d828..812892058 100644
--- a/cmd/pbm-agent/agent.go
+++ b/cmd/pbm-agent/agent.go
@@ -306,7 +306,7 @@ func (a *Agent) HbStatus(ctx context.Context) {
 
 	// check storage once in a while if all is ok (see https://jira.percona.com/browse/PBM-647)
 	const storageCheckInterval = 15 * time.Second
-	const parallelAgentCheckInterval = 20 * time.Second
+	const parallelAgentCheckInterval = time.Minute
 
 	for {
 		select {
From 1b2a8ba866cd54c36c84580b2031de49d73757c2 Mon Sep 17 00:00:00 2001
From: Dmytro Zghoba
Date: Mon, 30 Sep 2024 15:08:34 +0200
Subject: [PATCH 41/45] PBM-397: return ctx.Err() from ListenCmd()

---
 pbm/ctrl/recv.go | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/pbm/ctrl/recv.go b/pbm/ctrl/recv.go
index 37b5242d7..040b5f8e3 100644
--- a/pbm/ctrl/recv.go
+++ b/pbm/ctrl/recv.go
@@ -44,6 +44,9 @@ func ListenCmd(ctx context.Context, m connect.Client, cl <-chan struct{}) (<-cha
 		var lastCmd Command
 		for {
 			select {
+			case <-ctx.Done():
+				errc <- ctx.Err()
+				return
 			case <-cl:
 				return
 			default:
@@ -54,9 +57,6 @@ func ListenCmd(ctx context.Context, m connect.Client, cl <-chan struct{}) (<-cha
 			)
 			if err != nil {
 				errc <- errors.Wrap(err, "watch the cmd stream")
-				if errors.Is(err, context.Canceled) {
-					return
-				}
 				continue
 			}
From a69ed32026ab883ba0f40182aa14a8d04ec70d62 Mon Sep 17 00:00:00 2001
From: Dmytro Zghoba
Date: Tue, 1 Oct 2024 16:25:40 +0200
Subject: [PATCH 42/45] PBM-1404: fix cond: len with -1 for no diff

---
 pbm/backup/storage.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pbm/backup/storage.go b/pbm/backup/storage.go
index acd55b1b5..d7ed0a1ea 100644
--- a/pbm/backup/storage.go
+++ b/pbm/backup/storage.go
@@ -130,7 +130,7 @@ func checkPhysicalBackupDataFiles(_ context.Context, stg storage.Storage, bcp *B
 	}
 
 	for _, f := range filelist {
-		if f.Len <= 0 {
+		if f.Len < 0 {
 			continue // no file expected
 		}
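
Patch 41 moves cancellation handling to the top of ListenCmd's loop, so a canceled context surfaces ctx.Err() once instead of repeatedly erroring on a dead change stream. Patch 42 narrows the skip condition when verifying physical backup files: the old `<= 0` also skipped zero-length files, which are legitimate and must still exist on the storage. A sketch of the corrected filter, assuming a File type with a Len field as in the filelist:

    // expectedFiles keeps only filelist entries that must exist on the
    // storage: Len == -1 marks "no file expected", while Len == 0 is a
    // real, empty file that still has to be verified.
    func expectedFiles(filelist []File) []File {
    	files := make([]File, 0, len(filelist))
    	for _, f := range filelist {
    		if f.Len < 0 {
    			continue // no file expected
    		}
    		files = append(files, f)
    	}
    	return files
    }
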
replay started") - rr := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap, 0) + rr := restore.New(a.leadConn, a.nodeConn, a.brief, cfg, r.RSMap, 0) err = rr.ReplayOplog(ctx, r, opID, l) if err != nil { if errors.Is(err, restore.ErrNoDataForShard) { diff --git a/cmd/pbm-agent/restore.go b/cmd/pbm-agent/restore.go index 77510b687..9a5dc02cc 100644 --- a/cmd/pbm-agent/restore.go +++ b/cmd/pbm-agent/restore.go @@ -106,6 +106,12 @@ func (a *Agent) Restore(ctx context.Context, r *ctrl.RestoreCmd, opid ctrl.OPID, r.BackupName = bcp.Name } + cfg, err := config.GetConfig(ctx, a.leadConn) + if err != nil { + l.Error("get PBM configuration: %v", err) + return + } + l.Info("recovery started") switch bcpType { @@ -118,9 +124,11 @@ func (a *Agent) Restore(ctx context.Context, r *ctrl.RestoreCmd, opid ctrl.OPID, numParallelColls := runtime.NumCPU() / 2 if r.NumParallelColls != nil && *r.NumParallelColls > 0 { numParallelColls = int(*r.NumParallelColls) + } else if cfg.Restore != nil && cfg.Restore.NumParallelCollections > 0 { + numParallelColls = cfg.Restore.NumParallelCollections } - rr := restore.New(a.leadConn, a.nodeConn, a.brief, r.RSMap, numParallelColls) + rr := restore.New(a.leadConn, a.nodeConn, a.brief, cfg, r.RSMap, numParallelColls) if r.OplogTS.IsZero() { err = rr.Snapshot(ctx, r, opid, bcp) } else { diff --git a/pbm/config/config.go b/pbm/config/config.go index 180e204de..3cb985acf 100644 --- a/pbm/config/config.go +++ b/pbm/config/config.go @@ -321,8 +321,9 @@ type RestoreConf struct { // Logical restore // // num of documents to buffer - BatchSize int `bson:"batchSize" json:"batchSize,omitempty" yaml:"batchSize,omitempty"` - NumInsertionWorkers int `bson:"numInsertionWorkers" json:"numInsertionWorkers,omitempty" yaml:"numInsertionWorkers,omitempty"` + BatchSize int `bson:"batchSize" json:"batchSize,omitempty" yaml:"batchSize,omitempty"` + NumInsertionWorkers int `bson:"numInsertionWorkers" json:"numInsertionWorkers,omitempty" yaml:"numInsertionWorkers,omitempty"` + NumParallelCollections int `bson:"numParallelCollections" json:"numParallelCollections,omitempty" yaml:"numParallelCollections,omitempty"` // NumDownloadWorkers sets the num of goroutine would be requesting chunks // during the download. By default, it's set to GOMAXPROCS. 
From cf567c137682d3247f9544883ee0c087a452f3ad Mon Sep 17 00:00:00 2001
From: Dmytro Zghoba
Date: Wed, 2 Oct 2024 10:51:14 +0200
Subject: [PATCH 44/45] bump version

---
 pbm/version/version.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pbm/version/version.go b/pbm/version/version.go
index a7389748b..845b4afcb 100644
--- a/pbm/version/version.go
+++ b/pbm/version/version.go
@@ -16,7 +16,7 @@ import (
 )
 
 // current PBM version
-const version = "2.6.0"
+const version = "2.7.0"
 
 var (
 	platform string
From bb679003dcf473e3d036f927d0bb722553905a9f Mon Sep 17 00:00:00 2001
From: Sandra
Date: Wed, 2 Oct 2024 12:58:35 +0300
Subject: [PATCH 45/45] PBM-1403. Update go version to prevent CVE-2024-34156

---
 packaging/scripts/mongodb-backup_builder.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/packaging/scripts/mongodb-backup_builder.sh b/packaging/scripts/mongodb-backup_builder.sh
index a7a20a93a..7f0feecb6 100644
--- a/packaging/scripts/mongodb-backup_builder.sh
+++ b/packaging/scripts/mongodb-backup_builder.sh
@@ -141,7 +141,7 @@ install_golang() {
     elif [ x"$ARCH" = "xaarch64" ]; then
        GO_ARCH="arm64"
    fi
-    wget https://go.dev/dl/go1.22.5.linux-${GO_ARCH}.tar.gz -O /tmp/go1.22.tar.gz
+    wget https://go.dev/dl/go1.22.8.linux-${GO_ARCH}.tar.gz -O /tmp/go1.22.tar.gz
    tar --transform=s,go,go1.22, -zxf /tmp/go1.22.tar.gz
    rm -rf /usr/local/go*
    mv go1.22 /usr/local/