Skip to content

Commit

Permalink
Monitor for stuck snapshots in both directions -- from receiver and from sender
Browse files Browse the repository at this point in the history
  • Loading branch information
mangalaman93 committed Aug 12, 2024
1 parent b7cc01e commit 648b121
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 4 deletions.
12 changes: 10 additions & 2 deletions worker/draft.go
Original file line number Diff line number Diff line change
Expand Up @@ -941,13 +941,19 @@ func (n *node) retrieveSnapshot(snap pb.Snapshot) error {
var pool *conn.Pool
addr := snap.Context.GetAddr()
glog.V(2).Infof("Snapshot.RaftContext.Addr: %q", addr)
if len(addr) > 0 {
if addr == n.MyAddr {
// There was a bug here. If the the address was my address, then the
// snapshot request would loop back to myself, and get stuck in an
// infinite loop.
glog.V(2).Infof("Snapshot.RaftContext address is mine. Skipping that...")

} else if len(addr) > 0 {
p, err := conn.GetPools().Get(addr)
if err != nil {
glog.V(2).Infof("conn.Get(%q) Error: %v", addr, err)
} else {
pool = p
glog.V(2).Infof("Leader connection picked from RaftContext")
glog.V(2).Infof("Leader connection picked from RaftContext: %s", addr)
}
}
if pool == nil {
Expand All @@ -957,6 +963,7 @@ func (n *node) retrieveSnapshot(snap pb.Snapshot) error {
return err
}
pool = p
glog.V(2).Infof("Leader connection picked from membership state: %s", p.Addr)
}

// Need to clear pl's stored in memory for the case when retrieving snapshot with
Expand Down Expand Up @@ -1890,6 +1897,7 @@ func (n *node) InitAndStartNode() {
_, _ = n.startTask(opRollup)
go n.stopAllTasks()
go n.Run()
go n.checkForFailedSnapshot()
}

func (n *node) AmLeader() bool {
Expand Down
110 changes: 108 additions & 2 deletions worker/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func (n *node) populateSnapshot(snap pb.Snapshot, pl *conn.Pool) error {
sw := pstore.NewStreamWriter()
defer sw.Cancel()

glog.Infof("Creating StreamWriter\n")
if err := sw.Prepare(); err != nil {
return err
}
Expand All @@ -79,14 +80,43 @@ func (n *node) populateSnapshot(snap pb.Snapshot, pl *conn.Pool) error {
writer = pstore.NewManagedWriteBatch()
}

glog.Infof("Starting to receive snapshot...\n")
lastReceived := time.Now().Unix()
go func() {
tick := time.NewTicker(10 * time.Second)
defer tick.Stop()

for {
select {
case <-ctx.Done():
return
case <-tick.C:
last := atomic.LoadInt64(&lastReceived)
if last == 0 {
continue
}
lastTs := time.Unix(last, 0)
if time.Since(lastTs) > 300*time.Second {
glog.Warningf("Haven't received anything for over 300s." +
" Abandoning snapshot retrieval...\n")
cancel()
}
}
}
}()
// We can use count to check the number of posting lists returned in tests.
size := 0
var done *pb.KVS
for {
// Track when we last received anything from the leader. If we don't
// receive anything for a while, we should abandon this connection.
atomic.StoreInt64(&lastReceived, time.Now().Unix())
kvs, err := stream.Recv()
if err != nil {
return err
return errors.Wrapf(err, "stream.Recv")
}
atomic.StoreInt64(&lastReceived, 0) // We got something.

if kvs.Done {
done = kvs
glog.V(1).Infoln("All key-values have been received.")
Expand Down Expand Up @@ -262,11 +292,87 @@ func doStreamSnapshot(snap *pb.Snapshot, out pb.Worker_StreamSnapshotServer) err
return nil
}

func (n *node) checkForFailedSnapshot() {
tick := time.NewTicker(10 * time.Second)
defer tick.Stop()

loopOverStatus := func() {
glog.V(2).Infof("Entering snapshot monitoring state")
defer func() {
glog.V(2).Infof("Exiting snapshot monitoring state")
}()

var last time.Time

for range tick.C {
if n.closer.Ctx().Err() != nil {
return
}
status := n.Raft().Status()
if status.Lead != status.ID {
// I'm not the leader, so don't do anything.
return
}
var snapshotId uint64
for id, prog := range status.Progress {
if prog.State != raft.ProgressStateSnapshot {

Check failure on line 318 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot

Check failure on line 318 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot
continue
}
snapshotId = id
break
}
if snapshotId == 0 {
// No Alpha is in pending snapshot state.
glog.Infof("No Alpha in pending snapshot state.\n")
return
}
if last.IsZero() {
last = time.Now()
glog.Infof("Registered a pending snapshot for ID: %#x\n", snapshotId)
}

var ongoing bool
for _, t := range GetOngoingTasks() {
if t == "opSnapshot" {
glog.V(2).Infof("Snapshot: Ongoing. All good.\n")
ongoing = true
}
}

if !ongoing && time.Since(last) > 60*time.Second {
// Report snapshot as failed.
glog.Warningf("Reporting snapshot for ID: %#x as failed\n", snapshotId)
n.Raft().ReportSnapshot(snapshotId, raft.SnapshotFailure)
return
}
}
}

for {
select {
case <-n.closer.HasBeenClosed():
return
case <-tick.C:
status := n.Raft().Status()
if status.Lead != status.ID {
// I'm not the leader, so don't do anything.
continue
}
for _, prog := range status.Progress {
if prog.State != raft.ProgressStateSnapshot {

Check failure on line 362 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot) (typecheck)

Check failure on line 362 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot) (typecheck)
continue
}
loopOverStatus()
}
}
}
}

func (w *grpcWorker) StreamSnapshot(stream pb.Worker_StreamSnapshotServer) error {
// Pause rollups during snapshot streaming.
closer, err := groups().Node.startTask(opSnapshot)
if err != nil {
return err
return errors.Wrapf(err, "startTask on StreamSnapshot failed")
}
defer closer.Done()

Expand Down

0 comments on commit 648b121

Please sign in to comment.