Skip to content

Commit

Permalink
Monitor for stuck snapshots in both directions -- from receiver and from sender
Browse files Browse the repository at this point in the history
  • Loading branch information
mangalaman93 committed Aug 12, 2024
1 parent b7cc01e commit 648b121
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 4 deletions.
12 changes: 10 additions & 2 deletions worker/draft.go
Original file line number Diff line number Diff line change
Expand Up @@ -941,13 +941,19 @@ func (n *node) retrieveSnapshot(snap pb.Snapshot) error {
var pool *conn.Pool
addr := snap.Context.GetAddr()
glog.V(2).Infof("Snapshot.RaftContext.Addr: %q", addr)
if len(addr) > 0 {
if addr == n.MyAddr {
// There was a bug here. If the the address was my address, then the
// snapshot request would loop back to myself, and get stuck in an
// infinite loop.
glog.V(2).Infof("Snapshot.RaftContext address is mine. Skipping that...")

} else if len(addr) > 0 {
p, err := conn.GetPools().Get(addr)
if err != nil {
glog.V(2).Infof("conn.Get(%q) Error: %v", addr, err)
} else {
pool = p
glog.V(2).Infof("Leader connection picked from RaftContext")
glog.V(2).Infof("Leader connection picked from RaftContext: %s", addr)
}
}
if pool == nil {
Expand All @@ -957,6 +963,7 @@ func (n *node) retrieveSnapshot(snap pb.Snapshot) error {
return err
}
pool = p
glog.V(2).Infof("Leader connection picked from membership state: %s", p.Addr)
}

// Need to clear pl's stored in memory for the case when retrieving snapshot with
Expand Down Expand Up @@ -1890,6 +1897,7 @@ func (n *node) InitAndStartNode() {
_, _ = n.startTask(opRollup)
go n.stopAllTasks()
go n.Run()
go n.checkForFailedSnapshot()
}

func (n *node) AmLeader() bool {
Expand Down
110 changes: 108 additions & 2 deletions worker/snapshot.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,7 @@ func (n *node) populateSnapshot(snap pb.Snapshot, pl *conn.Pool) error {
sw := pstore.NewStreamWriter()
defer sw.Cancel()

glog.Infof("Creating StreamWriter\n")
if err := sw.Prepare(); err != nil {
return err
}
Expand All @@ -79,14 +80,43 @@ func (n *node) populateSnapshot(snap pb.Snapshot, pl *conn.Pool) error {
writer = pstore.NewManagedWriteBatch()
}

glog.Infof("Starting to receive snapshot...\n")
lastReceived := time.Now().Unix()
go func() {
tick := time.NewTicker(10 * time.Second)
defer tick.Stop()

for {
select {
case <-ctx.Done():
return
case <-tick.C:
last := atomic.LoadInt64(&lastReceived)
if last == 0 {
continue
}
lastTs := time.Unix(last, 0)
if time.Since(lastTs) > 300*time.Second {
glog.Warningf("Haven't received anything for over 300s." +
" Abandoning snapshot retrieval...\n")
cancel()
}
}
}
}()
// We can use count to check the number of posting lists returned in tests.
size := 0
var done *pb.KVS
for {
// Track when we last received anything from the leader. If we don't
// receive anything for a while, we should abandon this connection.
atomic.StoreInt64(&lastReceived, time.Now().Unix())
kvs, err := stream.Recv()
if err != nil {
return err
return errors.Wrapf(err, "stream.Recv")
}
atomic.StoreInt64(&lastReceived, 0) // We got something.

if kvs.Done {
done = kvs
glog.V(1).Infoln("All key-values have been received.")
Expand Down Expand Up @@ -262,11 +292,87 @@ func doStreamSnapshot(snap *pb.Snapshot, out pb.Worker_StreamSnapshotServer) err
return nil
}

func (n *node) checkForFailedSnapshot() {
tick := time.NewTicker(10 * time.Second)
defer tick.Stop()

loopOverStatus := func() {
glog.V(2).Infof("Entering snapshot monitoring state")
defer func() {
glog.V(2).Infof("Exiting snapshot monitoring state")
}()

var last time.Time

for range tick.C {
if n.closer.Ctx().Err() != nil {
return
}
status := n.Raft().Status()
if status.Lead != status.ID {
// I'm not the leader, so don't do anything.
return
}
var snapshotId uint64
for id, prog := range status.Progress {
if prog.State != raft.ProgressStateSnapshot {

Check failure on line 318 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot

Check failure on line 318 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot
continue
}
snapshotId = id
break
}
if snapshotId == 0 {
// No Alpha is in pending snapshot state.
glog.Infof("No Alpha in pending snapshot state.\n")
return
}
if last.IsZero() {
last = time.Now()
glog.Infof("Registered a pending snapshot for ID: %#x\n", snapshotId)
}

var ongoing bool
for _, t := range GetOngoingTasks() {
if t == "opSnapshot" {
glog.V(2).Infof("Snapshot: Ongoing. All good.\n")
ongoing = true
}
}

if !ongoing && time.Since(last) > 60*time.Second {
// Report snapshot as failed.
glog.Warningf("Reporting snapshot for ID: %#x as failed\n", snapshotId)
n.Raft().ReportSnapshot(snapshotId, raft.SnapshotFailure)
return
}
}
}

for {
select {
case <-n.closer.HasBeenClosed():
return
case <-tick.C:
status := n.Raft().Status()
if status.Lead != status.ID {
// I'm not the leader, so don't do anything.
continue
}
for _, prog := range status.Progress {
if prog.State != raft.ProgressStateSnapshot {

Check failure on line 362 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot) (typecheck)

Check failure on line 362 in worker/snapshot.go

View workflow job for this annotation

GitHub Actions / lint

undefined: raft.ProgressStateSnapshot) (typecheck)
continue
}
loopOverStatus()
}
}
}
}

func (w *grpcWorker) StreamSnapshot(stream pb.Worker_StreamSnapshotServer) error {
// Pause rollups during snapshot streaming.
closer, err := groups().Node.startTask(opSnapshot)
if err != nil {
return err
return errors.Wrapf(err, "startTask on StreamSnapshot failed")
}
defer closer.Done()

Expand Down

0 comments on commit 648b121

Please sign in to comment.