From ebc250457426e6c741aab9d7efb9de494660ec37 Mon Sep 17 00:00:00 2001 From: Henry Sun Date: Thu, 12 Dec 2024 11:49:00 -0800 Subject: [PATCH] Cleanup snapshot if snapshot loading fails Summary: Delete a moved-in snapshot if opening the snapshot fails to avoid situations where a previous failure leaves behind files that prevents future snapshots from being moved to the correct locations. Reviewed By: jaher Differential Revision: D67121203 fbshipit-source-id: ff996dbce15ddb4bc3db85b89c3065eb12ecac4a --- src/wa_raft_server.erl | 36 +++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/src/wa_raft_server.erl b/src/wa_raft_server.erl index 607e7bb..c348b24 100644 --- a/src/wa_raft_server.erl +++ b/src/wa_raft_server.erl @@ -767,21 +767,27 @@ stalled({call, From}, ?SNAPSHOT_AVAILABLE_COMMAND(Root, #raft_log_pos{index = Sn catch filelib:ensure_dir(Path), case prim_file:rename(Root, Path) of ok -> - ?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p", - [Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}), - ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos), - {ok, View1} = wa_raft_log:reset(View0, SnapshotPos), - State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex}, - State2 = load_config(State1), - ?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.", - [Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}), - State3 = case SnapshotTerm > CurrentTerm of - true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2); - false -> State2 - end, - % At this point, we assume that we received some cluster membership configuration from - % our peer so it is safe to transition to an operational state. - {next_state, follower, State3, [{reply, From, ok}]}; + try + ?LOG_NOTICE("Server[~0p, term ~0p, stalled] applying snapshot ~p:~p", + [Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}), + ok = wa_raft_storage:open_snapshot(Storage, SnapshotPos), + {ok, View1} = wa_raft_log:reset(View0, SnapshotPos), + State1 = State0#raft_state{log_view = View1, last_applied = SnapshotIndex, commit_index = SnapshotIndex}, + State2 = load_config(State1), + ?LOG_NOTICE("Server[~0p, term ~0p, stalled] switching to follower after installing snapshot at ~p:~p.", + [Name, CurrentTerm, SnapshotIndex, SnapshotTerm], #{domain => [whatsapp, wa_raft]}), + State3 = case SnapshotTerm > CurrentTerm of + true -> advance_term(?FUNCTION_NAME, SnapshotTerm, undefined, State2); + false -> State2 + end, + % At this point, we assume that we received some cluster membership configuration from + % our peer so it is safe to transition to an operational state. + {next_state, follower, State3, [{reply, From, ok}]} + after + % It is assumed that the loading of the snapshot will move the snapshot away or + % otherwise disassociate the storage state from the snapshot path. + catch file:del_dir_r(Path) + end; {error, Reason} -> ?LOG_WARNING("Server[~0p, term ~0p, stalled] failed to rename available snapshot ~p to ~p due to ~p", [Name, CurrentTerm, Root, Path, Reason], #{domain => [whatsapp, wa_raft]}),