Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[storage] fix the replay-verify stuck #10930

Merged
merged 1 commit into from
Nov 28, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,9 @@ impl ReplayVerifyCoordinator {
);
}

// Once it begins replay, we want to directly start from the version that failed
let save_start_version = (next_txn_version > 0).then_some(next_txn_version);

next_txn_version = std::cmp::max(next_txn_version, snapshot_version.map_or(0, |v| v + 1));

let transactions = metadata_view.select_transaction_backups(
Expand Down Expand Up @@ -185,7 +188,7 @@ impl ReplayVerifyCoordinator {
.into_iter()
.map(|t| t.manifest)
.collect::<Vec<_>>(),
None,
save_start_version,
Some((next_txn_version, false)), /* replay_from_version */
None, /* epoch_history */
self.verify_execution_mode.clone(),
Expand Down
70 changes: 28 additions & 42 deletions storage/db-tool/src/replay_verify.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Copyright © Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use anyhow::{bail, Result};
use anyhow::Result;
use aptos_backup_cli::{
coordinators::replay_verify::{ReplayError, ReplayVerifyCoordinator},
metadata::cache::MetadataCacheOpt,
Expand All @@ -17,7 +17,7 @@ use aptos_executor_types::VerifyExecutionMode;
use aptos_logger::info;
use aptos_types::transaction::Version;
use clap::Parser;
use std::{path::PathBuf, sync::Arc};
use std::{path::PathBuf, process, sync::Arc};

/// Read the backup files, replay them and verify the modules
#[derive(Parser)]
Expand Down Expand Up @@ -59,8 +59,6 @@ pub struct Opt {
lazy_quit: bool,
}

const RETRY_ATTEMPT: u8 = 5;

impl Opt {
pub async fn run(self) -> Result<()> {
let restore_handler = Arc::new(AptosDB::open_kv_only(
Expand All @@ -73,46 +71,34 @@ impl Opt {
DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD,
)?)
.get_restore_handler();
let mut attempt = 0;
while attempt < RETRY_ATTEMPT {
let ret = ReplayVerifyCoordinator::new(
self.storage.clone().init_storage().await?,
self.metadata_cache_opt.clone(),
self.trusted_waypoints_opt.clone(),
self.concurrent_downloads.get(),
self.replay_concurrency_level.get(),
restore_handler.clone(),
self.start_version.unwrap_or(0),
self.end_version.unwrap_or(Version::MAX),
self.validate_modules,
VerifyExecutionMode::verify_except(self.txns_to_skip.clone())
.set_lazy_quit(self.lazy_quit),
)?
.run()
.await;
match ret {
Err(e) => match e {
ReplayError::TxnMismatch => {
info!("ReplayVerify coordinator exiting with Txn output mismatch error.");
break;
},
_ => {
info!(
"ReplayVerify coordinator retrying with attempt {}.",
attempt
);
},
let ret = ReplayVerifyCoordinator::new(
self.storage.init_storage().await?,
self.metadata_cache_opt,
self.trusted_waypoints_opt,
self.concurrent_downloads.get(),
self.replay_concurrency_level.get(),
restore_handler,
self.start_version.unwrap_or(0),
self.end_version.unwrap_or(Version::MAX),
self.validate_modules,
VerifyExecutionMode::verify_except(self.txns_to_skip).set_lazy_quit(self.lazy_quit),
)?
.run()
.await;
match ret {
Err(e) => match e {
ReplayError::TxnMismatch => {
info!("ReplayVerify coordinator exiting with Txn output mismatch error.");
process::exit(2);
},
_ => {
info!("ReplayVerify coordinator succeeded");
return Ok(());
process::exit(1);
},
}
attempt += 1;
}
bail!(
"ReplayVerify coordinator failed after {} attempts.",
RETRY_ATTEMPT
)
},
_ => {
info!("ReplayVerify coordinator succeeded");
},
};
Ok(())
}
}
15 changes: 14 additions & 1 deletion testsuite/replay_verify.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,18 @@
]


# Retry the wrapped partition function (up to three attempts) while it reports exit code 1.
def retry_replay_verify_partition(func, *args, **kwargs) -> Tuple[int, int, bytes]:
    """Call ``func(*args, **kwargs)`` repeatedly until it stops reporting code 1.

    ``func`` must return a 3-item result ``(partition_number, code, msg)``.
    The call is attempted at most three times; a new attempt is made only
    when the returned ``code`` equals 1. Any other code ends the loop
    immediately.

    Returns:
        The ``(partition_number, code, msg)`` tuple from the last attempt.
    """
    max_attempts = 3
    # Fallback value; it is overwritten by the first attempt.
    outcome = (0, 0, b"")
    for attempt in range(1, max_attempts + 1):
        print(f"try {attempt}")
        part_no, exit_code, output = func(*args, **kwargs)
        outcome = (part_no, exit_code, output)
        if exit_code != 1:
            # Only code 1 is treated as retryable; everything else is final.
            return outcome
    return outcome


def replay_verify_partition(
n: int,
N: int,
Expand Down Expand Up @@ -208,9 +220,10 @@ def main(runner_no=None, runner_cnt=None, start_version=None, end_version=None):

with Pool(N) as p:
all_partitions = p.starmap(
replay_verify_partition,
retry_replay_verify_partition,
[
(
replay_verify_partition,
n,
N,
runner_start,
Expand Down
2 changes: 1 addition & 1 deletion testsuite/replay_verify_run_local.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,5 @@ def local_setup():
if __name__ == "__main__":
local_setup()
replay_verify.main(
runner_no=None, runner_cnt=None, start_version=261693085, end_version=267000000
runner_no=None, runner_cnt=None, start_version=291217350, end_version=292975771
)
Loading