From 90bc3c610554c4dad55b9b81a09f717c1d89580c Mon Sep 17 00:00:00 2001 From: Bo Wu Date: Wed, 15 Nov 2023 12:03:40 -0800 Subject: [PATCH] [storage] fix the replay-verify stuck --- .../src/coordinators/replay_verify.rs | 5 +- storage/db-tool/src/replay_verify.rs | 70 ++++++++----------- testsuite/replay_verify.py | 15 +++- testsuite/replay_verify_run_local.py | 2 +- 4 files changed, 47 insertions(+), 45 deletions(-) diff --git a/storage/backup/backup-cli/src/coordinators/replay_verify.rs b/storage/backup/backup-cli/src/coordinators/replay_verify.rs index 0d91e420ac16e..6142ffc3bc596 100644 --- a/storage/backup/backup-cli/src/coordinators/replay_verify.rs +++ b/storage/backup/backup-cli/src/coordinators/replay_verify.rs @@ -144,6 +144,9 @@ impl ReplayVerifyCoordinator { ); } + // Once it begins replay, we want to directly start from the version that failed + let save_start_version = (next_txn_version > 0).then_some(next_txn_version); + next_txn_version = std::cmp::max(next_txn_version, snapshot_version.map_or(0, |v| v + 1)); let transactions = metadata_view.select_transaction_backups( @@ -185,7 +188,7 @@ impl ReplayVerifyCoordinator { .into_iter() .map(|t| t.manifest) .collect::<Vec<_>>(), - None, + save_start_version, Some((next_txn_version, false)), /* replay_from_version */ None, /* epoch_history */ self.verify_execution_mode.clone(), diff --git a/storage/db-tool/src/replay_verify.rs b/storage/db-tool/src/replay_verify.rs index 2c33ed49adca9..7bb81b6eb86fa 100644 --- a/storage/db-tool/src/replay_verify.rs +++ b/storage/db-tool/src/replay_verify.rs @@ -1,7 +1,7 @@ // Copyright © Aptos Foundation // SPDX-License-Identifier: Apache-2.0 -use anyhow::{bail, Result}; +use anyhow::Result; use aptos_backup_cli::{ coordinators::replay_verify::{ReplayError, ReplayVerifyCoordinator}, metadata::cache::MetadataCacheOpt, @@ -17,7 +17,7 @@ use aptos_executor_types::VerifyExecutionMode; use aptos_logger::info; use aptos_types::transaction::Version; use clap::Parser; -use 
std::{path::PathBuf, sync::Arc}; +use std::{path::PathBuf, process, sync::Arc}; /// Read the backup files, replay them and verify the modules #[derive(Parser)] @@ -59,8 +59,6 @@ pub struct Opt { lazy_quit: bool, } -const RETRY_ATTEMPT: u8 = 5; - impl Opt { pub async fn run(self) -> Result<()> { let restore_handler = Arc::new(AptosDB::open_kv_only( @@ -73,46 +71,34 @@ impl Opt { DEFAULT_MAX_NUM_NODES_PER_LRU_CACHE_SHARD, )?) .get_restore_handler(); - let mut attempt = 0; - while attempt < RETRY_ATTEMPT { - let ret = ReplayVerifyCoordinator::new( - self.storage.clone().init_storage().await?, - self.metadata_cache_opt.clone(), - self.trusted_waypoints_opt.clone(), - self.concurrent_downloads.get(), - self.replay_concurrency_level.get(), - restore_handler.clone(), - self.start_version.unwrap_or(0), - self.end_version.unwrap_or(Version::MAX), - self.validate_modules, - VerifyExecutionMode::verify_except(self.txns_to_skip.clone()) - .set_lazy_quit(self.lazy_quit), - )? - .run() - .await; - match ret { - Err(e) => match e { - ReplayError::TxnMismatch => { - info!("ReplayVerify coordinator exiting with Txn output mismatch error."); - break; - }, - _ => { - info!( - "ReplayVerify coordinator retrying with attempt {}.", - attempt - ); - }, + let ret = ReplayVerifyCoordinator::new( + self.storage.init_storage().await?, + self.metadata_cache_opt, + self.trusted_waypoints_opt, + self.concurrent_downloads.get(), + self.replay_concurrency_level.get(), + restore_handler, + self.start_version.unwrap_or(0), + self.end_version.unwrap_or(Version::MAX), + self.validate_modules, + VerifyExecutionMode::verify_except(self.txns_to_skip).set_lazy_quit(self.lazy_quit), + )? 
+ .run() + .await; + match ret { + Err(e) => match e { + ReplayError::TxnMismatch => { + info!("ReplayVerify coordinator exiting with Txn output mismatch error."); + process::exit(2); }, _ => { - info!("ReplayVerify coordinator succeeded"); - return Ok(()); + process::exit(1); }, - } - attempt += 1; - } - bail!( - "ReplayVerify coordinator failed after {} attempts.", - RETRY_ATTEMPT - ) + }, + _ => { + info!("ReplayVerify coordinator succeeded"); + }, + }; + Ok(()) } } diff --git a/testsuite/replay_verify.py b/testsuite/replay_verify.py index 9718210ff2458..464ff7a8902f3 100755 --- a/testsuite/replay_verify.py +++ b/testsuite/replay_verify.py @@ -71,6 +71,18 @@ ] +# retry the replay_verify_partition if it fails +def retry_replay_verify_partition(func, *args, **kwargs) -> Tuple[int, int, bytes]: + (partition_number, code, msg) = (0, 0, b"") + NUM_OF_RETRIES = 3 + for i in range(1, NUM_OF_RETRIES + 1): + print(f"try {i}") + (partition_number, code, msg) = func(*args, **kwargs) + if code != 1: + break + return (partition_number, code, msg) + + def replay_verify_partition( n: int, N: int, @@ -208,9 +220,10 @@ def main(runner_no=None, runner_cnt=None, start_version=None, end_version=None): with Pool(N) as p: all_partitions = p.starmap( - replay_verify_partition, + retry_replay_verify_partition, [ ( + replay_verify_partition, n, N, runner_start, diff --git a/testsuite/replay_verify_run_local.py b/testsuite/replay_verify_run_local.py index ab8bcaa50a2d7..19756a56180e2 100755 --- a/testsuite/replay_verify_run_local.py +++ b/testsuite/replay_verify_run_local.py @@ -46,5 +46,5 @@ def local_setup(): if __name__ == "__main__": local_setup() replay_verify.main( - runner_no=None, runner_cnt=None, start_version=261693085, end_version=267000000 + runner_no=None, runner_cnt=None, start_version=291217350, end_version=292975771 )