Skip to content

Commit

Permalink
aptos-debugger: gen-replay-verify-jobs command
Browse files Browse the repository at this point in the history
  • Loading branch information
msmouse committed Sep 12, 2024
1 parent 3da5ac6 commit 22de2b8
Show file tree
Hide file tree
Showing 6 changed files with 155 additions and 3 deletions.
23 changes: 20 additions & 3 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,22 +77,39 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
build:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
ranges: ${{ steps.gen-jobs.outputs.ranges }}
steps:
- uses: aptos-labs/aptos-core/.github/actions/get-aptos-debugger@0911-alden-cache-build
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
GIT_SHA: ${{ inputs.GIT_SHA || github.sha }}
- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
target/release/aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-path ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--output-json-file job_ranges.json
echo "ranges=$(cat job_ranges.json)" >> "$GITHUB_OUTPUT"
replay-verify:
needs: build
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
max-parallel: 16
matrix:
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
range: [1, 2]
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
Expand Down
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

/// Returns a read-only view of every state snapshot backup recorded in this
/// metadata view, in the order they are stored.
pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
    self.state_snapshot_backups.as_slice()
}

pub fn select_state_snapshot(
&self,
target_version: Version,
Expand Down
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
Expand Down
125 changes: 125 additions & 0 deletions storage/db-tool/src/gen_replay_verify_jobs.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
// Copyright (c) Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use aptos_backup_cli::{
metadata::{
cache::{sync_and_load, MetadataCacheOpt},
StateSnapshotBackupMeta,
},
storage::DBToolStorageOpt,
utils::ConcurrentDownloadsOpt,
};
use aptos_logger::warn;
use aptos_types::transaction::Version;
use clap::Parser;
use itertools::Itertools;
use std::{io::Write, iter::once, path::PathBuf};

// CLI options for the `gen-replay-verify-jobs` sub-command: scan the backup
// metadata and emit transaction-version ranges suitable for parallel
// replay-verify jobs.
// NOTE: field docs are written as `//` comments on purpose — `///` doc
// comments would be picked up by clap as help/about text and change the CLI's
// runtime help output.
#[derive(Parser)]
pub struct Opt {
    // Location/behavior of the local backup metadata cache (synced before use).
    #[clap(flatten)]
    metadata_cache_opt: MetadataCacheOpt,
    // Which backup storage to read from (command-adapter config, etc.).
    #[clap(flatten)]
    storage: DBToolStorageOpt,
    // Parallelism for downloading metadata files.
    #[clap(flatten)]
    concurrent_downloads: ConcurrentDownloadsOpt,
    #[clap(
        long,
        help = "The first transaction version required to be replayed and verified. [Defaults to 0]"
    )]
    start_version: Option<Version>,
    // Soft cap on the number of transactions per generated job; a range wider
    // than this is cut short (see `run`). Default: 20M transactions.
    #[clap(
        long,
        help = "Target number of transactions for each job to replay",
        default_value = "20000000"
    )]
    target_job_size: u64,
    // Only epochs within `max_epochs` of the latest snapshot epoch are
    // considered; older history is not replayed. Default: 4000 epochs.
    #[clap(
        long,
        help = "Determines the oldest epoch to replay, relative to the latest",
        default_value = "4000"
    )]
    max_epochs: u64,
    // Path to write the generated ranges to, as a JSON array of
    // "<begin> <end>" strings. Derived flag name: `--output-json-file`.
    #[clap(long, help = "Output job ranges")]
    output_json_file: PathBuf,
}

impl Opt {
    /// Generates replay-verify job ranges from the state snapshots recorded in
    /// the backup metadata, and writes them to `self.output_json_file` as a
    /// JSON array of "<begin_version> <end_version>" strings.
    ///
    /// # Errors
    /// Propagates storage-init, metadata-sync and file-write failures.
    ///
    /// # Panics
    /// Panics (via `expect`) if the backups contain no transactions or no
    /// state snapshots at all.
    pub async fn run(self) -> anyhow::Result<()> {
        let storage = self.storage.init_storage().await?;
        // Sync the metadata cache from backup storage and load it.
        let metadata_view = sync_and_load(
            &self.metadata_cache_opt,
            storage,
            self.concurrent_downloads.get(),
        )
        .await?;

        let storage_state = metadata_view.get_storage_state()?;
        // Exclusive end: one past the latest backed-up transaction version.
        let global_end_version = storage_state
            .latest_transaction_version
            .expect("No transaction backups.")
            + 1;
        let latest_epoch = storage_state
            .latest_state_snapshot_epoch
            .expect("No state snapshots.");
        // Clamp so we never underflow below epoch 0.
        let max_epochs = self.max_epochs.min(latest_epoch + 1);
        // Oldest epoch whose snapshot we will consider as a job boundary.
        let global_min_epoch = latest_epoch + 1 - max_epochs;

        // Sentinel "snapshot" at the global end version so the last real
        // snapshot also gets a (snapshot, end) range via tuple_windows below.
        let fake_end = StateSnapshotBackupMeta {
            epoch: latest_epoch,
            version: global_end_version,
            manifest: "".to_string(),
        };
        let job_ranges = metadata_view
            .all_state_snapshots()
            .iter()
            // Drop snapshots older than the epoch window.
            .skip_while(|s| s.epoch < global_min_epoch)
            .chain(once(&fake_end))
            .collect_vec()
            .iter()
            // Walk newest-first; each window is (end_snapshot, begin_snapshot).
            .rev()
            .tuple_windows()
            // to simplify things, if start_version appears in the middle of a range, give up the range
            .take_while(|(_end, begin)| begin.version >= self.start_version.unwrap_or(0))
            .peekable()
            .batching(|it| {
                match it.next() {
                    Some((end, mut begin)) => {
                        if end.version - begin.version >= self.target_job_size {
                            // cut big range short, this hopefully automatically skips load tests
                            // NOTE(review): versions between begin+target_job_size and
                            // end are intentionally not covered by any job here.
                            let msg = if end.epoch - begin.epoch > 15 {
                                "!!! Need more snapshots !!!"
                            } else {
                                ""
                            };
                            warn!(
                                begin = begin,
                                end = end,
                                "Big gap between snapshots. {} versions in {} epochs. {}",
                                end.version - begin.version,
                                end.epoch - begin.epoch,
                                msg,
                            );
                            Some((begin.version, begin.version + self.target_job_size))
                        } else {
                            // Greedily merge earlier (older) windows into this
                            // job while the combined span stays within the
                            // target size.
                            while let Some((_prev_end, prev_begin)) = it.peek() {
                                if end.version - prev_begin.version > self.target_job_size {
                                    break;
                                }
                                begin = prev_begin;
                                let _ = it.next();
                            }
                            Some((begin.version, end.version))
                        }
                    },
                    None => None,
                }
            })
            // Emit each job as "<begin> <end>", consumed by the CI workflow.
            .map(|(begin, end)| format!("{} {}", begin, end))
            .collect_vec();

        std::fs::File::create(&self.output_json_file)?.write_all(&serde_json::to_vec(&job_ranges)?)?;

        Ok(())
    }
}
4 changes: 4 additions & 0 deletions storage/db-tool/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ extern crate core;
mod backup;
mod backup_maintenance;
mod bootstrap;
mod gen_replay_verify_jobs;
mod replay_verify;
pub mod restore;
#[cfg(test)]
Expand Down Expand Up @@ -33,6 +34,8 @@ pub enum DBTool {

ReplayVerify(replay_verify::Opt),

GenReplayVerifyJobs(gen_replay_verify_jobs::Opt),

#[clap(subcommand)]
Restore(restore::Command),
}
Expand All @@ -49,6 +52,7 @@ impl DBTool {
info!("Replay verify result: {:?}", ret);
ret
},
DBTool::GenReplayVerifyJobs(cmd) => cmd.run().await,
DBTool::Restore(cmd) => cmd.run().await,
}
}
Expand Down

0 comments on commit 22de2b8

Please sign in to comment.