[GHA] replay-verify replays at state snapshot versions
so that no work is wasted

1. Added a `gen-replay-verify-jobs` sub-command to the aptos-debugger to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This way
   we handle load-test (Tapos-like) situations automatically; see the
   sketch after this list.
2. Each job runs only one replay -- there are no longer "partitions".
   Instead, we issue many more jobs with concurrency control. This way
   jobs run in "waves" and load balancing is achieved automatically.
3. A single "prepare" job does the building and job generation, so the
   actual replay jobs don't need to build the binary, etc.
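
A minimal sketch of the job-generation idea (hypothetical and simplified;
the real implementation in storage/db-tool/src/gen_replay_verify_jobs.rs
below iterates from the latest snapshot backwards, tracks epochs, and emits
job names and descriptions):

    /// Turn state snapshot versions into replay ranges of roughly
    /// `target` transactions each.
    fn gen_jobs(mut snapshots: Vec<u64>, global_end: u64, target: u64) -> Vec<(u64, u64)> {
        snapshots.push(global_end);
        let mut jobs = Vec::new();
        let mut i = 0;
        while i + 1 < snapshots.len() {
            let begin = snapshots[i];
            let mut end = snapshots[i + 1];
            if end - begin >= target {
                // Huge gap between adjacent snapshots (e.g. a load test):
                // truncate, leaving the rest of the gap unreplayed.
                end = begin + target;
            } else {
                // Merge small adjacent ranges until the job is big enough.
                while i + 2 < snapshots.len() && snapshots[i + 2] - begin <= target {
                    i += 1;
                    end = snapshots[i + 1];
                }
            }
            jobs.push((begin, end));
            i += 1;
        }
        jobs
    }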

TODO: pre-install the gcloud CLI on the runners so that we don't spend
30-40 seconds in each job installing it.
msmouse committed Sep 13, 2024
1 parent da106a1 commit e1a4d0e
Showing 7 changed files with 330 additions and 19 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/replay-verify.yaml

Large diffs are not rendered by default.

173 changes: 159 additions & 14 deletions .github/workflows/workflow-run-replay-verify.yaml
@@ -77,40 +77,185 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
ranges: ${{ steps.gen-jobs.outputs.ranges }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--output-json-file job_ranges.json \
--start-version $HISTORY_START
echo "ranges=$(cat job_ranges.json)" >> $GITHUB_OUTPUT
cat job_ranges.json | jq || true
- name: Cache backup storage config so the replay jobs don't need to check out the entire repo
uses: actions/cache/save@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}

replay-verify:
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
max-parallel: 16
matrix:
range: ${{ fromJson(needs.prepare.outputs.ranges) }}
steps:
- name: Parse job - ${{ matrix.range }}
id: parse-job
shell: bash
run: |
# `desc` soaks up the rest of the line, since the description contains spaces
read name begin end desc <<< "${{ matrix.range }}"
echo name=$name >> $GITHUB_OUTPUT
echo begin=$begin >> $GITHUB_OUTPUT
echo end=$end >> $GITHUB_OUTPUT
echo desc=$desc >> $GITHUB_OUTPUT
- name: Load cached aptos-debugger binary
uses: actions/cache/restore@v4
with:
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config
uses: actions/cache/restore@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: phase 1 - restore snapshot, with retries
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.begin }} \
\
--lazy-quit \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
- name: phase 2 - replay-verify transactions, with retries
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.end }} \
\
--lazy-quit \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
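
For illustration, the `ranges` output that the matrix above feeds through
`fromJson` is a JSON array of space-separated "name begin end desc" strings
written by gen-replay-verify-jobs. A hypothetical job_ranges.json (all
numbers made up):

[
  "1 987000000 1002000000 Replay epoch 7100 - 7103, 15000000 txns starting from version 987000000.",
  "2-Partial 950000000 970000000 Partial replay epoch 7080 - 7099, 20000000 txns starting from version 950000000, another 17000000 versions omitted, until 987000000."
]
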
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
@@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
&self.state_snapshot_backups
}

pub fn select_state_snapshot(
&self,
target_version: Version,
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
@@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
153 changes: 153 additions & 0 deletions storage/db-tool/src/gen_replay_verify_jobs.rs
@@ -0,0 +1,153 @@
// Copyright (c) Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use aptos_backup_cli::{
metadata::{
cache::{sync_and_load, MetadataCacheOpt},
StateSnapshotBackupMeta,
},
storage::DBToolStorageOpt,
utils::ConcurrentDownloadsOpt,
};
use aptos_logger::warn;
use aptos_types::transaction::Version;
use clap::Parser;
use itertools::Itertools;
use std::{io::Write, iter::once, path::PathBuf};

#[derive(Parser)]
pub struct Opt {
#[clap(flatten)]
metadata_cache_opt: MetadataCacheOpt,
#[clap(flatten)]
storage: DBToolStorageOpt,
#[clap(flatten)]
concurrent_downloads: ConcurrentDownloadsOpt,
#[clap(
long,
help = "The first transaction version required to be replayed and verified. [Defaults to 0]"
)]
start_version: Option<Version>,
#[clap(
long,
help = "Target number of transactions for each job to replay",
default_value = "20000000"
)]
target_job_size: u64,
#[clap(
long,
help = "Determines the oldest epoch to replay, relative to the latest",
default_value = "4000"
)]
max_epochs: u64,
#[clap(long, help = "Output job ranges")]
output_json_file: PathBuf,
}

impl Opt {
pub async fn run(self) -> anyhow::Result<()> {
let storage = self.storage.init_storage().await?;
let metadata_view = sync_and_load(
&self.metadata_cache_opt,
storage,
self.concurrent_downloads.get(),
)
.await?;

let storage_state = metadata_view.get_storage_state()?;
let global_end_version = storage_state
.latest_transaction_version
.expect("No transaction backups.")
+ 1;
let latest_epoch = storage_state
.latest_state_snapshot_epoch
.expect("No state snapshots.");
let max_epochs = self.max_epochs.min(latest_epoch + 1);
let global_min_epoch = latest_epoch + 1 - max_epochs;

let fake_end = StateSnapshotBackupMeta {
epoch: latest_epoch,
version: global_end_version,
manifest: "".to_string(),
};
let mut job_idx = 0;
let job_ranges = metadata_view
.all_state_snapshots()
.iter()
.filter(|s| s.epoch >= global_min_epoch && s.version <= global_end_version)
.chain(once(&fake_end))
.collect_vec()
.iter()
.rev()
.tuple_windows()
// to simplify things, if start_version appears in the middle of a range, give up the range
.take_while(|(_end, begin)| begin.version >= self.start_version.unwrap_or(0))
.peekable()
.batching(|it| {
job_idx += 1;
match it.next() {
Some((end, mut begin)) => {
if end.version - begin.version >= self.target_job_size {
// cut big range short, this hopefully automatically skips load tests
let msg = if end.epoch - begin.epoch > 15 {
"!!! Need more snapshots !!!"
} else {
""
};
warn!(
begin = begin,
end = end,
"Big gap between snapshots. {} versions in {} epochs. {}",
end.version - begin.version,
end.epoch - begin.epoch,
msg,
);
Some((
format!("{job_idx}-Partial"),
begin.version,
begin.version + self.target_job_size,
format!(
"Partial replay epoch {} - {}, {} txns starting from version {}, another {} versions omitted, until {}. {}",
begin.epoch,
end.epoch - 1,
self.target_job_size,
begin.version,
end.version - begin.version - self.target_job_size,
end.version,
msg
)
))
} else {
while let Some((_prev_end, prev_begin)) = it.peek() {
if end.version - prev_begin.version > self.target_job_size {
break;
}
begin = prev_begin;
let _ = it.next();
}
Some((
format!("{job_idx}"),
begin.version,
end.version,
format!(
"Replay epoch {} - {}, {} txns starting from version {}.",
begin.epoch,
end.epoch - 1,
end.version - begin.version,
begin.version,
)
))
}
},
None => None,
}
})
.map(|(name, begin, end, desc)| format!("{name} {begin} {end} {desc}"))
.collect_vec();

std::fs::File::create(&self.output_json_file)?
.write_all(&serde_json::to_vec(&job_ranges)?)?;

Ok(())
}
}
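
For reference, a local invocation of the new sub-command might look like
this (config path and flag values are illustrative; the flags map to the
fields of `Opt` above):

./aptos-debugger aptos-db gen-replay-verify-jobs \
    --metadata-cache-dir ./metadata_cache \
    --command-adapter-config ./backup_config.yaml \
    --output-json-file job_ranges.json \
    --target-job-size 20000000 \
    --max-epochs 4000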
