Skip to content

Commit

Permalink
[GHA] replay-verify replays at state snapshot versions
Browse files Browse the repository at this point in the history
so that no work is wasted

1. added `gen-replay-verify-jobs` sub-command to the aptos-debugger, to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This
   way we automatically handle load-test (Tapos-like) situations.
2. A single "prepare" job does the building and jobs generation, and the
   actual replay jobs don't need to build the binary, etc.
  • Loading branch information
msmouse committed Sep 21, 2024
1 parent ce6158a commit 60a1d2b
Show file tree
Hide file tree
Showing 9 changed files with 441 additions and 30 deletions.
1 change: 1 addition & 0 deletions .github/actions/rust-setup/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ runs:
# rust-cache action will cache ~/.cargo and ./target
# https://github.com/Swatinem/rust-cache#cache-details
- name: Run cargo cache
if: !startsWith(github.ref, 'refs/pull/')
uses: Swatinem/rust-cache@359a70e43a0bb8a13953b04a90f76428b4959bb6 # [email protected]
with:
key: ${{ inputs.ADDITIONAL_KEY }}
Expand Down
28 changes: 19 additions & 9 deletions .github/workflows/replay-verify.yaml

Large diffs are not rendered by default.

206 changes: 188 additions & 18 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
      description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -39,7 +43,7 @@ on:
description: "Github job timeout in minutes"
type: number
required: true
default: 720
default: 180
# This allows the workflow to be triggered manually from the Github UI or CLI
# NOTE: because the "number" type is not supported, we default to 720 minute timeout
workflow_dispatch:
Expand All @@ -65,6 +69,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
      description: The optional list of transaction ranges to skip.
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -77,40 +85,202 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
job_ids: ${{ steps.gen-jobs.outputs.job_ids }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: |
aptos-debugger
testsuite/replay_verify.py
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version ${{ inputs.HISTORY_START }} \
--ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
\
--max-ranges-per-job 16 \
--output-json-file jobs.json \
jq -c 'length as $N | [range(0; $N)]' jobs.json > job_ids.json
cat job_ids.json
jq . jobs.json
echo "job_ids=$(cat job_ids.json)" >> $GITHUB_OUTPUT
- name: Cache backup storage config and job definition
uses: actions/cache/save@v4
with:
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}

replay-verify:
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
matrix:
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
job_id: ${{ fromJson(needs.prepare.outputs.job_ids) }}
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
- uses: actions/checkout@v4
- name: Load cached aptos-debugger binary and replay_verify.py script
uses: actions/cache/restore@v4
with:
ref: ${{ inputs.GIT_SHA }}
path: |
aptos-debugger
testsuite/replay_verify.py
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
- name: Load cached backup storage config and job definitions
uses: actions/cache/restore@v4
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
path: |
${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
jobs.json
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Build CLI binaries in release mode
shell: bash
run: cargo build --release -p aptos-debugger

- name: Run replay-verify in parallel
shell: bash
run: testsuite/replay_verify.py ${{ matrix.number }} 19 # first argument is the runner number, second argument is the total number of runners
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
shell: bash
run: |
set -o nounset -o errexit -o pipefail
replay() {
idx=$1
id=$2
begin=$3
end=$4
desc=$5
echo ---------
echo Job start. $id: $desc
echo ---------
MC=metadata_cache_$idx
cp -r metadata_cache $MC
DB=db_$idx
for try in {0..6}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
res=0
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir $MC \
--command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
--start-version $begin \
--end-version $end \
\
--lazy-quit \
--enable-storage-sharding \
--target-db-dir $DB \
--concurrent-downloads 8 \
--replay-concurrency-level 2 \
|| res=$?
if [[ $res == 0 || $res == 2 ]]
then
return $res
fi
done
return 1
}
pids=()
idx=0
while read id begin end desc; do
replay $idx $id $begin $end "$desc" 2>&1 | sed "s/^/[partition $idx]: /" &
pids[$idx]=$!
idx=$((idx+1))
done < <(jq '.[${{ matrix.job_id }}][]' jobs.json)
res=0
for idx in `seq 0 $((idx-1))`
do
range_res=0
wait ${pids[$idx]} || range_res=$?
echo partition $idx returned $range_res
if [[ $range_res != 0 ]]
then
res=$range_res
fi
done
echo All partitions done, returning $res
exit $res
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 23 additions & 3 deletions execution/executor/src/chunk_executor.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ use std::{
atomic::{AtomicBool, Ordering},
Arc,
},
time::Instant,
};

pub static SIG_VERIFY_POOL: Lazy<Arc<rayon::ThreadPool>> = Lazy::new(|| {
Expand Down Expand Up @@ -598,9 +599,11 @@ impl<V: VMExecutor> TransactionReplayer for ChunkExecutorInner<V> {
mut event_vecs: Vec<Vec<ContractEvent>>,
verify_execution_mode: &VerifyExecutionMode,
) -> Result<()> {
let started = Instant::now();
let num_txns = transactions.len();
let mut latest_view = self.commit_queue.lock().expect_latest_view()?;
let chunk_begin = latest_view.num_transactions() as Version;
let chunk_end = chunk_begin + transactions.len() as Version; // right-exclusive
let chunk_end = chunk_begin + num_txns as Version; // right-exclusive

// Find epoch boundaries.
let mut epochs = Vec::new();
Expand Down Expand Up @@ -636,11 +639,28 @@ impl<V: VMExecutor> TransactionReplayer for ChunkExecutorInner<V> {

self.commit_queue
.lock()
.enqueue_chunk_to_commit_directly(executed_chunk.expect("Nothing to commit."))
.enqueue_chunk_to_commit_directly(executed_chunk.expect("Nothing to commit."))?;
info!(
num_txns = num_txns,
tps = (num_txns as f64 / started.elapsed().as_secs_f64()),
"TransactionReplayer::replay() OK"
);

Ok(())
}

fn commit(&self) -> Result<ExecutedChunk> {
self.commit_chunk_impl()
let started = Instant::now();

let chunk = self.commit_chunk_impl()?;

let num_committed = chunk.transactions_to_commit().len();
info!(
num_committed = num_committed,
tps = num_committed as f64 / started.elapsed().as_secs_f64(),
"TransactionReplayer::commit() OK"
);
Ok(chunk)
}
}

Expand Down
4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
&self.state_snapshot_backups
}

pub fn select_state_snapshot(
&self,
target_version: Version,
Expand Down
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
Expand Down
Loading

0 comments on commit 60a1d2b

Please sign in to comment.