Skip to content

Commit

Permalink
[GHA] replay-verify replays at state snapshot versions
Browse files Browse the repository at this point in the history
so that no work is wasted

1. added `gen-replay-verify-jobs` sub-command to the aptos-debugger, to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This
   way we deal with the load-test (Tapos) kind of situation automatically.
2. Each job runs only one replay -- there's no longer "partitions".
   Instead, we issue a lot more jobs with concurrency control. This way
   jobs run in "waves" and "load balancing" is automatically achieved.
3. A single "prepare" job does the building and jobs generation, and the
   actual replay jobs don't need to build the binary, etc.

Todo: pre-load gcloud command to the runners so that we don't spend 30-40
seconds in each job to install it.
  • Loading branch information
msmouse committed Sep 12, 2024
1 parent da106a1 commit 722b18b
Show file tree
Hide file tree
Showing 7 changed files with 322 additions and 16 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ on:
pull_request:
paths:
- ".github/workflows/replay-verify.yaml"
- ".github/workflows/workflow-run-replay-verify.yaml"
- "testsuite/replay_verify.py"
schedule:
- cron: "0 22 * * 0,2,4" # The main branch cadence. This runs every Sun,Tues,Thurs
Expand Down Expand Up @@ -84,7 +85,7 @@ jobs:
# replay-verify config
BUCKET: ${{ inputs.MAINNET_BUCKET || 'aptos-mainnet-backup' }}
SUB_DIR: e1
HISTORY_START: 0
HISTORY_START: 518000000
TXNS_TO_SKIP: 12253479 12277499 148358668
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/gcs.yaml
# workflow config
Expand All @@ -101,7 +102,7 @@ jobs:
# replay-verify config
BUCKET: ${{ inputs.TESTNET_BUCKET || 'aptos-testnet-backup' }}
SUB_DIR: e1
HISTORY_START: 250000000 # TODO: We need an exhaustive list of txns_to_skip before we can set this to 0.
HISTORY_START: 862000000 # TODO: We need an exhaustive list of txns_to_skip before we can set this to 0.
TXNS_TO_SKIP: 46874937 151020059 409163615 409163669 409163708 409163774 409163845 409163955 409164059 409164191 414625832
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/gcs.yaml
# workflow config
Expand Down
170 changes: 156 additions & 14 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -77,40 +77,182 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
prepare:
  # Builds (or restores from cache) the aptos-debugger binary, refreshes the
  # backup metadata cache, and publishes the generated replay ranges as the
  # `ranges` job output.
  runs-on: ${{ inputs.RUNS_ON }}
  outputs:
    ranges: ${{ steps.gen-jobs.outputs.ranges }}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        ref: ${{ inputs.GIT_SHA }}

    # The binary is cached per commit, so re-runs on the same SHA skip the build.
    - name: Load cached aptos-debugger binary
      id: cache-aptos-debugger-binary
      uses: actions/cache@v4
      with:
        path: target/release/aptos-debugger
        key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

    - name: Prepare for build if not cached
      if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
      uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
      with:
        GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

    - name: Build and strip aptos-debugger binary if not cached
      if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
      shell: bash
      run: |
        cargo build --release -p aptos-debugger
        strip -s target/release/aptos-debugger

    - name: Install GCloud SDK
      uses: "google-github-actions/setup-gcloud@v2"
      with:
        version: ">= 418.0.0"
        install_components: "kubectl,gke-gcloud-auth-plugin"

    - name: get timestamp to use in cache key
      id: get-timestamp
      run: echo "ts=$(date +%s)" >> "$GITHUB_OUTPUT"

    # The timestamped key differs on every run, so the most recent entry is
    # restored through the `restore-keys` prefix and a new entry is saved
    # under the fresh key (per the step name, "save back afterwards").
    - name: Load cached backup storage metadata cache dir (and save back afterwards)
      uses: actions/cache@v4
      with:
        path: metadata_cache
        key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
        restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

    - name: Generate job ranges
      id: gen-jobs
      env:
        # NOTE(review): BUCKET, SUB_DIR and HISTORY_START are exported but not
        # referenced by the script below; presumably they are consumed through
        # the backup config template or by the tool itself - confirm.
        BUCKET: ${{ inputs.BUCKET }}
        SUB_DIR: ${{ inputs.SUB_DIR }}
        HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
        BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
      run: |
        target/release/aptos-debugger aptos-db gen-replay-verify-jobs \
          --metadata-cache-dir ./metadata_cache \
          --command-adapter-config "$BACKUP_CONFIG_TEMPLATE_PATH" \
          --output-json-file job_ranges.json

        # Publish the JSON with the multi-line output syntax so that the
        # output stays intact even if the file contains newlines.
        delimiter="RANGES_EOF_${RANDOM}${RANDOM}"
        {
          echo "ranges<<${delimiter}"
          cat job_ranges.json
          # Guarantees the delimiter starts on its own line when the file has
          # no trailing newline; extra whitespace is harmless to JSON parsing.
          echo
          echo "${delimiter}"
        } >> "$GITHUB_OUTPUT"

        # Pretty-print for the log only; never fail the step because of it.
        jq . job_ranges.json || true

    - name: Cache backup storage config so the replay jobs don't need to checkout entire repo
      uses: actions/cache/save@v4
      with:
        path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
        key: backup-config-${{ github.run_id }}

# Replay job: fans out over a matrix and only starts once `prepare` has finished.
replay-verify:
needs: prepare
# Falls back to 720 minutes when the caller does not supply TIMEOUT_MINUTES.
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
# One failing matrix entry does not cancel the others.
fail-fast: false
# At most 16 matrix jobs run at the same time.
max-parallel: 16
matrix:
# NOTE(review): this text comes from a diff view in which removed and added
# lines are interleaved; `number` (and the two steps below) look like
# removed-side lines superseded by `range` - confirm against the real file.
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
# One matrix entry per element of the JSON published by the `prepare` job.
range: ${{ fromJson(needs.prepare.outputs.ranges) }}
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
# NOTE(review): the `prepare` job caches the backup config specifically so
# that replay jobs can avoid a checkout; this step is presumably obsolete.
- uses: actions/checkout@v4
# Splits one matrix entry into the step outputs `name`, `begin`, `end` and
# `desc`, which later steps read via `steps.parse-job.outputs.*`.
- name: Parse job - ${{ matrix.range }}
  id: parse-job
  shell: bash
  env:
    # Handed over through the environment instead of being interpolated into
    # the script, so quotes or `$` in the value cannot alter the shell code.
    JOB_RANGE: ${{ matrix.range }}
  run: |
    # The entry is read as whitespace-separated fields: name, begin, end,
    # and everything that remains as the description.
    read -r name begin end desc <<< "$JOB_RANGE"
    {
      echo "name=$name"
      echo "begin=$begin"
      echo "end=$end"
      echo "desc=$desc"
    } >> "$GITHUB_OUTPUT"
# Restores the binary under the same key the `prepare` job uses for it; the
# job fails outright when the entry is missing, since nothing here builds it.
- name: Load cached aptos-debugger binary
  id: cache-aptos-debugger-binary
  uses: actions/cache/restore@v4
  with:
    # `ref` was dropped: it is a checkout input, not a cache/restore input.
    path: target/release/aptos-debugger
    key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
    fail-on-cache-miss: true

# Read-only restore of the backup metadata cache directory.
# NOTE(review): the key has no timestamp suffix, unlike the key saved by the
# `prepare` job, so this presumably relies on prefix matching - confirm.
- name: Load cached backup storage metadata cache dir
  uses: actions/cache/restore@v4
  with:
    fail-on-cache-miss: true
    key: "metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-"
    path: metadata_cache

# NOTE(review): the binary is restored from cache with fail-on-cache-miss, so
# this Rust toolchain setup is presumably a leftover from the old flow - confirm.
- uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
# Restores the backup config file saved by the `prepare` job for this run.
- name: Load cached backup storage config
  uses: actions/cache/restore@v4
  with:
    # GIT_CREDENTIALS was dropped: cache/restore has no such input, and the
    # secret should not be handed to an action that does not need it.
    path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
    key: backup-config-${{ github.run_id }}
    fail-on-cache-miss: true

# Installs the gcloud CLI together with the two listed components.
- name: Install GCloud SDK
  uses: google-github-actions/setup-gcloud@v2
  with:
    install_components: 'kubectl,gke-gcloud-auth-plugin'
    version: '>= 418.0.0'

# Compiles the aptos-debugger package with the release profile.
- name: Build CLI binaries in release mode
  run: |-
    cargo build --release -p aptos-debugger
  shell: bash
# Phase 1: invokes replay-verify with start and end both set to the range's
# first version (per the step name, this restores the snapshot only).
# The command is attempted up to 7 times, sleeping 10s * attempt between tries.
- name: phase 1 - restore snapshot, with retries
  env:
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
  run: |
    for try in {0..6}
    do
      if [ "$try" -gt 0 ]; then
        SLEEP=$((10 * try))
        echo "sleeping for $SLEEP seconds before retry #$try" >&2
        sleep "$SLEEP"
      fi
      # $TXNS_TO_SKIP is left unquoted on purpose: it holds several
      # space-separated versions that must be passed as separate arguments.
      target/release/aptos-debugger aptos-db replay-verify \
        --metadata-cache-dir ./metadata_cache \
        --command-adapter-config "$BACKUP_CONFIG_TEMPLATE_PATH" \
        --txns-to-skip $TXNS_TO_SKIP \
        --start-version ${{ steps.parse-job.outputs.begin }} \
        --end-version ${{ steps.parse-job.outputs.begin }} \
        \
        --lazy-quit \
        --target-db-dir db \
        --concurrent-downloads 8 \
        --replay-concurrency-level 8 \
        \
        && exit 0 || true # exit 0 if successful, otherwise retry
    done
    # Every attempt failed. (`exit(1)` is not valid shell syntax.)
    exit 1
# Arguments: first the runner number, then the total number of runners.
- name: Run replay-verify in parallel
  run: |-
    testsuite/replay_verify.py ${{ matrix.number }} 19
  shell: bash
# Phase 2: replays and verifies the transactions from the range's `begin`
# version to its `end` version, as published by the parse-job step.
# The command is attempted up to 7 times, sleeping 10s * attempt between tries.
- name: phase 2 - replay-verify transactions, with retries
  env:
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    # Declared once only; duplicate mapping keys are invalid YAML.
    HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
  run: |
    for try in {0..6}
    do
      if [ "$try" -gt 0 ]; then
        SLEEP=$((10 * try))
        echo "sleeping for $SLEEP seconds before retry #$try" >&2
        sleep "$SLEEP"
      fi
      # $TXNS_TO_SKIP is left unquoted on purpose: it holds several
      # space-separated versions that must be passed as separate arguments.
      target/release/aptos-debugger aptos-db replay-verify \
        --metadata-cache-dir ./metadata_cache \
        --command-adapter-config "$BACKUP_CONFIG_TEMPLATE_PATH" \
        --txns-to-skip $TXNS_TO_SKIP \
        --start-version ${{ steps.parse-job.outputs.begin }} \
        --end-version ${{ steps.parse-job.outputs.end }} \
        \
        --lazy-quit \
        --target-db-dir db \
        --concurrent-downloads 8 \
        --replay-concurrency-level 8 \
        \
        && exit 0 || true # exit 0 if successful, otherwise retry
    done
    # Every attempt failed. (`exit(1)` is not valid shell syntax.)
    exit 1
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
Original file line number Diff line number Diff line change
Expand Up @@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

/// Borrows every state snapshot backup entry held by this view as a slice.
pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
    &self.state_snapshot_backups[..]
}

pub fn select_state_snapshot(
&self,
target_version: Version,
Expand Down
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
Expand Down
Loading

0 comments on commit 722b18b

Please sign in to comment.