Skip to content

Commit

Permalink
[GHA] replay-verify replays at state snapshot versions
Browse files Browse the repository at this point in the history
so that no work is wasted

1. Added the `gen-replay-verify-jobs` sub-command to the aptos-debugger, to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This
   way we deal with load tests Tapos kind of situation automatically.
2. Each job runs only one replay -- there's no longer "partitions".
   Instead, we issue a lot more jobs with concurrency control. This way
   jobs run in "waves" and "load balancing" is automatically achieved.
3. A single "prepare" job does the building and jobs generation, and the
   actual replay jobs don't need to build the binary, etc.
  • Loading branch information
msmouse committed Sep 17, 2024
1 parent da106a1 commit 5a1a907
Show file tree
Hide file tree
Showing 12 changed files with 546 additions and 42 deletions.
1 change: 1 addition & 0 deletions .github/actions/rust-setup/action.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ runs:
# rust-cache action will cache ~/.cargo and ./target
# https://github.com/Swatinem/rust-cache#cache-details
# Restore/save the cargo cache, except on pull-request refs.
# The condition is wrapped in ${{ }} because a plain YAML scalar that starts
# with `!` is parsed as a tag, not as the start of an expression.
- name: Run cargo cache
  if: ${{ !startsWith(github.ref, 'refs/pull/') }}
  uses: Swatinem/rust-cache@359a70e43a0bb8a13953b04a90f76428b4959bb6 # [email protected]
  with:
    key: ${{ inputs.ADDITIONAL_KEY }}
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/module-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ jobs:
SUB_DIR: e1
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/s3-public.yaml
# workflow config
RUNS_ON: high-perf-docker-with-local-ssd
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=900,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 20

verify-modules-mainnet:
Expand All @@ -52,7 +52,7 @@ jobs:
SUB_DIR: e1
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/s3-public.yaml
# workflow config
RUNS_ON: high-perf-docker-with-local-ssd
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=900,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 20

test-verify-modules:
Expand All @@ -65,5 +65,5 @@ jobs:
SUB_DIR: e1
BACKUP_CONFIG_TEMPLATE_PATH: terraform/helm/fullnode/files/backup/s3-public.yaml
# workflow config
RUNS_ON: "high-perf-docker-with-local-ssd"
RUNS_ON: "runs-on,cpu=16,family=m6id,hdd=900,image=aptos-ubuntu-x64,spot=false,run-id=${{ github.run_id }}"
TIMEOUT_MINUTES: 20
35 changes: 24 additions & 11 deletions .github/workflows/replay-verify.yaml

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion .github/workflows/workflow-run-module-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ on:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
default: "runs-on,cpu=16,family=m6id,hdd=500,image=aptos-ubuntu-x64,spot=false"
TIMEOUT_MINUTES:
description: "Github job timeout in minutes"
type: number
Expand Down
152 changes: 152 additions & 0 deletions .github/workflows/workflow-run-replay-verify-batch.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
name: "*run replay-verify batch"

# Reusable workflow: the only trigger is a call from another workflow.
on:
  workflow_call:
    inputs:
      # ---- revision under test ----
      GIT_SHA:
        description: The git SHA1 to test.
        type: string
        required: true

      # ---- replay-verify config ----
      BUCKET:
        description: The bucket to use for the backup. If not specified, it will use the default bucket.
        type: string
        required: true
      SUB_DIR:
        description: The subdirectory to use for the backup. If not specified, it will use the default subdirectory.
        type: string
        required: true
      TXNS_TO_SKIP:
        description: The list of transaction versions to skip. If not specified, it will use the default list.
        type: string
        required: false
      BACKUP_CONFIG_TEMPLATE_PATH:
        description: "The path to the backup config template to use."
        type: string
        required: true

      # ---- GHA job config ----
      # NOTE(review): the next two inputs are marked required yet also carry
      # a default; confirm which of the two is intended.
      RUNS_ON:
        description: "The runner to use for the job."
        type: string
        required: true
        default: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"
      TIMEOUT_MINUTES:
        description: "Github job timeout in minutes"
        type: number
        required: true
        default: 180

      # JSON text that is fed to fromJson() to build the job matrix.
      RANGES_JSON:
        description: "The ranges to use for the job."
        type: string
        required: true

jobs:
  # One matrix job per entry of RANGES_JSON. Each job restores the cached
  # aptos-debugger binary and backup metadata, then replays a single
  # transaction range in two phases (snapshot restore, then replay-verify).
  replay-verify:
    timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 180 }}
    runs-on: ${{ inputs.RUNS_ON }}
    strategy:
      fail-fast: false
      max-parallel: 100
      matrix:
        range: ${{ fromJson(inputs.RANGES_JSON) }}
    steps:
      # Split the whitespace-separated matrix entry into step outputs:
      # name, begin, end, and desc (desc receives the remainder of the line).
      - name: Parse job - ${{ matrix.range }}
        id: parse-job
        shell: bash
        env:
          # Passed through the environment rather than interpolated into the
          # script, so the value is never evaluated as shell code.
          RANGE: ${{ matrix.range }}
        run: |
          read -r name begin end desc <<< "$RANGE"
          {
            echo "name=$name"
            echo "begin=$begin"
            echo "end=$end"
            echo "desc=$desc"
          } >> "$GITHUB_OUTPUT"

      - name: Load cached aptos-debugger binary
        uses: actions/cache/restore@v4
        with:
          path: aptos-debugger
          key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
          fail-on-cache-miss: true

      # NOTE(review): this key ends with `-`, whereas the job that saves the
      # metadata cache appends a timestamp; confirm the restore is expected
      # to match by prefix.
      - name: Load cached backup storage metadata cache dir
        uses: actions/cache/restore@v4
        with:
          path: metadata_cache
          key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
          fail-on-cache-miss: true

      - name: Load cached backup storage config
        uses: actions/cache/restore@v4
        with:
          path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
          key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}
          fail-on-cache-miss: true

      - id: auth
        uses: "google-github-actions/auth@v2"
        with:
          workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
          service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}

      - name: Install GCloud SDK
        uses: "google-github-actions/setup-gcloud@v2"
        with:
          version: ">= 418.0.0"
          install_components: "kubectl,gke-gcloud-auth-plugin"

      # Phase 1 passes the same version as start and end; per the step name
      # this only restores the snapshot at the beginning of the range.
      - name: phase 1 - restore snapshot, with retries
        env:
          BUCKET: ${{ inputs.BUCKET }}
          SUB_DIR: ${{ inputs.SUB_DIR }}
        run: |
          for try in {0..3}
          do
            if [ $try -gt 0 ]; then
              SLEEP=$((10 * $try))
              echo "sleeping for $SLEEP seconds before retry #$try" >&2
              sleep $SLEEP
            fi

            rc=0
            ./aptos-debugger aptos-db replay-verify \
              --metadata-cache-dir ./metadata_cache \
              --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
              --start-version ${{ steps.parse-job.outputs.begin }} \
              --end-version ${{ steps.parse-job.outputs.begin }} \
              --lazy-quit \
              --enable-storage-sharding \
              --target-db-dir db \
              --concurrent-downloads 8 \
              --replay-concurrency-level 8 \
              || rc=$?

            # exit on return code 0 or 2, otherwise retry
            if [[ $rc -eq 0 || $rc -eq 2 ]]; then
              exit "$rc"
            fi
          done
          # every attempt ended with a retryable return code
          exit 1

      # Phase 2 replays the full [begin, end] range on top of the restored db.
      - name: phase 2 - replay-verify transactions, with retries
        env:
          BUCKET: ${{ inputs.BUCKET }}
          SUB_DIR: ${{ inputs.SUB_DIR }}
        run: |
          for try in {0..3}
          do
            if [ $try -gt 0 ]; then
              SLEEP=$((10 * $try))
              echo "sleeping for $SLEEP seconds before retry #$try" >&2
              sleep $SLEEP
            fi

            rc=0
            ./aptos-debugger aptos-db replay-verify \
              --metadata-cache-dir ./metadata_cache \
              --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
              --txns-to-skip "${{ inputs.TXNS_TO_SKIP || '0' }}" \
              --start-version ${{ steps.parse-job.outputs.begin }} \
              --end-version ${{ steps.parse-job.outputs.end }} \
              --lazy-quit \
              --enable-storage-sharding \
              --target-db-dir db \
              --concurrent-downloads 8 \
              --replay-concurrency-level 8 \
              || rc=$?

            # exit on return code 0 or 2, otherwise retry
            if [[ $rc -eq 0 || $rc -eq 2 ]]; then
              exit "$rc"
            fi
          done
          # every attempt ended with a retryable return code
          exit 1
154 changes: 130 additions & 24 deletions .github/workflows/workflow-run-replay-verify.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip..
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -34,12 +38,12 @@ on:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
default: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"
TIMEOUT_MINUTES:
description: "Github job timeout in minutes"
type: number
required: true
default: 720
default: 180
# This allows the workflow to be triggered manually from the Github UI or CLI
# NOTE: because the "number" type is not supported, we default to 180 minute timeout
workflow_dispatch:
Expand All @@ -65,6 +69,10 @@ on:
required: false
type: string
description: The list of transaction versions to skip. If not specified, it will use the default list.
RANGES_TO_SKIP:
required: false
type: string
description: The optional list of transaction ranges to skip..
BACKUP_CONFIG_TEMPLATE_PATH:
description: "The path to the backup config template to use."
type: string
Expand All @@ -74,43 +82,141 @@ on:
description: "The runner to use for the job."
type: string
required: true
default: "high-perf-docker-with-local-ssd"
default: "runs-on,cpu=8,family=m6id,hdd=100,image=aptos-ubuntu-x64,spot=false"

jobs:
replay-verify:
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
matrix:
number: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18] # runner number
prepare:
runs-on: "runs-on,cpu=64,family=c7,hdd=500,image=aptos-ubuntu-x64,spot=false"
outputs:
ranges0: ${{ steps.gen-jobs.outputs.ranges0 }}
ranges1: ${{ steps.gen-jobs.outputs.ranges1 }}
ranges2: ${{ steps.gen-jobs.outputs.ranges2 }}
ranges3: ${{ steps.gen-jobs.outputs.ranges3 }}
steps:
- name: Echo Runner Number
run: echo "Runner is ${{ matrix.number }}"
- uses: actions/checkout@v4
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

# Runs the shared rust-setup action, but only when the aptos-debugger binary
# was not restored from cache by the preceding cache step.
# NOTE(review): GIT_CREDENTIALS is read from `inputs` here, while the previous
# rust-setup invocation in this workflow passed `secrets.GIT_CREDENTIALS`;
# confirm that an input of this name is actually declared.
- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- id: auth
uses: "google-github-actions/auth@v2"
with:
GIT_CREDENTIALS: ${{ secrets.GIT_CREDENTIALS }}
workload_identity_provider: ${{ secrets.GCP_WORKLOAD_IDENTITY_PROVIDER }}
service_account: ${{ secrets.GCP_SERVICE_ACCOUNT_EMAIL }}

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: Build CLI binaries in release mode
shell: bash
run: cargo build --release -p aptos-debugger
- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Run replay-verify in parallel
shell: bash
run: testsuite/replay_verify.py ${{ matrix.number }} 19 # first argument is the runner number, second argument is the total number of runners
- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

# Generates the replay job ranges into four JSON files and republishes each
# file's content as a step output (ranges0..ranges3).
- name: Generate job ranges
  id: gen-jobs
  env:
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    HISTORY_START: ${{ inputs.HISTORY_START }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
  run: |
    # The last argument must NOT end with a backslash, otherwise the first
    # `echo` below is swallowed into this command line.
    ./aptos-debugger aptos-db gen-replay-verify-jobs \
      --metadata-cache-dir ./metadata_cache \
      --command-adapter-config ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }} \
      --start-version ${{ inputs.HISTORY_START }} \
      --ranges-to-skip "${{ inputs.RANGES_TO_SKIP }}" \
      --output-json-files job_ranges.0.json \
      --output-json-files job_ranges.1.json \
      --output-json-files job_ranges.2.json \
      --output-json-files job_ranges.3.json

    echo "ranges0=$(cat job_ranges.0.json)" >> $GITHUB_OUTPUT
    echo "ranges1=$(cat job_ranges.1.json)" >> $GITHUB_OUTPUT
    echo "ranges2=$(cat job_ranges.2.json)" >> $GITHUB_OUTPUT
    echo "ranges3=$(cat job_ranges.3.json)" >> $GITHUB_OUTPUT
- name: Cache backup storage config so the replay jobs don't need to checkout entire repo
uses: actions/cache/save@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ github.run_id }}

# Four identical calls into the batch workflow, one per range list produced
# by the `prepare` job. TXNS_TO_SKIP is forwarded so that the skip list given
# to this workflow reaches the replay step of the batch workflow.
replay-verify-batch0:
  needs: prepare
  uses: ./.github/workflows/workflow-run-replay-verify-batch.yaml
  secrets: inherit
  with:
    RANGES_JSON: ${{ needs.prepare.outputs.ranges0 }}
    GIT_SHA: ${{ inputs.GIT_SHA }}
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
    RUNS_ON: ${{ inputs.RUNS_ON }}
    TIMEOUT_MINUTES: ${{ inputs.TIMEOUT_MINUTES }}

replay-verify-batch1:
  needs: prepare
  uses: ./.github/workflows/workflow-run-replay-verify-batch.yaml
  secrets: inherit
  with:
    RANGES_JSON: ${{ needs.prepare.outputs.ranges1 }}
    GIT_SHA: ${{ inputs.GIT_SHA }}
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
    RUNS_ON: ${{ inputs.RUNS_ON }}
    TIMEOUT_MINUTES: ${{ inputs.TIMEOUT_MINUTES }}

replay-verify-batch2:
  needs: prepare
  uses: ./.github/workflows/workflow-run-replay-verify-batch.yaml
  secrets: inherit
  with:
    RANGES_JSON: ${{ needs.prepare.outputs.ranges2 }}
    GIT_SHA: ${{ inputs.GIT_SHA }}
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
    RUNS_ON: ${{ inputs.RUNS_ON }}
    TIMEOUT_MINUTES: ${{ inputs.TIMEOUT_MINUTES }}

replay-verify-batch3:
  needs: prepare
  uses: ./.github/workflows/workflow-run-replay-verify-batch.yaml
  secrets: inherit
  with:
    RANGES_JSON: ${{ needs.prepare.outputs.ranges3 }}
    GIT_SHA: ${{ inputs.GIT_SHA }}
    BUCKET: ${{ inputs.BUCKET }}
    SUB_DIR: ${{ inputs.SUB_DIR }}
    TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
    BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
    RUNS_ON: ${{ inputs.RUNS_ON }}
    TIMEOUT_MINUTES: ${{ inputs.TIMEOUT_MINUTES }}
Loading

0 comments on commit 5a1a907

Please sign in to comment.