[GHA] replay-verify replays at state snapshot versions
so that no work is wasted

1. Added a `gen-replay-verify-jobs` sub-command to the aptos-debugger to
   generate txn ranges that begin at state snapshots and are of the desired
   size in number of transactions. If there are too many txns between two
   adjacent snapshots, the range is truncated to the target size. This way
   we handle load-test (Tapos-like) situations automatically; see the
   sketch after this list.
2. Each job runs only one replay -- there are no longer "partitions".
   Instead, we issue many more jobs with concurrency control. This way
   jobs run in "waves" and load balancing is achieved automatically.
3. A single "prepare" job does the building and job generation, so the
   actual replay jobs don't need to build the binary, etc.
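
A minimal sketch of the job-generation idea (hypothetical and simplified;
the real implementation in storage/db-tool/src/gen_replay_verify_jobs.rs
below iterates from the latest snapshot backwards, tracks epochs, and emits
job names and descriptions):

    /// Turn state snapshot versions into replay ranges of roughly
    /// `target` transactions each.
    fn gen_jobs(mut snapshots: Vec<u64>, global_end: u64, target: u64) -> Vec<(u64, u64)> {
        snapshots.push(global_end);
        let mut jobs = Vec::new();
        let mut i = 0;
        while i + 1 < snapshots.len() {
            let begin = snapshots[i];
            let mut end = snapshots[i + 1];
            if end - begin >= target {
                // Huge gap between adjacent snapshots (e.g. a load test):
                // truncate, leaving the rest of the gap unreplayed.
                end = begin + target;
            } else {
                // Merge small adjacent ranges until the job is big enough.
                while i + 2 < snapshots.len() && snapshots[i + 2] - begin <= target {
                    i += 1;
                    end = snapshots[i + 1];
                }
            }
            jobs.push((begin, end));
            i += 1;
        }
        jobs
    }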

TODO: pre-install the gcloud CLI on the runners so that we don't spend
30-40 seconds in each job installing it.
msmouse committed Sep 13, 2024
1 parent da106a1 commit e1a4d0e
Showing 7 changed files with 330 additions and 19 deletions.
13 changes: 8 additions & 5 deletions .github/workflows/replay-verify.yaml

Large diffs are not rendered by default.

173 changes: 159 additions & 14 deletions .github/workflows/workflow-run-replay-verify.yaml
@@ -77,40 +77,185 @@ on:
default: "high-perf-docker-with-local-ssd"

jobs:
prepare:
runs-on: ${{ inputs.RUNS_ON }}
outputs:
ranges: ${{ steps.gen-jobs.outputs.ranges }}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.GIT_SHA }}

- name: Load cached aptos-debugger binary
id: cache-aptos-debugger-binary
uses: actions/cache@v4
with:
# copy the binary to the root of the repo and cache it there, because rust-setup calls a cache-rust action
# which cleans up the target directory in its post action
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}

- name: Prepare for build if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
uses: aptos-labs/aptos-core/.github/actions/rust-setup@main
with:
GIT_CREDENTIALS: ${{ inputs.GIT_CREDENTIALS }}

- name: Build and strip aptos-debugger binary if not cached
if: steps.cache-aptos-debugger-binary.outputs.cache-hit != 'true'
shell: bash
run: |
cargo build --release -p aptos-debugger
strip -s target/release/aptos-debugger
cp target/release/aptos-debugger .
- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: get timestamp to use in cache key
id: get-timestamp
run: echo "ts=$(date +%s)" >> $GITHUB_OUTPUT

- name: Load cached backup storage metadata cache dir (and save back afterwards)
uses: actions/cache@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-${{ steps.get-timestamp.outputs.ts }}
restore-keys: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-

- name: Generate job ranges
id: gen-jobs
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
./aptos-debugger aptos-db gen-replay-verify-jobs \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--output-json-file job_ranges.json \
--start-version $HISTORY_START
echo "ranges=$(cat job_ranges.json)" >> $GITHUB_OUTPUT
cat job_ranges.json | jq || true
- name: Cache backup storage config so the replay jobs don't need to check out the entire repo
uses: actions/cache/save@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}

replay-verify:
needs: prepare
timeout-minutes: ${{ inputs.TIMEOUT_MINUTES || 720 }}
runs-on: ${{ inputs.RUNS_ON }}
strategy:
fail-fast: false
max-parallel: 16
matrix:
range: ${{ fromJson(needs.prepare.outputs.ranges) }}
steps:
- name: Parse job - ${{ matrix.range }}
id: parse-job
shell: bash
run: |
# `desc` soaks up the rest of the line, since the description contains spaces
read name begin end desc <<< "${{ matrix.range }}"
echo name=$name >> $GITHUB_OUTPUT
echo begin=$begin >> $GITHUB_OUTPUT
echo end=$end >> $GITHUB_OUTPUT
echo desc=$desc >> $GITHUB_OUTPUT
- name: Load cached aptos-debugger binary
uses: actions/cache/restore@v4
with:
path: aptos-debugger
key: aptos-debugger-${{ inputs.GIT_SHA || github.sha }}
fail-on-cache-miss: true

- name: Load cached backup storage metadata cache dir
uses: actions/cache/restore@v4
with:
path: metadata_cache
key: metadata-cache-${{ inputs.BUCKET }}/${{ inputs.SUB_DIR }}-
fail-on-cache-miss: true

- name: Load cached backup storage config
uses: actions/cache/restore@v4
with:
path: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
key: backup-config-${{ github.run_id }}
fail-on-cache-miss: true

- name: Install GCloud SDK
uses: "google-github-actions/setup-gcloud@v2"
with:
version: ">= 418.0.0"
install_components: "kubectl,gke-gcloud-auth-plugin"

- name: phase 1 - restore snapshot, with retries
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.begin }} \
\
--lazy-quit \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
- name: phase 2 - replay-verify transactions, with retries
env:
BUCKET: ${{ inputs.BUCKET }}
SUB_DIR: ${{ inputs.SUB_DIR }}
HISTORY_START: ${{ inputs.HISTORY_START || '0' }}
TXNS_TO_SKIP: ${{ inputs.TXNS_TO_SKIP }}
BACKUP_CONFIG_TEMPLATE_PATH: ${{ inputs.BACKUP_CONFIG_TEMPLATE_PATH }}
run: |
for try in {0..3}
do
if [ $try -gt 0 ]; then
SLEEP=$((10 * $try))
echo "sleeping for $SLEEP seconds before retry #$try" >&2
sleep $SLEEP
fi
./aptos-debugger aptos-db replay-verify \
--metadata-cache-dir ./metadata_cache \
--command-adapter-config $BACKUP_CONFIG_TEMPLATE_PATH \
--txns-to-skip $TXNS_TO_SKIP \
--start-version ${{ steps.parse-job.outputs.begin }} \
--end-version ${{ steps.parse-job.outputs.end }} \
\
--lazy-quit \
--target-db-dir db \
--concurrent-downloads 8 \
--replay-concurrency-level 8 \
\
&& exit 0 || true # exit 0 if successful, otherwise retry
done
exit 1
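
For illustration, the `ranges` output that the matrix above feeds through
`fromJson` is a JSON array of space-separated "name begin end desc" strings
written by gen-replay-verify-jobs. A hypothetical job_ranges.json (all
numbers made up):

[
  "1 987000000 1002000000 Replay epoch 7100 - 7103, 15000000 txns starting from version 987000000.",
  "2-Partial 950000000 970000000 Partial replay epoch 7080 - 7099, 20000000 txns starting from version 950000000, another 17000000 versions omitted, until 987000000."
]
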
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default.

4 changes: 4 additions & 0 deletions storage/backup/backup-cli/src/metadata/view.rs
@@ -105,6 +105,10 @@ impl MetadataView {
self.compaction_timestamps.clone()
}

pub fn all_state_snapshots(&self) -> &[StateSnapshotBackupMeta] {
&self.state_snapshot_backups
}

pub fn select_state_snapshot(
&self,
target_version: Version,
1 change: 1 addition & 0 deletions storage/db-tool/Cargo.toml
@@ -26,6 +26,7 @@ aptos-vm = { workspace = true }
bcs = { workspace = true }
clap = { workspace = true }
itertools = { workspace = true }
serde_json = { workspace = true }
tokio = { workspace = true }

[dev-dependencies]
153 changes: 153 additions & 0 deletions storage/db-tool/src/gen_replay_verify_jobs.rs
@@ -0,0 +1,153 @@
// Copyright (c) Aptos Foundation
// SPDX-License-Identifier: Apache-2.0

use aptos_backup_cli::{
metadata::{
cache::{sync_and_load, MetadataCacheOpt},
StateSnapshotBackupMeta,
},
storage::DBToolStorageOpt,
utils::ConcurrentDownloadsOpt,
};
use aptos_logger::warn;
use aptos_types::transaction::Version;
use clap::Parser;
use itertools::Itertools;
use std::{io::Write, iter::once, path::PathBuf};

#[derive(Parser)]
pub struct Opt {
#[clap(flatten)]
metadata_cache_opt: MetadataCacheOpt,
#[clap(flatten)]
storage: DBToolStorageOpt,
#[clap(flatten)]
concurrent_downloads: ConcurrentDownloadsOpt,
#[clap(
long,
help = "The first transaction version required to be replayed and verified. [Defaults to 0]"
)]
start_version: Option<Version>,
#[clap(
long,
help = "Target number of transactions for each job to replay",
default_value = "20000000"
)]
target_job_size: u64,
#[clap(
long,
help = "Determines the oldest epoch to replay, relative to the latest",
default_value = "4000"
)]
max_epochs: u64,
#[clap(long, help = "Output job ranges")]
output_json_file: PathBuf,
}

impl Opt {
pub async fn run(self) -> anyhow::Result<()> {
let storage = self.storage.init_storage().await?;
let metadata_view = sync_and_load(
&self.metadata_cache_opt,
storage,
self.concurrent_downloads.get(),
)
.await?;

let storage_state = metadata_view.get_storage_state()?;
let global_end_version = storage_state
.latest_transaction_version
.expect("No transaction backups.")
+ 1;
let latest_epoch = storage_state
.latest_state_snapshot_epoch
.expect("No state snapshots.");
let max_epochs = self.max_epochs.min(latest_epoch + 1);
let global_min_epoch = latest_epoch + 1 - max_epochs;

let fake_end = StateSnapshotBackupMeta {
epoch: latest_epoch,
version: global_end_version,
manifest: "".to_string(),
};
let mut job_idx = 0;
let job_ranges = metadata_view
.all_state_snapshots()
.iter()
.filter(|s| s.epoch >= global_min_epoch && s.version <= global_end_version)
.chain(once(&fake_end))
.collect_vec()
.iter()
.rev()
.tuple_windows()
// to simplify things, if start_version appears in the middle of a range, give up the range
.take_while(|(_end, begin)| begin.version >= self.start_version.unwrap_or(0))
.peekable()
.batching(|it| {
job_idx += 1;
match it.next() {
Some((end, mut begin)) => {
if end.version - begin.version >= self.target_job_size {
// cut big range short, this hopefully automatically skips load tests
let msg = if end.epoch - begin.epoch > 15 {
"!!! Need more snapshots !!!"
} else {
""
};
warn!(
begin = begin,
end = end,
"Big gap between snapshots. {} versions in {} epochs. {}",
end.version - begin.version,
end.epoch - begin.epoch,
msg,
);
Some((
format!("{job_idx}-Partial"),
begin.version,
begin.version + self.target_job_size,
format!(
"Partial replay epoch {} - {}, {} txns starting from version {}, another {} versions omitted, until {}. {}",
begin.epoch,
end.epoch - 1,
self.target_job_size,
begin.version,
end.version - begin.version - self.target_job_size,
end.version,
msg
)
))
} else {
while let Some((_prev_end, prev_begin)) = it.peek() {
if end.version - prev_begin.version > self.target_job_size {
break;
}
begin = prev_begin;
let _ = it.next();
}
Some((
format!("{job_idx}"),
begin.version,
end.version,
format!(
"Replay epoch {} - {}, {} txns starting from version {}.",
begin.epoch,
end.epoch - 1,
end.version - begin.version,
begin.version,
)
))
}
},
None => None,
}
})
.map(|(name, begin, end, desc)| format!("{name} {begin} {end} {desc}"))
.collect_vec();

std::fs::File::create(&self.output_json_file)?
.write_all(&serde_json::to_vec(&job_ranges)?)?;

Ok(())
}
}
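
For reference, a local invocation of the new sub-command might look like
this (config path and flag values are illustrative; the flags map to the
fields of `Opt` above):

./aptos-debugger aptos-db gen-replay-verify-jobs \
    --metadata-cache-dir ./metadata_cache \
    --command-adapter-config ./backup_config.yaml \
    --output-json-file job_ranges.json \
    --target-job-size 20000000 \
    --max-epochs 4000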
