From 4cc7fe14ea46c0f4e9f1b8cecb7a1c97faf5e544 Mon Sep 17 00:00:00 2001 From: Igor Date: Tue, 17 Dec 2024 13:46:18 -0800 Subject: [PATCH] Add documentation to forge-stable test suite --- .github/workflows/forge-stable.yaml | 64 +++++++++++++++++++ .../src/suites/realistic_environment.rs | 32 +++++----- testsuite/forge-cli/src/suites/ungrouped.rs | 16 +++-- 3 files changed, 91 insertions(+), 21 deletions(-) diff --git a/.github/workflows/forge-stable.yaml b/.github/workflows/forge-stable.yaml index aea8a24b6c8f0..6dc2509d8563a 100644 --- a/.github/workflows/forge-stable.yaml +++ b/.github/workflows/forge-stable.yaml @@ -1,6 +1,44 @@ # Continuously run stable forge tests against the latest main branch. name: Continuous Forge Tests - Stable +# We have various Forge Stable tests here, that test out different situations and workloads. +# +# Dashboard showing historical results: https://grafana.aptoslabs.com/d/bdnt45ggsg000f/forge-stable-performance?orgId=1 + +# Tests are named based on how they are set up, some of the common flavors are: +# * "realistic-env" - tests with "realistic-env" in their name try to have network and hardware environment +# be more realistic. They use "wrap_with_realistic_env", which sets: +# * MultiRegionNetworkEmulationTest which splits nodes into 4 "regions", which have different +# x-region and in-region latencies and reliability rates +# * CpuChaosTest which tries to make nodes have heterogeneous hardware, by loading a few cores fully +# on a few nodes. But this is not too helpful, as block execution time variance is minimal +# (as we generally have a few idle cores, and real variance mostly comes from variance in cpu speed instead) +# * sweep - means running multiple tests within a single test, by having everything the same, except for one +# thing - i.e. the thing we sweep over. There are two main dimensions we "sweep" over: +# * load sweep - this generally uses const tps workload, and varies the load across the tests (i.e. 
10 vs 100 vs 1000 TPS) +# * workload sweep - this varies the transaction type being submitted, trying to test out how the system behaves +# when different parts of the system are stressed (i.e. low vs high output sizes, good vs bad gas calibration, parallel vs sequential, etc) +# * graceful - tests where we are overloading the system - i.e. submitting more transactions than we expect system to handle, +# and seeing how it behaves. Overall e2e latency is then high, but we can test that only validator -> block proposal has increased. +# Additionally, we generally add a small TPS high-fee traffic in these tests, to confirm it is unaffected by the high load. +# * changing-working-quorum - tests where we intentionally make nodes unreachable (cut their network), and bring them back, +# and go to cut network on next set of nodes - requiring state-sync to catch up, consensus to work with different set of +# nodes being required to form consensus. During each iteration, we test that enough progress was made. +# +# Main success criteria used across the tests are: +# * throughput and expiration/rejection rate +# * latency (avg / p50 / p90 / p99) +# * latency breakdown across the components - currently within a validator alone: +# batch->pos->proposal->ordered->committed +# TODO: we should add other stages - before batch and after committed to success criteria, to be able to track them better +# * chain progress - checking longest pause between the blocks (both in num failed runs, and in absolute time) across the run +# * catchup - that all nodes can go over the same version at the end of the load (i.e. catching if any individual validator got stuck) +# * no restarts - fails if any node has restarted within the test +# * system metrics - checks for CPU and RAM utilization during the test +# TODO: we should add network bandwidth and disk iops as system metrics checks as well +# +# You can find more details about the individual tests below. 
+ permissions: issues: write pull-requests: write @@ -28,20 +66,46 @@ on: options: - all - framework-upgrade-test + # Test varies the load, i.e. sending 10, 100, 1000, 5000, etc TPS, and then measuring + # onchain TPS, expired rate, as well as p50/p90/p99 latency, among other things, + # testing that we don't degrade performance for low, mid and high loads. - realistic-env-load-sweep + # Test varies the workload, across some basic workloads (i.e. some cheap, some expensive), + # and checks throughput and performance across different stages - realistic-env-workload-sweep + # Test sends ConstTps workload above what the system can handle, while additionally sending + # non-small high-fee traffic (1000 TPS), and measures overall system performance. - realistic-env-graceful-overload + # Test varies the workload (opposite ends of gas calibration, high and low output sizes, + # sequential / parallel, etc), and sends ConstTPS for each above what the system can handle, + # while sending low TPS of high fee transactions. And primarily confirms that high-fee traffic + # has predictably low latency, and execution pipeline doesn't get backed up. - realistic-env-graceful-workload-sweep + # Test varies workload, which is user-contracts, such that max throughput varies from high to mid to low, + # while testing that unrelated transactions paying the same gas price, are able to go through. - realistic-env-fairness-workload-sweep + # Test which tunes all configurations for largest throughput possible (potentially sacrificing latency a bit) - realistic-network-tuned-for-throughput + # Run small-ish load, but checks that at all times all nodes are making progress, + # catching any unexpected unreliabilities/delays in consensus - consensus-stress-test + # Send a mix of different workloads, to catch issues with different interactions of workloads. 
+ # This is MUCH MUCH less comprehensive than replay-verify, but the best we can do with txn emitter at the moment - workload-mix-test - single-vfn-perf - fullnode-reboot-stress-test - compat + # Send low TPS (100 TPS) + # Cut network on enough nodes, such that all others are needed for consensus. Then bring a few back, and cut + # same amount of new ones - requiring all that were brought back to state-sync and continue executing. + # Check that in each iteration - we were able to make meaningful progress. - changing-working-quorum-test + # Same as above run-forge-changing-working-quorum-test, just sending a bit higher load - 500TPS + # TODO - we should probably increase load here significantly - changing-working-quorum-test-high-load - pfn-const-tps-realistic-env + # Run a production config (same as land blocking run) max load (via mempool backlog), but run it for 2 hours, + # to check reliability and consistency of the network. - realistic-env-max-load-long JOB_PARALLELISM: required: false diff --git a/testsuite/forge-cli/src/suites/realistic_environment.rs b/testsuite/forge-cli/src/suites/realistic_environment.rs index cfb527e5dec85..9e59f4e67177d 100644 --- a/testsuite/forge-cli/src/suites/realistic_environment.rs +++ b/testsuite/forge-cli/src/suites/realistic_environment.rs @@ -178,11 +178,10 @@ pub(crate) fn realistic_env_fairness_workload_sweep() -> ForgeConfig { .with_transactions_per_account(1), ]), criteria: Vec::new(), - background_traffic: background_traffic_for_sweep_with_latency(&[ - (3.0, 8.0), - (3.0, 8.0), - (3.0, 4.0), - ]), + background_traffic: background_traffic_for_sweep_with_latency( + &[(3.0, 8.0), (3.0, 8.0), (3.0, 4.0)], + false, + ), }) } @@ -212,16 +211,19 @@ pub(crate) fn realistic_env_graceful_workload_sweep() -> ForgeConfig { .with_transactions_per_account(1), ]), criteria: Vec::new(), - background_traffic: background_traffic_for_sweep_with_latency(&[ - (4.0, 5.0), - (2.2, 3.0), - (3.5, 5.0), - (4.0, 6.0), - (2.5, 4.0), - (3.5, 5.0), 
- // TODO - p50 and p90 is set to high, until it is calibrated/understood. - (3.0, 10.0), - ]), + background_traffic: background_traffic_for_sweep_with_latency( + &[ + (4.0, 5.0), + (2.2, 3.0), + (3.5, 5.0), + (4.0, 6.0), + (2.5, 4.0), + (3.5, 5.0), + // TODO - p50 and p90 is set to high, until it is calibrated/understood. + (3.0, 10.0), + ], + true, + ), }) .with_emit_job( EmitJobRequest::default() diff --git a/testsuite/forge-cli/src/suites/ungrouped.rs b/testsuite/forge-cli/src/suites/ungrouped.rs index 789780062b33a..31a81f09c5dad 100644 --- a/testsuite/forge-cli/src/suites/ungrouped.rs +++ b/testsuite/forge-cli/src/suites/ungrouped.rs @@ -403,16 +403,19 @@ fn consensus_stress_test() -> ForgeConfig { ) } -fn background_emit_request() -> EmitJobRequest { - EmitJobRequest::default() +fn background_emit_request(high_gas_price: bool) -> EmitJobRequest { + let mut result = EmitJobRequest::default() .num_accounts_mode(NumAccountsMode::TransactionsPerAccount(1)) - .mode(EmitJobMode::ConstTps { tps: 10 }) - .gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE) + .mode(EmitJobMode::ConstTps { tps: 10 }); + if high_gas_price { + result = result.gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE); + } + result } pub fn background_traffic_for_sweep(num_cases: usize) -> Option { Some(BackgroundTraffic { - traffic: background_emit_request(), + traffic: background_emit_request(true), criteria: std::iter::repeat(9.5) .take(num_cases) .map(|min_tps| { @@ -426,9 +429,10 @@ pub fn background_traffic_for_sweep(num_cases: usize) -> Option Option { Some(BackgroundTraffic { - traffic: background_emit_request(), + traffic: background_emit_request(high_gas_price), criteria: criteria .iter() .map(|(p50, p90)| {