Add documentation to forge-stable test suite

aptos-labs · Jan 8, 2025 · 4cc7fe1 · 4cc7fe1
1 parent 8ff3aa1
commit 4cc7fe1
Show file tree

Hide file tree

Showing 3 changed files with 91 additions and 21 deletions.
diff --git a/.github/workflows/forge-stable.yaml b/.github/workflows/forge-stable.yaml
@@ -1,6 +1,44 @@
 # Continuously run stable forge tests against the latest main branch.
 name: Continuous Forge Tests - Stable
 
+# We have various Forge Stable tests here, that test out different situations and workloads.
+#
+# Dashboard showing historical results: https://grafana.aptoslabs.com/d/bdnt45ggsg000f/forge-stable-performance?orgId=1
+
+# Tests are named based on how they are set up, some of the common flavors are:
+# * "realistic-env" - tests with "realistic-env" in their name try to have network and hardware environment
+#   be more realistic. They use "wrap_with_realistic_env", which sets:
+#   * MultiRegionNetworkEmulationTest which splits nodes into 4 "regions", which have different
+#     x-region and in-region latencies and reliability rates
+#   * CpuChaosTest which tries to make nodes have heterogenous hardware, by loading a few cores fully 
+#     on a few nodes. But this is not too helpful, as block execution time variance is minimal 
+#     (as we generally have a few idle cores, and real variance mostly comes from variance in cpu speed instead)
+# * sweep - means running multiple tests within a single test, by having everything the same, except for one
+#   thing - i.e. the thing we sweep over. There are two main dimensions we "sweep" over:
+#   * load sweep - this generally uses const tps workload, and varies the load across the tests (i.e. 10 vs 100 vs 1000 TPS) 
+#   * workload sweep - this varies the transaction type being submitted, trying to test out how the system behaves
+#     when different parts of the system are stressed (i.e. low vs high output sizes, good vs bad gas calibration, parallel vs sequential, etc)
+# * graceful - tests where we are overloading the system - i.e. submitting more transactions than we expect system to handle,
+#   and seeing how it behaves. overall e2e latency is then high, but we can test that only validator -> block proposal has increased.
+#   additionally, we generally add a small TPS high-fee traffic in these tests, to confirm it is unaffected by the high load.
+# * changing-working-quorum - tests where we intentionally make nodes unreachable (cut their network), and bring them back,
+#   and go to cut network on next set of nodes - requiring state-sync to catch up, consensus to work with different set of 
+#   nodes being required to form consensus. During each iteration, we test that enough progress was made.
+#
+# Main success criteria used across the tests are:
+# * throughput and expiration/rejection rate 
+# * latency (avg / p50 / p90 / p99 )
+# * latency breakdown across the components - currently within a validator alone:
+#   batch->pos->proposal->ordered->committed
+#   TODO: we should add other stages - before batch and after committed to success criteria, to be able to track them better
+# * chain progress - checking longest pause between the blocks (both in num failed runs, and in absolute time) across the run
+# * catchup - that all nodes can go over the same version at the end of the load (i.e. catching if any individual validator got stuck)
+# * no restarts - fails if any node has restarted within the test
+# * system metrics - checks for CPU and RAM utilization during the test
+#   TODO: we should add network bandwidth and disk iops as system metrics checks as well
+#
+# You can find more details about the individual tests below.
+
 permissions:
   issues: write
   pull-requests: write
@@ -28,20 +66,46 @@ on:
         options:
           - all
           - framework-upgrade-test
+          # Test varies the load, i.e. sending 10, 100, 1000, 5000, etc TPS, and then measuring 
+          # onchain TPS, expired rate, as well as p50/p90/p99 latency, among other things,
+          # testing that we don't degrade performance for low, mid and high loads.
           - realistic-env-load-sweep
+          # Test varies the workload, across some basic workloads (i.e. some cheap, some expensive), 
+          # and checks throughput and performance across different stages
           - realistic-env-workload-sweep
+          # Test sends ConstTps workload above what the system can handle, while additionally sending
+          # non-small high-fee traffic (1000 TPS), and measures overall system performance.
           - realistic-env-graceful-overload
+          # Test varies the workload (opposite ends of gas calibration, high and low output sizes, 
+          # sequential / parallel, etc), and sends ConstTPS for each above what the system can handle, 
+          # while sending low TPS of high fee transactions. And primarily confirms that high-fee traffic 
+          # has predictably low latency, and execution pipeline doesn't get backed up.
           - realistic-env-graceful-workload-sweep
+          # Test varies workload, which is user-contracts, such that max throughput varies from high to mid to low, 
+          # while testing that unrelated transactions paying the same gas price, are able to go through.
           - realistic-env-fairness-workload-sweep
+          # Test which tunes all configurations for largest throughput possible (potentially sacrificing latency a bit)
           - realistic-network-tuned-for-throughput
+          # Run small-ish load, but checks that at all times all nodes are making progress, 
+          # catching any unexpected unreliabilities/delays in consensus
           - consensus-stress-test
+          # Send a mix of different workloads, to catch issues with different interactions of workloads.
+          # this is MUCH MUCH less comprehensive than replay-verify, but the best we can do with txn emitter at the moment
           - workload-mix-test
           - single-vfn-perf
           - fullnode-reboot-stress-test
           - compat
+          # Send low TPS (100 TPS)
+          # Cut network on enough nodes, such that all others are needed for consensus. Then bring a few back, and cut 
+          # same amount of new ones - requiring all that were brought back to state-sync and continue executing.
+          # Check that in each iteration - we were able to make meaningful progress.
           - changing-working-quorum-test
+          # Same as above run-forge-changing-working-quorum-test, just sending a bit higher load - 500TPS
+          # TODO - we should probably increase load here significantly
           - changing-working-quorum-test-high-load
           - pfn-const-tps-realistic-env
+          # Run a production config (same as land blocking run) max load (via mempool backlog), but run it for 2 hours,
+          # to check reliability and consistency of the network.
           - realistic-env-max-load-long
       JOB_PARALLELISM:
         required: false

diff --git a/testsuite/forge-cli/src/suites/realistic_environment.rs b/testsuite/forge-cli/src/suites/realistic_environment.rs
@@ -178,11 +178,10 @@ pub(crate) fn realistic_env_fairness_workload_sweep() -> ForgeConfig {
                 .with_transactions_per_account(1),
         ]),
         criteria: Vec::new(),
-        background_traffic: background_traffic_for_sweep_with_latency(&[
-            (3.0, 8.0),
-            (3.0, 8.0),
-            (3.0, 4.0),
-        ]),
+        background_traffic: background_traffic_for_sweep_with_latency(
+            &[(3.0, 8.0), (3.0, 8.0), (3.0, 4.0)],
+            false,
+        ),
     })
 }
 
@@ -212,16 +211,19 @@ pub(crate) fn realistic_env_graceful_workload_sweep() -> ForgeConfig {
                 .with_transactions_per_account(1),
         ]),
         criteria: Vec::new(),
-        background_traffic: background_traffic_for_sweep_with_latency(&[
-            (4.0, 5.0),
-            (2.2, 3.0),
-            (3.5, 5.0),
-            (4.0, 6.0),
-            (2.5, 4.0),
-            (3.5, 5.0),
-            // TODO - p50 and p90 is set to high, until it is calibrated/understood.
-            (3.0, 10.0),
-        ]),
+        background_traffic: background_traffic_for_sweep_with_latency(
+            &[
+                (4.0, 5.0),
+                (2.2, 3.0),
+                (3.5, 5.0),
+                (4.0, 6.0),
+                (2.5, 4.0),
+                (3.5, 5.0),
+                // TODO - p50 and p90 is set to high, until it is calibrated/understood.
+                (3.0, 10.0),
+            ],
+            true,
+        ),
     })
     .with_emit_job(
         EmitJobRequest::default()

diff --git a/testsuite/forge-cli/src/suites/ungrouped.rs b/testsuite/forge-cli/src/suites/ungrouped.rs
@@ -403,16 +403,19 @@ fn consensus_stress_test() -> ForgeConfig {
     )
 }
 
-fn background_emit_request() -> EmitJobRequest {
-    EmitJobRequest::default()
+fn background_emit_request(high_gas_price: bool) -> EmitJobRequest {
+    let mut result = EmitJobRequest::default()
         .num_accounts_mode(NumAccountsMode::TransactionsPerAccount(1))
-        .mode(EmitJobMode::ConstTps { tps: 10 })
-        .gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE)
+        .mode(EmitJobMode::ConstTps { tps: 10 });
+    if high_gas_price {
+        result = result.gas_price(5 * aptos_global_constants::GAS_UNIT_PRICE);
+    }
+    result
 }
 
 pub fn background_traffic_for_sweep(num_cases: usize) -> Option<BackgroundTraffic> {
     Some(BackgroundTraffic {
-        traffic: background_emit_request(),
+        traffic: background_emit_request(true),
         criteria: std::iter::repeat(9.5)
             .take(num_cases)
             .map(|min_tps| {
@@ -426,9 +429,10 @@ pub fn background_traffic_for_sweep(num_cases: usize) -> Option<BackgroundTraffi
 
 pub fn background_traffic_for_sweep_with_latency(
     criteria: &[(f32, f32)],
+    high_gas_price: bool,
 ) -> Option<BackgroundTraffic> {
     Some(BackgroundTraffic {
-        traffic: background_emit_request(),
+        traffic: background_emit_request(high_gas_price),
         criteria: criteria
             .iter()
             .map(|(p50, p90)| {