implement self-adjustable caching (#27)

# What ❔

This PR implements caching logic with self-adjustable caching configurations for setup/trace/arguments polynomials.

## Why ❔

Adjustable ratios between caching and re-computation of various polynomial groups allow for a variable tradeoff between memory usage and performance. This PR implements a cost model and the logic for determining the optimal caching strategy for the amount of memory that is available. The result is that when the amount of available GPU memory is ~21 GB or more, full performance can be extracted, and at the other end of the spectrum the prover can run with less than 6 GB of GPU RAM, although with significantly lower performance than the full performance.

Here are the results of benchmarks done on an L4 GPU for the MainVM base layer circuit to show the tradeoff of performance vs. memory usage. Keep in mind that the values for memory usage are the amounts of memory reserved for various data structures used by the prover; there are additional memory requirements coming from the OS, GPU driver or other processes, and these can differ significantly based on the particular GPU model and the OS. For example, for an L4 running under Linux, the observed additional allocated memory amounted to ~0.75 GB.

| VRAM used (GB)|First run (s)|Subsequent runs (s)|
|-:|-:|-:|
|20.0|1.976|1.304|
|19.5|1.999|1.339|
|19.0|2.038|1.380|
|18.5|2.133|1.467|
|14.5|2.012|1.534|
|13.5|2.060|1.570|
|13.0|2.099|1.614|
|12.5|2.116|1.653|
|12.0|2.151|1.667|
|11.5|2.185|1.701|
|11.0|2.221|1.747|
|10.5|2.278|1.800|
|10.0|2.310|1.827|
|9.5|2.363|1.883|
|9.0|2.442|1.964|
|8.5|2.492|2.017|
|8.0|2.580|2.104|
|7.5|2.721|2.243|
|7.0|2.990|2.485|
|6.5|3.022|2.523|
|6.0|3.400|2.892|
|5.5|3.480|2.980|

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `cargo fmt` and linted with `cargo check`.
matter-labs · Feb 6, 2024 · 350fbf8 · 350fbf8
1 parent 6592cd1
commit 350fbf8
Show file tree

Hide file tree

Showing 32 changed files with 2,795 additions and 2,161 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -6,14 +6,10 @@ edition = "2021"
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-# boojum = { path = "../era-boojum", package = "boojum" }
-# boojum-cuda = { path = "../era-boojum-cuda" }
-# cudart = { path = "../era-cuda/cudart", package = "cudart" }
-# circuit_definitions = { path = "../era-zkevm_test_harness/circuit_definitions", package = "circuit_definitions", optional = true }
-
 boojum = { git = "https://github.com/matter-labs/era-boojum", branch = "main" }
 boojum-cuda = { git = "https://github.com/matter-labs/era-boojum-cuda", branch = "main" }
 cudart = { git = "https://github.com/matter-labs/era-cuda", branch = "main", package = "cudart" }
+cudart-sys = { git = "https://github.com/matter-labs/era-cuda", branch = "main", package = "cudart-sys" }
 circuit_definitions = { git = "https://github.com/matter-labs/era-zkevm_test_harness", branch = "v1.4.1", package = "circuit_definitions", optional = true }
 
 rand = "0.8"
@@ -36,5 +32,7 @@ serial_test = "^2"
 [features]
 default = ["zksync"]
 zksync = ["circuit_definitions"]
-recompute = []
 allocator_stats = []
+
+[profile.release]
+incremental = true
diff --git a/src/constraint_evaluation.rs b/src/constraint_evaluation.rs
@@ -191,23 +191,18 @@ pub fn multi_polys_as_single_slice_mut<'a, P: PolyForm>(polys: &mut [Poly<'a, P>
 }
 
 // Accumulates into quotient
-pub fn generic_evaluate_constraints_by_coset<'a, 'b>(
-    trace_polys: &TracePolynomials<'a, CosetEvaluations>,
-    setup_polys: &SetupPolynomials<'a, CosetEvaluations>,
-    gates: &[cs_helpers::GateEvaluationParams],
+pub fn generic_evaluate_constraints_by_coset(
+    variable_cols: &[Poly<CosetEvaluations>],
+    witness_cols: &[Poly<CosetEvaluations>],
+    constant_cols: &[Poly<CosetEvaluations>],
+    gates: &[GateEvaluationParams],
     _selectors_placement: TreeNode,
     domain_size: usize,
     challenge: EF,
     challenge_power_offset: usize,
-    quotient: &mut ComplexPoly<'b, CosetEvaluations>,
-) -> CudaResult<()>
-where
-    'a: 'b,
-{
-    assert_eq!(
-        trace_polys.variable_cols[0].domain_size(),
-        quotient.domain_size()
-    );
+    quotient: &mut ComplexPoly<CosetEvaluations>,
+) -> CudaResult<()> {
+    assert_eq!(variable_cols[0].domain_size(), quotient.domain_size());
 
     let quotient_as_single_slice = unsafe {
         assert_eq!(
@@ -217,12 +212,6 @@ where
         let len = 2 * quotient.domain_size();
         std::slice::from_raw_parts_mut(quotient.c0.storage.as_mut().as_mut_ptr(), len)
     };
-    let TracePolynomials {
-        variable_cols,
-        witness_cols,
-        multiplicity_cols: _,
-    } = trace_polys;
-    let SetupPolynomials { constant_cols, .. } = setup_polys;
 
     let all_variable_cols = multi_polys_as_single_slice(&variable_cols);
     let all_witness_cols = multi_polys_as_single_slice(&witness_cols);

diff --git a/src/context.rs b/src/context.rs
@@ -1,147 +1,104 @@
 use super::*;
+use boojum_cuda::context::Context;
+use std::collections::HashMap;
 
 pub struct ProverContext;
 
 pub const ZKSYNC_DEFAULT_TRACE_LOG_LENGTH: usize = 20;
 
 impl ProverContext {
-    pub fn create() -> CudaResult<Self> {
+    fn create_internal(
+        cuda_ctx: Context,
+        small_device_alloc: SmallStaticDeviceAllocator,
+        device_alloc: StaticDeviceAllocator,
+        small_host_alloc: SmallStaticHostAllocator,
+        host_alloc: StaticHostAllocator,
+    ) -> CudaResult<Self> {
         unsafe {
             assert!(_CUDA_CONTEXT.is_none());
-            assert!(_DEVICE_ALLOCATOR.is_none());
-            assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
-            assert!(_HOST_ALLOCATOR.is_none());
-            assert!(_SMALL_HOST_ALLOCATOR.is_none());
-            assert!(_EXEC_STREAM.is_none());
-            assert!(_H2D_STREAM.is_none());
-            assert!(_D2H_STREAM.is_none());
-        }
-        // size counts in field elements
-        let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
-        let cuda_ctx = CudaContext::create(12, 12)?;
-
-        // grab small slice then consume everything
-        let small_device_alloc = SmallStaticDeviceAllocator::init()?;
-        let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
-
-        let small_host_alloc = SmallStaticHostAllocator::init()?;
-        let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
-
-        unsafe {
             _CUDA_CONTEXT = Some(cuda_ctx);
+            assert!(_DEVICE_ALLOCATOR.is_none());
             _DEVICE_ALLOCATOR = Some(device_alloc);
+            assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
             _SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
+            assert!(_HOST_ALLOCATOR.is_none());
             _HOST_ALLOCATOR = Some(host_alloc);
+            assert!(_SMALL_HOST_ALLOCATOR.is_none());
             _SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
+            assert!(_EXEC_STREAM.is_none());
             _EXEC_STREAM = Some(Stream::create()?);
+            assert!(_H2D_STREAM.is_none());
             _H2D_STREAM = Some(Stream::create()?);
+            assert!(_D2H_STREAM.is_none());
             _D2H_STREAM = Some(Stream::create()?);
-        }
-
+            assert!(_SETUP_CACHE.is_none());
+            assert!(_STRATEGY_CACHE.is_none());
+            _STRATEGY_CACHE = Some(HashMap::new());
+        };
         Ok(Self {})
     }
 
-    #[allow(dead_code)]
-    pub(crate) fn create_limited_dev(block_size: usize) -> CudaResult<Self> {
-        unsafe {
-            assert!(_CUDA_CONTEXT.is_none());
-            assert!(_DEVICE_ALLOCATOR.is_none());
-            assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
-            assert!(_HOST_ALLOCATOR.is_none());
-            assert!(_SMALL_HOST_ALLOCATOR.is_none());
-            assert!(_EXEC_STREAM.is_none());
-            assert!(_H2D_STREAM.is_none());
-            assert!(_D2H_STREAM.is_none());
-        }
+    pub fn create() -> CudaResult<Self> {
         // size counts in field elements
+        let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
         let cuda_ctx = CudaContext::create(12, 12)?;
-
         // grab small slice then consume everything
         let small_device_alloc = SmallStaticDeviceAllocator::init()?;
-        let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
+        let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
         let small_host_alloc = SmallStaticHostAllocator::init()?;
         let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
-
-        unsafe {
-            _CUDA_CONTEXT = Some(cuda_ctx);
-            _DEVICE_ALLOCATOR = Some(device_alloc);
-            _SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
-            _HOST_ALLOCATOR = Some(host_alloc);
-            _SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
-            _EXEC_STREAM = Some(Stream::create()?);
-            _H2D_STREAM = Some(Stream::create()?);
-            _D2H_STREAM = Some(Stream::create()?);
-        }
-
-        Ok(Self {})
+        Self::create_internal(
+            cuda_ctx,
+            small_device_alloc,
+            device_alloc,
+            small_host_alloc,
+            host_alloc,
+        )
     }
 
-    pub fn create_limited() -> CudaResult<Self> {
-        unsafe {
-            assert!(_CUDA_CONTEXT.is_none());
-            assert!(_DEVICE_ALLOCATOR.is_none());
-            assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
-            assert!(_HOST_ALLOCATOR.is_none());
-            assert!(_SMALL_HOST_ALLOCATOR.is_none());
-            assert!(_EXEC_STREAM.is_none());
-            assert!(_H2D_STREAM.is_none());
-            assert!(_D2H_STREAM.is_none());
-        }
+    pub fn create_limited(num_blocks: usize) -> CudaResult<Self> {
         // size counts in field elements
         let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
         let cuda_ctx = CudaContext::create(12, 12)?;
-
         // grab small slice then consume everything
         let small_device_alloc = SmallStaticDeviceAllocator::init()?;
-        let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
+        let device_alloc = StaticDeviceAllocator::init(num_blocks, block_size)?;
         let small_host_alloc = SmallStaticHostAllocator::init()?;
         let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
-
-        unsafe {
-            _CUDA_CONTEXT = Some(cuda_ctx);
-            _DEVICE_ALLOCATOR = Some(device_alloc);
-            _SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
-            _HOST_ALLOCATOR = Some(host_alloc);
-            _SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
-            _EXEC_STREAM = Some(Stream::create()?);
-            _H2D_STREAM = Some(Stream::create()?);
-            _D2H_STREAM = Some(Stream::create()?);
-        }
-
-        Ok(Self {})
+        Self::create_internal(
+            cuda_ctx,
+            small_device_alloc,
+            device_alloc,
+            small_host_alloc,
+            host_alloc,
+        )
     }
 
-    #[allow(dead_code)]
+    #[cfg(test)]
     pub(crate) fn dev(domain_size: usize) -> CudaResult<Self> {
         assert!(domain_size.is_power_of_two());
         // size counts in field elements
         let block_size = domain_size;
         let cuda_ctx = CudaContext::create(12, 12)?;
-
         let small_device_alloc = SmallStaticDeviceAllocator::init()?;
         let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
-
         let small_host_alloc = SmallStaticHostAllocator::init()?;
         let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
-
-        unsafe {
-            _CUDA_CONTEXT = Some(cuda_ctx);
-            _DEVICE_ALLOCATOR = Some(device_alloc);
-            _SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
-            _HOST_ALLOCATOR = Some(host_alloc);
-            _SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
-            _EXEC_STREAM = Some(Stream::create()?);
-            _H2D_STREAM = Some(Stream::create()?);
-            _D2H_STREAM = Some(Stream::create()?);
-        }
-
-        Ok(Self {})
+        Self::create_internal(
+            cuda_ctx,
+            small_device_alloc,
+            device_alloc,
+            small_host_alloc,
+            host_alloc,
+        )
     }
 }
 
 impl Drop for ProverContext {
     fn drop(&mut self) {
         unsafe {
+            _setup_cache_reset();
+
             let cuda_ctx = _CUDA_CONTEXT.take().expect("cuda ctx");
             cuda_ctx.destroy().expect("destroy cuda ctx");
 
@@ -183,14 +140,16 @@ impl Drop for ProverContext {
                 .inner
                 .destroy()
                 .expect("destroy d2h stream");
+
+            drop(_STRATEGY_CACHE.take());
         }
     }
 }
 
-pub(crate) static mut _CUDA_CONTEXT: Option<CudaContext> = None;
-pub(crate) static mut _EXEC_STREAM: Option<Stream> = None;
-pub(crate) static mut _H2D_STREAM: Option<Stream> = None;
-pub(crate) static mut _D2H_STREAM: Option<Stream> = None;
+static mut _CUDA_CONTEXT: Option<CudaContext> = None;
+static mut _EXEC_STREAM: Option<Stream> = None;
+static mut _H2D_STREAM: Option<Stream> = None;
+static mut _D2H_STREAM: Option<Stream> = None;
 
 pub(crate) fn get_stream() -> &'static CudaStream {
     unsafe { &_EXEC_STREAM.as_ref().expect("execution stream").inner }
@@ -207,11 +166,11 @@ pub(crate) fn get_d2h_stream() -> &'static CudaStream {
 }
 
 pub fn synchronize_streams() -> CudaResult<()> {
-    get_stream().synchronize()?;
-    get_h2d_stream().synchronize()?;
-    get_d2h_stream().synchronize()?;
-
-    Ok(())
+    if_not_dry_run! {
+        get_stream().synchronize()?;
+        get_h2d_stream().synchronize()?;
+        get_d2h_stream().synchronize()
+    }
 }
 
 // use custom wrapper to work around send + sync requirement of static var
@@ -230,38 +189,82 @@ impl Stream {
 unsafe impl Send for Stream {}
 unsafe impl Sync for Stream {}
 
-pub(crate) static mut _DEVICE_ALLOCATOR: Option<StaticDeviceAllocator> = None;
-pub(crate) static mut _SMALL_DEVICE_ALLOCATOR: Option<SmallStaticDeviceAllocator> = None;
-pub(crate) static mut _HOST_ALLOCATOR: Option<StaticHostAllocator> = None;
-pub(crate) static mut _SMALL_HOST_ALLOCATOR: Option<SmallStaticHostAllocator> = None;
+static mut _DEVICE_ALLOCATOR: Option<StaticDeviceAllocator> = None;
+static mut _SMALL_DEVICE_ALLOCATOR: Option<SmallStaticDeviceAllocator> = None;
+static mut _HOST_ALLOCATOR: Option<StaticHostAllocator> = None;
+static mut _SMALL_HOST_ALLOCATOR: Option<SmallStaticHostAllocator> = None;
 
 pub(crate) fn _alloc() -> &'static StaticDeviceAllocator {
     unsafe {
-        &_DEVICE_ALLOCATOR
+        _DEVICE_ALLOCATOR
             .as_ref()
             .expect("device allocator should be initialized")
     }
 }
 
 pub(crate) fn _small_alloc() -> &'static SmallStaticDeviceAllocator {
     unsafe {
-        &_SMALL_DEVICE_ALLOCATOR
+        _SMALL_DEVICE_ALLOCATOR
             .as_ref()
             .expect("small device allocator should be initialized")
     }
 }
 pub(crate) fn _host_alloc() -> &'static StaticHostAllocator {
     unsafe {
-        &_HOST_ALLOCATOR
+        _HOST_ALLOCATOR
             .as_ref()
             .expect("host allocator should be initialized")
     }
 }
 
 pub(crate) fn _small_host_alloc() -> &'static SmallStaticHostAllocator {
     unsafe {
-        &_SMALL_HOST_ALLOCATOR
+        _SMALL_HOST_ALLOCATOR
             .as_ref()
             .expect("small host allocator should be initialized")
     }
 }
+
+static mut _SETUP_CACHE: Option<SetupCache> = None;
+
+pub(crate) fn _setup_cache_get() -> Option<&'static mut SetupCache> {
+    unsafe { _SETUP_CACHE.as_mut() }
+}
+
+pub(crate) fn _setup_cache_set(value: SetupCache) {
+    unsafe {
+        assert!(_SETUP_CACHE.is_none());
+        _SETUP_CACHE = Some(value)
+    }
+}
+
+pub(crate) fn _setup_cache_reset() {
+    unsafe { _SETUP_CACHE = None }
+}
+
+static mut _STRATEGY_CACHE: Option<HashMap<Vec<[F; 4]>, CacheStrategy>> = None;
+
+pub(crate) fn _strategy_cache_get() -> &'static mut HashMap<Vec<[F; 4]>, CacheStrategy> {
+    unsafe {
+        _STRATEGY_CACHE
+            .as_mut()
+            .expect("strategy cache should be initialized")
+    }
+}
+pub(crate) fn _strategy_cache_reset() {
+    unsafe { _STRATEGY_CACHE = Some(HashMap::new()) }
+}
+
+pub(crate) fn is_prover_context_initialized() -> bool {
+    unsafe {
+        _CUDA_CONTEXT.is_some()
+            & _EXEC_STREAM.is_some()
+            & _H2D_STREAM.is_some()
+            & _D2H_STREAM.is_some()
+            & _DEVICE_ALLOCATOR.is_some()
+            & _SMALL_DEVICE_ALLOCATOR.is_some()
+            & _HOST_ALLOCATOR.is_some()
+            & _SMALL_HOST_ALLOCATOR.is_some()
+            & _STRATEGY_CACHE.is_some()
+    }
+}