Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Permalink
implement self-adjustable caching (#27)
Browse files Browse the repository at this point in the history
# What ❔

This PR implements caching logic with self-adjustable caching
configurations for setup/trace/arguments polynomials.

## Why ❔

Adjustable ratios between caching and re-computation of various
polynomial groups allow for variable tradeoff between memory usage and
performance.
This PR implements a cost model and the logic for determining the optimal
caching strategy for the amount of memory that is available.
The result is that when ~21 GB or more of GPU memory is available, full
performance can be extracted; at the other end of the spectrum, the prover
can run with less than 6 GB of GPU RAM, although with significantly lower
performance than at full speed.

Here are the results of benchmarks run on an L4 GPU for the MainVM base
layer circuit, showing the tradeoff between performance and memory usage. Keep
in mind that the memory usage values are the amounts of memory reserved
for the various data structures used by the prover. There are additional
memory requirements coming from the OS, the GPU driver, or other processes;
these can differ significantly depending on the particular GPU model and the
OS. For example, for an L4 running under Linux, the observed additional
allocated memory amounted to ~0.75 GB.

| VRAM used (GB)|First run (s)|Subsequent runs (s)|
|-:|-:|-:|
|20.0|1.976|1.304|
|19.5|1.999|1.339|
|19.0|2.038|1.380|
|18.5|2.133|1.467|
|14.5|2.012|1.534|
|13.5|2.060|1.570|
|13.0|2.099|1.614|
|12.5|2.116|1.653|
|12.0|2.151|1.667|
|11.5|2.185|1.701|
|11.0|2.221|1.747|
|10.5|2.278|1.800|
|10.0|2.310|1.827|
|9.5|2.363|1.883|
|9.0|2.442|1.964|
|8.5|2.492|2.017|
|8.0|2.580|2.104|
|7.5|2.721|2.243|
|7.0|2.990|2.485|
|6.5|3.022|2.523|
|6.0|3.400|2.892|
|5.5|3.480|2.980|

## Checklist

- [x] PR title corresponds to the body of PR (we generate changelog
entries from PRs).
- [x] Tests for the changes have been added / updated.
- [x] Documentation comments have been added / updated.
- [x] Code has been formatted via `cargo fmt` and linted with `cargo
check`.
  • Loading branch information
robik75 authored Feb 6, 2024
1 parent 6592cd1 commit 350fbf8
Show file tree
Hide file tree
Showing 32 changed files with 2,795 additions and 2,161 deletions.
10 changes: 4 additions & 6 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,14 +6,10 @@ edition = "2021"
# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html

[dependencies]
# boojum = { path = "../era-boojum", package = "boojum" }
# boojum-cuda = { path = "../era-boojum-cuda" }
# cudart = { path = "../era-cuda/cudart", package = "cudart" }
# circuit_definitions = { path = "../era-zkevm_test_harness/circuit_definitions", package = "circuit_definitions", optional = true }

boojum = { git = "https://github.com/matter-labs/era-boojum", branch = "main" }
boojum-cuda = { git = "https://github.com/matter-labs/era-boojum-cuda", branch = "main" }
cudart = { git = "https://github.com/matter-labs/era-cuda", branch = "main", package = "cudart" }
cudart-sys = { git = "https://github.com/matter-labs/era-cuda", branch = "main", package = "cudart-sys" }
circuit_definitions = { git = "https://github.com/matter-labs/era-zkevm_test_harness", branch = "v1.4.1", package = "circuit_definitions", optional = true }

rand = "0.8"
Expand All @@ -36,5 +32,7 @@ serial_test = "^2"
[features]
default = ["zksync"]
zksync = ["circuit_definitions"]
recompute = []
allocator_stats = []

[profile.release]
incremental = true
27 changes: 8 additions & 19 deletions src/constraint_evaluation.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,23 +191,18 @@ pub fn multi_polys_as_single_slice_mut<'a, P: PolyForm>(polys: &mut [Poly<'a, P>
}

// Accumulates into quotient
pub fn generic_evaluate_constraints_by_coset<'a, 'b>(
trace_polys: &TracePolynomials<'a, CosetEvaluations>,
setup_polys: &SetupPolynomials<'a, CosetEvaluations>,
gates: &[cs_helpers::GateEvaluationParams],
pub fn generic_evaluate_constraints_by_coset(
variable_cols: &[Poly<CosetEvaluations>],
witness_cols: &[Poly<CosetEvaluations>],
constant_cols: &[Poly<CosetEvaluations>],
gates: &[GateEvaluationParams],
_selectors_placement: TreeNode,
domain_size: usize,
challenge: EF,
challenge_power_offset: usize,
quotient: &mut ComplexPoly<'b, CosetEvaluations>,
) -> CudaResult<()>
where
'a: 'b,
{
assert_eq!(
trace_polys.variable_cols[0].domain_size(),
quotient.domain_size()
);
quotient: &mut ComplexPoly<CosetEvaluations>,
) -> CudaResult<()> {
assert_eq!(variable_cols[0].domain_size(), quotient.domain_size());

let quotient_as_single_slice = unsafe {
assert_eq!(
Expand All @@ -217,12 +212,6 @@ where
let len = 2 * quotient.domain_size();
std::slice::from_raw_parts_mut(quotient.c0.storage.as_mut().as_mut_ptr(), len)
};
let TracePolynomials {
variable_cols,
witness_cols,
multiplicity_cols: _,
} = trace_polys;
let SetupPolynomials { constant_cols, .. } = setup_polys;

let all_variable_cols = multi_polys_as_single_slice(&variable_cols);
let all_witness_cols = multi_polys_as_single_slice(&witness_cols);
Expand Down
221 changes: 112 additions & 109 deletions src/context.rs
Original file line number Diff line number Diff line change
@@ -1,147 +1,104 @@
use super::*;
use boojum_cuda::context::Context;
use std::collections::HashMap;

pub struct ProverContext;

pub const ZKSYNC_DEFAULT_TRACE_LOG_LENGTH: usize = 20;

impl ProverContext {
pub fn create() -> CudaResult<Self> {
fn create_internal(
cuda_ctx: Context,
small_device_alloc: SmallStaticDeviceAllocator,
device_alloc: StaticDeviceAllocator,
small_host_alloc: SmallStaticHostAllocator,
host_alloc: StaticHostAllocator,
) -> CudaResult<Self> {
unsafe {
assert!(_CUDA_CONTEXT.is_none());
assert!(_DEVICE_ALLOCATOR.is_none());
assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
assert!(_HOST_ALLOCATOR.is_none());
assert!(_SMALL_HOST_ALLOCATOR.is_none());
assert!(_EXEC_STREAM.is_none());
assert!(_H2D_STREAM.is_none());
assert!(_D2H_STREAM.is_none());
}
// size counts in field elements
let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
let cuda_ctx = CudaContext::create(12, 12)?;

// grab small slice then consume everything
let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_all(block_size)?;

let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
assert!(_DEVICE_ALLOCATOR.is_none());
_DEVICE_ALLOCATOR = Some(device_alloc);
assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
_SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
assert!(_HOST_ALLOCATOR.is_none());
_HOST_ALLOCATOR = Some(host_alloc);
assert!(_SMALL_HOST_ALLOCATOR.is_none());
_SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
assert!(_EXEC_STREAM.is_none());
_EXEC_STREAM = Some(Stream::create()?);
assert!(_H2D_STREAM.is_none());
_H2D_STREAM = Some(Stream::create()?);
assert!(_D2H_STREAM.is_none());
_D2H_STREAM = Some(Stream::create()?);
}

assert!(_SETUP_CACHE.is_none());
assert!(_STRATEGY_CACHE.is_none());
_STRATEGY_CACHE = Some(HashMap::new());
};
Ok(Self {})
}

#[allow(dead_code)]
pub(crate) fn create_limited_dev(block_size: usize) -> CudaResult<Self> {
unsafe {
assert!(_CUDA_CONTEXT.is_none());
assert!(_DEVICE_ALLOCATOR.is_none());
assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
assert!(_HOST_ALLOCATOR.is_none());
assert!(_SMALL_HOST_ALLOCATOR.is_none());
assert!(_EXEC_STREAM.is_none());
assert!(_H2D_STREAM.is_none());
assert!(_D2H_STREAM.is_none());
}
pub fn create() -> CudaResult<Self> {
// size counts in field elements
let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
let cuda_ctx = CudaContext::create(12, 12)?;

// grab small slice then consume everything
let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
_DEVICE_ALLOCATOR = Some(device_alloc);
_SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
_HOST_ALLOCATOR = Some(host_alloc);
_SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
_EXEC_STREAM = Some(Stream::create()?);
_H2D_STREAM = Some(Stream::create()?);
_D2H_STREAM = Some(Stream::create()?);
}

Ok(Self {})
Self::create_internal(
cuda_ctx,
small_device_alloc,
device_alloc,
small_host_alloc,
host_alloc,
)
}

pub fn create_limited() -> CudaResult<Self> {
unsafe {
assert!(_CUDA_CONTEXT.is_none());
assert!(_DEVICE_ALLOCATOR.is_none());
assert!(_SMALL_DEVICE_ALLOCATOR.is_none());
assert!(_HOST_ALLOCATOR.is_none());
assert!(_SMALL_HOST_ALLOCATOR.is_none());
assert!(_EXEC_STREAM.is_none());
assert!(_H2D_STREAM.is_none());
assert!(_D2H_STREAM.is_none());
}
pub fn create_limited(num_blocks: usize) -> CudaResult<Self> {
// size counts in field elements
let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
let cuda_ctx = CudaContext::create(12, 12)?;

// grab small slice then consume everything
let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
let device_alloc = StaticDeviceAllocator::init(num_blocks, block_size)?;
let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
_DEVICE_ALLOCATOR = Some(device_alloc);
_SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
_HOST_ALLOCATOR = Some(host_alloc);
_SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
_EXEC_STREAM = Some(Stream::create()?);
_H2D_STREAM = Some(Stream::create()?);
_D2H_STREAM = Some(Stream::create()?);
}

Ok(Self {})
Self::create_internal(
cuda_ctx,
small_device_alloc,
device_alloc,
small_host_alloc,
host_alloc,
)
}

#[allow(dead_code)]
#[cfg(test)]
pub(crate) fn dev(domain_size: usize) -> CudaResult<Self> {
assert!(domain_size.is_power_of_two());
// size counts in field elements
let block_size = domain_size;
let cuda_ctx = CudaContext::create(12, 12)?;

let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_all(block_size)?;

let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
_DEVICE_ALLOCATOR = Some(device_alloc);
_SMALL_DEVICE_ALLOCATOR = Some(small_device_alloc);
_HOST_ALLOCATOR = Some(host_alloc);
_SMALL_HOST_ALLOCATOR = Some(small_host_alloc);
_EXEC_STREAM = Some(Stream::create()?);
_H2D_STREAM = Some(Stream::create()?);
_D2H_STREAM = Some(Stream::create()?);
}

Ok(Self {})
Self::create_internal(
cuda_ctx,
small_device_alloc,
device_alloc,
small_host_alloc,
host_alloc,
)
}
}

impl Drop for ProverContext {
fn drop(&mut self) {
unsafe {
_setup_cache_reset();

let cuda_ctx = _CUDA_CONTEXT.take().expect("cuda ctx");
cuda_ctx.destroy().expect("destroy cuda ctx");

Expand Down Expand Up @@ -183,14 +140,16 @@ impl Drop for ProverContext {
.inner
.destroy()
.expect("destroy d2h stream");

drop(_STRATEGY_CACHE.take());
}
}
}

pub(crate) static mut _CUDA_CONTEXT: Option<CudaContext> = None;
pub(crate) static mut _EXEC_STREAM: Option<Stream> = None;
pub(crate) static mut _H2D_STREAM: Option<Stream> = None;
pub(crate) static mut _D2H_STREAM: Option<Stream> = None;
static mut _CUDA_CONTEXT: Option<CudaContext> = None;
static mut _EXEC_STREAM: Option<Stream> = None;
static mut _H2D_STREAM: Option<Stream> = None;
static mut _D2H_STREAM: Option<Stream> = None;

pub(crate) fn get_stream() -> &'static CudaStream {
unsafe { &_EXEC_STREAM.as_ref().expect("execution stream").inner }
Expand All @@ -207,11 +166,11 @@ pub(crate) fn get_d2h_stream() -> &'static CudaStream {
}

pub fn synchronize_streams() -> CudaResult<()> {
get_stream().synchronize()?;
get_h2d_stream().synchronize()?;
get_d2h_stream().synchronize()?;

Ok(())
if_not_dry_run! {
get_stream().synchronize()?;
get_h2d_stream().synchronize()?;
get_d2h_stream().synchronize()
}
}

// use custom wrapper to work around send + sync requirement of static var
Expand All @@ -230,38 +189,82 @@ impl Stream {
unsafe impl Send for Stream {}
unsafe impl Sync for Stream {}

pub(crate) static mut _DEVICE_ALLOCATOR: Option<StaticDeviceAllocator> = None;
pub(crate) static mut _SMALL_DEVICE_ALLOCATOR: Option<SmallStaticDeviceAllocator> = None;
pub(crate) static mut _HOST_ALLOCATOR: Option<StaticHostAllocator> = None;
pub(crate) static mut _SMALL_HOST_ALLOCATOR: Option<SmallStaticHostAllocator> = None;
static mut _DEVICE_ALLOCATOR: Option<StaticDeviceAllocator> = None;
static mut _SMALL_DEVICE_ALLOCATOR: Option<SmallStaticDeviceAllocator> = None;
static mut _HOST_ALLOCATOR: Option<StaticHostAllocator> = None;
static mut _SMALL_HOST_ALLOCATOR: Option<SmallStaticHostAllocator> = None;

pub(crate) fn _alloc() -> &'static StaticDeviceAllocator {
unsafe {
&_DEVICE_ALLOCATOR
_DEVICE_ALLOCATOR
.as_ref()
.expect("device allocator should be initialized")
}
}

pub(crate) fn _small_alloc() -> &'static SmallStaticDeviceAllocator {
unsafe {
&_SMALL_DEVICE_ALLOCATOR
_SMALL_DEVICE_ALLOCATOR
.as_ref()
.expect("small device allocator should be initialized")
}
}
pub(crate) fn _host_alloc() -> &'static StaticHostAllocator {
unsafe {
&_HOST_ALLOCATOR
_HOST_ALLOCATOR
.as_ref()
.expect("host allocator should be initialized")
}
}

pub(crate) fn _small_host_alloc() -> &'static SmallStaticHostAllocator {
unsafe {
&_SMALL_HOST_ALLOCATOR
_SMALL_HOST_ALLOCATOR
.as_ref()
.expect("small host allocator should be initialized")
}
}

static mut _SETUP_CACHE: Option<SetupCache> = None;

/// Returns a mutable reference to the globally stored setup cache, or `None`
/// when no cache is currently stored.
// NOTE(review): every call hands out a fresh `&'static mut` into a `static mut`;
// presumably callers never hold two of these at once and stay on a single
// thread — confirm against the call sites.
pub(crate) fn _setup_cache_get() -> Option<&'static mut SetupCache> {
unsafe { _SETUP_CACHE.as_mut() }
}

/// Stores `value` as the global setup cache.
///
/// # Panics
///
/// Panics if a setup cache is already stored; the slot must be empty
/// (for example after `_setup_cache_reset`) before a new cache can be set.
pub(crate) fn _setup_cache_set(value: SetupCache) {
unsafe {
// Refuse to silently overwrite an existing cache.
assert!(_SETUP_CACHE.is_none());
_SETUP_CACHE = Some(value)
}
}

/// Clears the global setup cache, dropping any cache that was stored.
///
/// Safe to call when the slot is already empty; it simply stays `None`.
pub(crate) fn _setup_cache_reset() {
unsafe { _SETUP_CACHE = None }
}

static mut _STRATEGY_CACHE: Option<HashMap<Vec<[F; 4]>, CacheStrategy>> = None;

/// Returns a mutable reference to the global strategy cache, a map from
/// `Vec<[F; 4]>` keys to the `CacheStrategy` stored for each key.
///
/// # Panics
///
/// Panics with "strategy cache should be initialized" if the cache slot is
/// empty.
// NOTE(review): hands out `&'static mut` into a `static mut`; presumably used
// from a single thread with no overlapping borrows — confirm.
pub(crate) fn _strategy_cache_get() -> &'static mut HashMap<Vec<[F; 4]>, CacheStrategy> {
unsafe {
_STRATEGY_CACHE
.as_mut()
.expect("strategy cache should be initialized")
}
}
/// Replaces the global strategy cache with a fresh, empty map, discarding
/// every previously stored strategy. The slot is left populated (`Some`),
/// whether or not it held a map before the call.
pub(crate) fn _strategy_cache_reset() {
unsafe { _STRATEGY_CACHE = Some(HashMap::new()) }
}

/// Reports whether the prover's global state is fully populated.
///
/// Returns `true` only when the CUDA context, the three streams (execution,
/// host-to-device, device-to-host), the four allocators and the strategy
/// cache are all present. The setup cache is not part of this check.
pub(crate) fn is_prover_context_initialized() -> bool {
    unsafe {
        // All operands are side-effect-free `is_some()` reads, so grouping
        // them and short-circuiting yields the same answer as one flat chain.
        let streams_ready =
            _EXEC_STREAM.is_some() && _H2D_STREAM.is_some() && _D2H_STREAM.is_some();
        let allocators_ready = _DEVICE_ALLOCATOR.is_some()
            && _SMALL_DEVICE_ALLOCATOR.is_some()
            && _HOST_ALLOCATOR.is_some()
            && _SMALL_HOST_ALLOCATOR.is_some();

        _CUDA_CONTEXT.is_some()
            && streams_ready
            && allocators_ready
            && _STRATEGY_CACHE.is_some()
    }
}
Loading

0 comments on commit 350fbf8

Please sign in to comment.