From 79d29d86ca552906e055f350b8cde2539bb34ccd Mon Sep 17 00:00:00 2001 From: Robert Remen Date: Tue, 28 Nov 2023 18:44:09 +0000 Subject: [PATCH] add memory allocation limits and refactor --- src/context.rs | 18 +++++++-------- src/oracle.rs | 6 ++--- src/static_allocator/device.rs | 40 ++++++++++++++++++---------------- src/static_allocator/host.rs | 16 +++++--------- src/test.rs | 2 +- 5 files changed, 39 insertions(+), 43 deletions(-) diff --git a/src/context.rs b/src/context.rs index 37b5e87..02af562 100644 --- a/src/context.rs +++ b/src/context.rs @@ -25,7 +25,7 @@ impl ProverContext { let device_alloc = StaticDeviceAllocator::init_all(block_size)?; let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?; + let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; unsafe { _CUDA_CONTEXT = Some(cuda_ctx); @@ -42,7 +42,7 @@ impl ProverContext { } #[allow(dead_code)] - pub(crate) fn create_14gb_dev(block_size: usize) -> CudaResult { + pub(crate) fn create_limited_dev(block_size: usize) -> CudaResult { unsafe { assert!(_CUDA_CONTEXT.is_none()); assert!(_DEVICE_ALLOCATOR.is_none()); @@ -58,10 +58,9 @@ impl ProverContext { // grab small slice then consume everything let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?; - println!("allocated 14gb on device"); + let device_alloc = StaticDeviceAllocator::init_limited(block_size)?; let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?; + let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; unsafe { _CUDA_CONTEXT = Some(cuda_ctx); @@ -77,7 +76,7 @@ impl ProverContext { Ok(Self {}) } - pub fn create_14gb() -> CudaResult { + pub fn create_limited() -> CudaResult { unsafe { assert!(_CUDA_CONTEXT.is_none()); assert!(_DEVICE_ALLOCATOR.is_none()); @@ -94,10 +93,9 @@ impl ProverContext { // grab small slice then consume everything let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?; - println!("allocated 14gb on device"); + let device_alloc = StaticDeviceAllocator::init_limited(block_size)?; let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?; + let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; unsafe { _CUDA_CONTEXT = Some(cuda_ctx); @@ -124,7 +122,7 @@ impl ProverContext { let device_alloc = StaticDeviceAllocator::init_all(block_size)?; let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?; + let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; unsafe { _CUDA_CONTEXT = Some(cuda_ctx); diff --git a/src/oracle.rs b/src/oracle.rs index 44a7d80..55a4d71 100644 --- a/src/oracle.rs +++ b/src/oracle.rs @@ -410,7 +410,7 @@ mod tests { #[test] #[ignore] fn test_batch_query_for_leaf_sources() -> CudaResult<()> { - let _ctx = ProverContext::create_14gb()?; + let _ctx = ProverContext::create_limited()?; let domain_size = 1 << 16; let lde_degree = 2; let num_cols = 2; @@ -536,7 +536,7 @@ mod tests { #[test] #[ignore] fn test_batch_query_for_fri_layers() -> CudaResult<()> { - let _ctx = ProverContext::create_14gb()?; + let _ctx = ProverContext::create_limited()?; let domain_size = 1 << 16; let lde_degree = 2; let num_cols = 2; @@ -692,7 +692,7 @@ mod tests { #[test] #[ignore] fn test_batch_query_for_merkle_paths() -> CudaResult<()> { - let _ctx = ProverContext::create_14gb()?; + let _ctx = ProverContext::create_limited()?; let domain_size = 1 << 4; let lde_degree = 2; let num_cols = 2; diff --git a/src/static_allocator/device.rs b/src/static_allocator/device.rs index a51f3e4..6add434 100644 --- a/src/static_allocator/device.rs +++ b/src/static_allocator/device.rs @@ -1,4 +1,4 @@ -use cudart::memory::DeviceAllocation; +use cudart::memory::{memory_get_info, DeviceAllocation}; use super::*; use derivative::*; @@ -11,6 +11,13 @@ use std::sync::Arc; #[cfg(feature = "allocator_stats")] use std::sync::atomic::AtomicUsize; +pub const FREE_MEMORY_SLACK: usize = 1 << 23; // 8 MB +#[cfg(feature = "recompute")] +pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1340 + 32; +#[cfg(not(feature = "recompute"))] +pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1695 + 64; +pub const SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 27 + 16; + #[derive(Derivative)] #[derivative(Clone, Debug)] pub struct StaticDeviceAllocator { @@ -62,7 +69,7 @@ impl StaticDeviceAllocator { } pub fn init(num_blocks: usize, block_size: usize) -> CudaResult { - assert!(num_blocks > 32); + assert_ne!(num_blocks, 0); assert!(block_size.is_power_of_two()); let memory_size = num_blocks * block_size; let memory_size_in_bytes = memory_size * std::mem::size_of::(); @@ -73,6 +80,8 @@ impl StaticDeviceAllocator { memory_size_in_bytes )); + println!("allocated {memory_size_in_bytes} bytes on device"); + let alloc = StaticDeviceAllocator { memory: Arc::new(memory), memory_size: memory_size_in_bytes, @@ -86,34 +95,28 @@ impl StaticDeviceAllocator { } pub fn init_all(block_size: usize) -> CudaResult { - use cudart::memory::memory_get_info; - let block_size_in_bytes = block_size * std::mem::size_of::(); let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info"); - let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb - let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes; + assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK); + let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK; assert!(free_memory_size_in_bytes >= block_size); let num_blocks = free_memory_size_in_bytes / block_size_in_bytes; Self::init(num_blocks, block_size) } - pub fn init_14gb(block_size: usize) -> CudaResult { - use cudart::memory::memory_get_info; - - let block_size_in_bytes = block_size * std::mem::size_of::(); + pub fn init_limited(block_size: usize) -> CudaResult { let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info"); - let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb - let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes; - assert!(free_memory_size_in_bytes >= block_size); - let requested_memory_size_in_bytes = 14usize * 0x40000000; // 16gb + assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK); + let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK; + let block_size_in_bytes = block_size * std::mem::size_of::(); + let requested_memory_size_in_bytes = ALLOCATOR_LIMITED_BLOCKS_COUNT * block_size_in_bytes; assert!( requested_memory_size_in_bytes <= free_memory_size_in_bytes, "requested memory {}bytes, free memory {} bytes", requested_memory_size_in_bytes, free_memory_size_in_bytes ); - let num_blocks = requested_memory_size_in_bytes / block_size_in_bytes; - Self::init(num_blocks, block_size) + Self::init(ALLOCATOR_LIMITED_BLOCKS_COUNT, block_size) } fn find_free_block(&self) -> Option { @@ -269,9 +272,8 @@ pub struct SmallStaticDeviceAllocator { impl SmallStaticDeviceAllocator { pub fn init() -> CudaResult { // cuda requires alignment to be multiple of 32 goldilocks elems - let block_size = 32; - let num_blocks = 1 << 10; // 256 KB - let inner = StaticDeviceAllocator::init(num_blocks, block_size)?; + const BLOCK_SIZE: usize = 32; + let inner = StaticDeviceAllocator::init(SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT, BLOCK_SIZE)?; Ok(Self { inner }) } diff --git a/src/static_allocator/host.rs b/src/static_allocator/host.rs index f0f43d2..0d81561 100644 --- a/src/static_allocator/host.rs +++ b/src/static_allocator/host.rs @@ -19,7 +19,7 @@ pub struct StaticHostAllocator { impl Default for StaticHostAllocator { fn default() -> Self { - let _domain_size = 1 << 20; + let _domain_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH; Self::init(0, 0).unwrap() // TODO } } @@ -48,7 +48,7 @@ impl StaticHostAllocator { } pub fn init(num_blocks: usize, block_size: usize) -> CudaResult { - assert!(num_blocks > 32); + assert_ne!(num_blocks, 0); assert!(block_size.is_power_of_two()); let memory_size = num_blocks * block_size; let memory_size_in_bytes = memory_size * std::mem::size_of::(); @@ -59,11 +59,7 @@ impl StaticHostAllocator { &format!("failed to allocate {} bytes", memory_size_in_bytes), ); - println!( - "allocated {} bytes({}gb) on device on host", - memory_size_in_bytes, - memory_size_in_bytes / 0x40000000 - ); + println!("allocated {memory_size_in_bytes} bytes on host"); let alloc = StaticHostAllocator { memory: Arc::new(memory), @@ -205,10 +201,10 @@ pub struct SmallStaticHostAllocator { impl SmallStaticHostAllocator { pub fn init() -> CudaResult { + const NUM_BLOCKS: usize = 1 << 8; // cuda requires alignment to be multiple of 32 goldilocks elems - let block_size = 32; - let num_blocks = 1 << 20; // <1gb - let inner = StaticHostAllocator::init(num_blocks, block_size)?; + const BLOCK_SIZE: usize = 32; + let inner = StaticHostAllocator::init(NUM_BLOCKS, BLOCK_SIZE)?; Ok(Self { inner }) } diff --git a/src/test.rs b/src/test.rs index d8cf1ab..5553c11 100644 --- a/src/test.rs +++ b/src/test.rs @@ -1035,7 +1035,7 @@ mod zksync { #[ignore] fn compare_proofs_for_single_zksync_circuit_in_single_shot() { let circuit = get_circuit_from_env(); - let _ctx = ProverContext::create_14gb().expect("gpu prover context"); + let _ctx = ProverContext::create_limited().expect("gpu prover context"); println!( "{} {}",