From 79c5f914234812483091f7ac69d8e307a807c9ce Mon Sep 17 00:00:00 2001 From: Robert Remen Date: Mon, 8 Apr 2024 16:44:57 +0200 Subject: [PATCH] improve device allocator memory allocation logic --- src/context.rs | 2 +- src/static_allocator/device.rs | 65 ++++++++++++++++++++++------------ 2 files changed, 43 insertions(+), 24 deletions(-) diff --git a/src/context.rs b/src/context.rs index 81939e6..502bfcc 100644 --- a/src/context.rs +++ b/src/context.rs @@ -121,7 +121,7 @@ impl ProverContext { let cuda_ctx = CudaContext::create(12, 12)?; // grab small slice then consume everything let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init(num_blocks, block_size)?; + let device_alloc = StaticDeviceAllocator::init(num_blocks, num_blocks, block_size)?; let small_host_alloc = SmallStaticHostAllocator::init()?; let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; Self::create_internal( diff --git a/src/static_allocator/device.rs b/src/static_allocator/device.rs index c942fcf..24a7892 100644 --- a/src/static_allocator/device.rs +++ b/src/static_allocator/device.rs @@ -10,6 +10,7 @@ use std::ptr::NonNull; use std::sync::{Arc, Mutex}; pub const FREE_MEMORY_SLACK: usize = 1 << 23; // 8 MB +pub const MIN_NUM_BLOCKS: usize = 512; pub const SMALL_ALLOCATOR_BLOCKS_COUNT: usize = 1 << 10; // 256 KB #[derive(Derivative)] @@ -164,30 +165,44 @@ impl StaticDeviceAllocator { self.block_size_in_bytes } - pub fn init(num_blocks: usize, block_size: usize) -> CudaResult { - assert_ne!(num_blocks, 0); + pub fn init( + max_num_blocks: usize, + min_num_blocks: usize, + block_size: usize, + ) -> CudaResult { + assert_ne!(min_num_blocks, 0); + assert!(max_num_blocks >= min_num_blocks); assert!(block_size.is_power_of_two()); - let memory_size = num_blocks * block_size; - let memory_size_in_bytes = memory_size * std::mem::size_of::(); - let block_size_in_bytes = block_size * std::mem::size_of::(); - - let memory = DeviceAllocation::alloc(memory_size_in_bytes).expect(&format!( - "failed to allocate {} bytes", - memory_size_in_bytes - )); + let mut num_blocks = max_num_blocks; + while num_blocks >= min_num_blocks { + let memory_size = num_blocks * block_size; + let memory_size_in_bytes = memory_size * std::mem::size_of::(); + let block_size_in_bytes = block_size * std::mem::size_of::(); + + let result = DeviceAllocation::alloc(memory_size_in_bytes); + let memory = match result { + Ok(memory) => memory, + Err(CudaError::ErrorMemoryAllocation) => { + num_blocks -= 1; + continue; + } + Err(e) => return Err(e), + }; - println!("allocated {memory_size_in_bytes} bytes on device"); + println!("allocated {memory_size_in_bytes} bytes on device"); - let alloc = StaticDeviceAllocator { - memory: Arc::new(memory), - memory_size: memory_size_in_bytes, - block_size_in_bytes, - bitmap: Arc::new(Mutex::new(Self::init_bitmap(num_blocks))), - #[cfg(feature = "allocator_stats")] - stats: Default::default(), - }; + let alloc = StaticDeviceAllocator { + memory: Arc::new(memory), + memory_size: memory_size_in_bytes, + block_size_in_bytes, + bitmap: Arc::new(Mutex::new(Self::init_bitmap(num_blocks))), + #[cfg(feature = "allocator_stats")] + stats: Default::default(), + }; - Ok(alloc) + return Ok(alloc); + } + Err(CudaError::ErrorMemoryAllocation) } pub fn init_all(block_size: usize) -> CudaResult { @@ -196,8 +211,8 @@ impl StaticDeviceAllocator { assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK); let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK; assert!(free_memory_size_in_bytes >= block_size); - let num_blocks = free_memory_size_in_bytes / block_size_in_bytes; - Self::init(num_blocks, block_size) + let max_num_blocks = free_memory_size_in_bytes / block_size_in_bytes; + Self::init(max_num_blocks, MIN_NUM_BLOCKS, block_size) } fn find_free_block(&self) -> Option { @@ -370,7 +385,11 @@ impl SmallStaticDeviceAllocator { pub fn init() -> CudaResult { // cuda requires alignment to be multiple of 32 goldilocks elems const BLOCK_SIZE: usize = 32; - let inner = StaticDeviceAllocator::init(SMALL_ALLOCATOR_BLOCKS_COUNT, BLOCK_SIZE)?; + let inner = StaticDeviceAllocator::init( + SMALL_ALLOCATOR_BLOCKS_COUNT, + SMALL_ALLOCATOR_BLOCKS_COUNT, + BLOCK_SIZE, + )?; Ok(Self { inner }) }