Skip to content
This repository has been archived by the owner on Aug 16, 2024. It is now read-only.

Commit

Permalink
add memory allocation limits and refactor
Browse files Browse the repository at this point in the history
  • Loading branch information
robik75 committed Nov 28, 2023
1 parent 64a3213 commit 79d29d8
Show file tree
Hide file tree
Showing 5 changed files with 39 additions and 43 deletions.
18 changes: 8 additions & 10 deletions src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ impl ProverContext {
let device_alloc = StaticDeviceAllocator::init_all(block_size)?;

let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
Expand All @@ -42,7 +42,7 @@ impl ProverContext {
}

#[allow(dead_code)]
pub(crate) fn create_14gb_dev(block_size: usize) -> CudaResult<Self> {
pub(crate) fn create_limited_dev(block_size: usize) -> CudaResult<Self> {
unsafe {
assert!(_CUDA_CONTEXT.is_none());
assert!(_DEVICE_ALLOCATOR.is_none());
Expand All @@ -58,10 +58,9 @@ impl ProverContext {

// grab small slice then consume everything
let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?;
println!("allocated 14gb on device");
let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
Expand All @@ -77,7 +76,7 @@ impl ProverContext {
Ok(Self {})
}

pub fn create_14gb() -> CudaResult<Self> {
pub fn create_limited() -> CudaResult<Self> {
unsafe {
assert!(_CUDA_CONTEXT.is_none());
assert!(_DEVICE_ALLOCATOR.is_none());
Expand All @@ -94,10 +93,9 @@ impl ProverContext {

// grab small slice then consume everything
let small_device_alloc = SmallStaticDeviceAllocator::init()?;
let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?;
println!("allocated 14gb on device");
let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
Expand All @@ -124,7 +122,7 @@ impl ProverContext {
let device_alloc = StaticDeviceAllocator::init_all(block_size)?;

let small_host_alloc = SmallStaticHostAllocator::init()?;
let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;

unsafe {
_CUDA_CONTEXT = Some(cuda_ctx);
Expand Down
6 changes: 3 additions & 3 deletions src/oracle.rs
Original file line number Diff line number Diff line change
Expand Up @@ -410,7 +410,7 @@ mod tests {
#[test]
#[ignore]
fn test_batch_query_for_leaf_sources() -> CudaResult<()> {
let _ctx = ProverContext::create_14gb()?;
let _ctx = ProverContext::create_limited()?;
let domain_size = 1 << 16;
let lde_degree = 2;
let num_cols = 2;
Expand Down Expand Up @@ -536,7 +536,7 @@ mod tests {
#[test]
#[ignore]
fn test_batch_query_for_fri_layers() -> CudaResult<()> {
let _ctx = ProverContext::create_14gb()?;
let _ctx = ProverContext::create_limited()?;
let domain_size = 1 << 16;
let lde_degree = 2;
let num_cols = 2;
Expand Down Expand Up @@ -692,7 +692,7 @@ mod tests {
#[test]
#[ignore]
fn test_batch_query_for_merkle_paths() -> CudaResult<()> {
let _ctx = ProverContext::create_14gb()?;
let _ctx = ProverContext::create_limited()?;
let domain_size = 1 << 4;
let lde_degree = 2;
let num_cols = 2;
Expand Down
40 changes: 21 additions & 19 deletions src/static_allocator/device.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
use cudart::memory::DeviceAllocation;
use cudart::memory::{memory_get_info, DeviceAllocation};

use super::*;
use derivative::*;
Expand All @@ -11,6 +11,13 @@ use std::sync::Arc;
#[cfg(feature = "allocator_stats")]
use std::sync::atomic::AtomicUsize;

pub const FREE_MEMORY_SLACK: usize = 1 << 23; // 8 MB
#[cfg(feature = "recompute")]
pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1340 + 32;
#[cfg(not(feature = "recompute"))]
pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1695 + 64;
pub const SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 27 + 16;

#[derive(Derivative)]
#[derivative(Clone, Debug)]
pub struct StaticDeviceAllocator {
Expand Down Expand Up @@ -62,7 +69,7 @@ impl StaticDeviceAllocator {
}

pub fn init(num_blocks: usize, block_size: usize) -> CudaResult<Self> {
assert!(num_blocks > 32);
assert_ne!(num_blocks, 0);
assert!(block_size.is_power_of_two());
let memory_size = num_blocks * block_size;
let memory_size_in_bytes = memory_size * std::mem::size_of::<F>();
Expand All @@ -73,6 +80,8 @@ impl StaticDeviceAllocator {
memory_size_in_bytes
));

println!("allocated {memory_size_in_bytes} bytes on device");

let alloc = StaticDeviceAllocator {
memory: Arc::new(memory),
memory_size: memory_size_in_bytes,
Expand All @@ -86,34 +95,28 @@ impl StaticDeviceAllocator {
}

pub fn init_all(block_size: usize) -> CudaResult<Self> {
use cudart::memory::memory_get_info;

let block_size_in_bytes = block_size * std::mem::size_of::<F>();
let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info");
let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb
let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes;
assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK);
let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK;
assert!(free_memory_size_in_bytes >= block_size);
let num_blocks = free_memory_size_in_bytes / block_size_in_bytes;
Self::init(num_blocks, block_size)
}

pub fn init_14gb(block_size: usize) -> CudaResult<Self> {
use cudart::memory::memory_get_info;

let block_size_in_bytes = block_size * std::mem::size_of::<F>();
pub fn init_limited(block_size: usize) -> CudaResult<Self> {
let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info");
let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb
let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes;
assert!(free_memory_size_in_bytes >= block_size);
let requested_memory_size_in_bytes = 14usize * 0x40000000; // 16gb
assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK);
let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK;
let block_size_in_bytes = block_size * std::mem::size_of::<F>();
let requested_memory_size_in_bytes = ALLOCATOR_LIMITED_BLOCKS_COUNT * block_size_in_bytes;
assert!(
requested_memory_size_in_bytes <= free_memory_size_in_bytes,
"requested memory {}bytes, free memory {} bytes",
requested_memory_size_in_bytes,
free_memory_size_in_bytes
);
let num_blocks = requested_memory_size_in_bytes / block_size_in_bytes;
Self::init(num_blocks, block_size)
Self::init(ALLOCATOR_LIMITED_BLOCKS_COUNT, block_size)
}

fn find_free_block(&self) -> Option<usize> {
Expand Down Expand Up @@ -269,9 +272,8 @@ pub struct SmallStaticDeviceAllocator {
impl SmallStaticDeviceAllocator {
pub fn init() -> CudaResult<Self> {
// cuda requires alignment to be multiple of 32 goldilocks elems
let block_size = 32;
let num_blocks = 1 << 10; // 256 KB
let inner = StaticDeviceAllocator::init(num_blocks, block_size)?;
const BLOCK_SIZE: usize = 32;
let inner = StaticDeviceAllocator::init(SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT, BLOCK_SIZE)?;
Ok(Self { inner })
}

Expand Down
16 changes: 6 additions & 10 deletions src/static_allocator/host.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ pub struct StaticHostAllocator {

impl Default for StaticHostAllocator {
fn default() -> Self {
let _domain_size = 1 << 20;
let _domain_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
Self::init(0, 0).unwrap() // TODO
}
}
Expand Down Expand Up @@ -48,7 +48,7 @@ impl StaticHostAllocator {
}

pub fn init(num_blocks: usize, block_size: usize) -> CudaResult<Self> {
assert!(num_blocks > 32);
assert_ne!(num_blocks, 0);
assert!(block_size.is_power_of_two());
let memory_size = num_blocks * block_size;
let memory_size_in_bytes = memory_size * std::mem::size_of::<F>();
Expand All @@ -59,11 +59,7 @@ impl StaticHostAllocator {
&format!("failed to allocate {} bytes", memory_size_in_bytes),
);

println!(
"allocated {} bytes({}gb) on device on host",
memory_size_in_bytes,
memory_size_in_bytes / 0x40000000
);
println!("allocated {memory_size_in_bytes} bytes on host");

let alloc = StaticHostAllocator {
memory: Arc::new(memory),
Expand Down Expand Up @@ -205,10 +201,10 @@ pub struct SmallStaticHostAllocator {

impl SmallStaticHostAllocator {
pub fn init() -> CudaResult<Self> {
const NUM_BLOCKS: usize = 1 << 8;
// cuda requires alignment to be multiple of 32 goldilocks elems
let block_size = 32;
let num_blocks = 1 << 20; // <1gb
let inner = StaticHostAllocator::init(num_blocks, block_size)?;
const BLOCK_SIZE: usize = 32;
let inner = StaticHostAllocator::init(NUM_BLOCKS, BLOCK_SIZE)?;
Ok(Self { inner })
}

Expand Down
2 changes: 1 addition & 1 deletion src/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1035,7 +1035,7 @@ mod zksync {
#[ignore]
fn compare_proofs_for_single_zksync_circuit_in_single_shot() {
let circuit = get_circuit_from_env();
let _ctx = ProverContext::create_14gb().expect("gpu prover context");
let _ctx = ProverContext::create_limited().expect("gpu prover context");

println!(
"{} {}",
Expand Down

0 comments on commit 79d29d8

Please sign in to comment.