From 79d29d86ca552906e055f350b8cde2539bb34ccd Mon Sep 17 00:00:00 2001
From: Robert Remen <robik75@gmail.com>
Date: Tue, 28 Nov 2023 18:44:09 +0000
Subject: [PATCH] add block-count memory limits, fix host allocator init arg order

---
 src/context.rs                 | 18 +++++++--------
 src/oracle.rs                  |  6 ++---
 src/static_allocator/device.rs | 40 ++++++++++++++++++----------------
 src/static_allocator/host.rs   | 16 +++++---------
 src/test.rs                    |  2 +-
 5 files changed, 39 insertions(+), 43 deletions(-)
diff --git a/src/context.rs b/src/context.rs
index 37b5e87..02af562 100644
--- a/src/context.rs
+++ b/src/context.rs
@@ -25,7 +25,7 @@ impl ProverContext {
         let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
 
         let small_host_alloc = SmallStaticHostAllocator::init()?;
-        let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
+        let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
 
         unsafe {
             _CUDA_CONTEXT = Some(cuda_ctx);
@@ -42,7 +42,7 @@ impl ProverContext {
     }
 
     #[allow(dead_code)]
-    pub(crate) fn create_14gb_dev(block_size: usize) -> CudaResult<Self> {
+    pub(crate) fn create_limited_dev(block_size: usize) -> CudaResult<Self> {
         unsafe {
             assert!(_CUDA_CONTEXT.is_none());
             assert!(_DEVICE_ALLOCATOR.is_none());
@@ -58,10 +58,9 @@ impl ProverContext {
 
         // grab small slice then consume everything
         let small_device_alloc = SmallStaticDeviceAllocator::init()?;
-        let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?;
-        println!("allocated 14gb on device");
+        let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
         let small_host_alloc = SmallStaticHostAllocator::init()?;
-        let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
+        let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
 
         unsafe {
             _CUDA_CONTEXT = Some(cuda_ctx);
@@ -77,7 +76,7 @@ impl ProverContext {
         Ok(Self {})
     }
 
-    pub fn create_14gb() -> CudaResult<Self> {
+    pub fn create_limited() -> CudaResult<Self> {
         unsafe {
             assert!(_CUDA_CONTEXT.is_none());
             assert!(_DEVICE_ALLOCATOR.is_none());
@@ -94,10 +93,9 @@ impl ProverContext {
 
         // grab small slice then consume everything
         let small_device_alloc = SmallStaticDeviceAllocator::init()?;
-        let device_alloc = StaticDeviceAllocator::init_14gb(block_size)?;
-        println!("allocated 14gb on device");
+        let device_alloc = StaticDeviceAllocator::init_limited(block_size)?;
         let small_host_alloc = SmallStaticHostAllocator::init()?;
-        let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
+        let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
 
         unsafe {
             _CUDA_CONTEXT = Some(cuda_ctx);
@@ -124,7 +122,7 @@ impl ProverContext {
         let device_alloc = StaticDeviceAllocator::init_all(block_size)?;
 
         let small_host_alloc = SmallStaticHostAllocator::init()?;
-        let host_alloc = StaticHostAllocator::init(block_size, 1 << 8)?;
+        let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?;
 
         unsafe {
             _CUDA_CONTEXT = Some(cuda_ctx);
diff --git a/src/oracle.rs b/src/oracle.rs
index 44a7d80..55a4d71 100644
--- a/src/oracle.rs
+++ b/src/oracle.rs
@@ -410,7 +410,7 @@ mod tests {
     #[test]
     #[ignore]
     fn test_batch_query_for_leaf_sources() -> CudaResult<()> {
-        let _ctx = ProverContext::create_14gb()?;
+        let _ctx = ProverContext::create_limited()?;
         let domain_size = 1 << 16;
         let lde_degree = 2;
         let num_cols = 2;
@@ -536,7 +536,7 @@ mod tests {
     #[test]
     #[ignore]
     fn test_batch_query_for_fri_layers() -> CudaResult<()> {
-        let _ctx = ProverContext::create_14gb()?;
+        let _ctx = ProverContext::create_limited()?;
         let domain_size = 1 << 16;
         let lde_degree = 2;
         let num_cols = 2;
@@ -692,7 +692,7 @@ mod tests {
     #[test]
     #[ignore]
     fn test_batch_query_for_merkle_paths() -> CudaResult<()> {
-        let _ctx = ProverContext::create_14gb()?;
+        let _ctx = ProverContext::create_limited()?;
         let domain_size = 1 << 4;
         let lde_degree = 2;
         let num_cols = 2;
diff --git a/src/static_allocator/device.rs b/src/static_allocator/device.rs
index a51f3e4..6add434 100644
--- a/src/static_allocator/device.rs
+++ b/src/static_allocator/device.rs
@@ -1,4 +1,4 @@
-use cudart::memory::DeviceAllocation;
+use cudart::memory::{memory_get_info, DeviceAllocation};
 
 use super::*;
 use derivative::*;
@@ -11,6 +11,13 @@ use std::sync::Arc;
 #[cfg(feature = "allocator_stats")]
 use std::sync::atomic::AtomicUsize;
 
+pub const FREE_MEMORY_SLACK: usize = 1 << 23; // 8 MB
+#[cfg(feature = "recompute")]
+pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1340 + 32;
+#[cfg(not(feature = "recompute"))]
+pub const ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 1695 + 64;
+pub const SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT: usize = 27 + 16;
+
 #[derive(Derivative)]
 #[derivative(Clone, Debug)]
 pub struct StaticDeviceAllocator {
@@ -62,7 +69,7 @@ impl StaticDeviceAllocator {
     }
 
     pub fn init(num_blocks: usize, block_size: usize) -> CudaResult<Self> {
-        assert!(num_blocks > 32);
+        assert_ne!(num_blocks, 0);
         assert!(block_size.is_power_of_two());
         let memory_size = num_blocks * block_size;
         let memory_size_in_bytes = memory_size * std::mem::size_of::<F>();
@@ -73,6 +80,8 @@ impl StaticDeviceAllocator {
             memory_size_in_bytes
         ));
 
+        println!("allocated {memory_size_in_bytes} bytes on device");
+
         let alloc = StaticDeviceAllocator {
             memory: Arc::new(memory),
             memory_size: memory_size_in_bytes,
@@ -86,34 +95,28 @@ impl StaticDeviceAllocator {
     }
 
     pub fn init_all(block_size: usize) -> CudaResult<Self> {
-        use cudart::memory::memory_get_info;
-
         let block_size_in_bytes = block_size * std::mem::size_of::<F>();
         let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info");
-        let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb
-        let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes;
+        assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK);
+        let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK;
-        assert!(free_memory_size_in_bytes >= block_size);
+        assert!(free_memory_size_in_bytes >= block_size_in_bytes); // room for at least one block
         let num_blocks = free_memory_size_in_bytes / block_size_in_bytes;
         Self::init(num_blocks, block_size)
     }
 
-    pub fn init_14gb(block_size: usize) -> CudaResult<Self> {
-        use cudart::memory::memory_get_info;
-
-        let block_size_in_bytes = block_size * std::mem::size_of::<F>();
+    pub fn init_limited(block_size: usize) -> CudaResult<Self> {
         let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info");
-        let precomputed_data_in_bytes = 256 * 1024 * 1024; // precomputed data is <=256mb
-        let free_memory_size_in_bytes = memory_size_in_bytes - precomputed_data_in_bytes;
-        assert!(free_memory_size_in_bytes >= block_size);
-        let requested_memory_size_in_bytes = 14usize * 0x40000000; // 16gb
+        assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK);
+        let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK;
+        let block_size_in_bytes = block_size * std::mem::size_of::<F>();
+        let requested_memory_size_in_bytes = ALLOCATOR_LIMITED_BLOCKS_COUNT * block_size_in_bytes;
         assert!(
             requested_memory_size_in_bytes <= free_memory_size_in_bytes,
             "requested memory {}bytes, free memory {} bytes",
             requested_memory_size_in_bytes,
             free_memory_size_in_bytes
         );
-        let num_blocks = requested_memory_size_in_bytes / block_size_in_bytes;
-        Self::init(num_blocks, block_size)
+        Self::init(ALLOCATOR_LIMITED_BLOCKS_COUNT, block_size)
     }
 
     fn find_free_block(&self) -> Option<usize> {
@@ -269,9 +272,8 @@ pub struct SmallStaticDeviceAllocator {
 impl SmallStaticDeviceAllocator {
     pub fn init() -> CudaResult<Self> {
         // cuda requires alignment to be  multiple of 32 goldilocks elems
-        let block_size = 32;
-        let num_blocks = 1 << 10; // 256 KB
-        let inner = StaticDeviceAllocator::init(num_blocks, block_size)?;
+        const BLOCK_SIZE: usize = 32;
+        let inner = StaticDeviceAllocator::init(SMALL_ALLOCATOR_LIMITED_BLOCKS_COUNT, BLOCK_SIZE)?;
         Ok(Self { inner })
     }
 
diff --git a/src/static_allocator/host.rs b/src/static_allocator/host.rs
index f0f43d2..0d81561 100644
--- a/src/static_allocator/host.rs
+++ b/src/static_allocator/host.rs
@@ -19,7 +19,7 @@ pub struct StaticHostAllocator {
 
 impl Default for StaticHostAllocator {
     fn default() -> Self {
-        let _domain_size = 1 << 20;
+        let _domain_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH;
         Self::init(0, 0).unwrap() // TODO
     }
 }
@@ -48,7 +48,7 @@ impl StaticHostAllocator {
     }
 
     pub fn init(num_blocks: usize, block_size: usize) -> CudaResult<Self> {
-        assert!(num_blocks > 32);
+        assert_ne!(num_blocks, 0);
         assert!(block_size.is_power_of_two());
         let memory_size = num_blocks * block_size;
         let memory_size_in_bytes = memory_size * std::mem::size_of::<F>();
@@ -59,11 +59,7 @@ impl StaticHostAllocator {
                 &format!("failed to allocate {} bytes", memory_size_in_bytes),
             );
 
-        println!(
-            "allocated {} bytes({}gb) on device on host",
-            memory_size_in_bytes,
-            memory_size_in_bytes / 0x40000000
-        );
+        println!("allocated {memory_size_in_bytes} bytes on host");
 
         let alloc = StaticHostAllocator {
             memory: Arc::new(memory),
@@ -205,10 +201,10 @@ pub struct SmallStaticHostAllocator {
 
 impl SmallStaticHostAllocator {
     pub fn init() -> CudaResult<Self> {
+        const NUM_BLOCKS: usize = 1 << 8;
         // cuda requires alignment to be  multiple of 32 goldilocks elems
-        let block_size = 32;
-        let num_blocks = 1 << 20; // <1gb
-        let inner = StaticHostAllocator::init(num_blocks, block_size)?;
+        const BLOCK_SIZE: usize = 32;
+        let inner = StaticHostAllocator::init(NUM_BLOCKS, BLOCK_SIZE)?;
         Ok(Self { inner })
     }
 
diff --git a/src/test.rs b/src/test.rs
index d8cf1ab..5553c11 100644
--- a/src/test.rs
+++ b/src/test.rs
@@ -1035,7 +1035,7 @@ mod zksync {
     #[ignore]
     fn compare_proofs_for_single_zksync_circuit_in_single_shot() {
         let circuit = get_circuit_from_env();
-        let _ctx = ProverContext::create_14gb().expect("gpu prover context");
+        let _ctx = ProverContext::create_limited().expect("gpu prover context");
 
         println!(
             "{} {}",