From 0a2ad8ca729a7762a83aad18f9f5ec21be022dee Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Wed, 24 Jul 2024 17:52:06 +0200 Subject: [PATCH] chore(gpu): remove remaining par_iter over gpu_indexes Rename some variables to try and make the code clearer --- tfhe/src/core_crypto/gpu/mod.rs | 5 +- tfhe/src/core_crypto/gpu/slice.rs | 78 +++++------ tfhe/src/core_crypto/gpu/vec.rs | 121 +++++++++--------- .../gpu/server_key/radix/bitwise_op.rs | 13 +- tfhe/src/integer/gpu/server_key/radix/mod.rs | 44 ++----- .../gpu/server_key/radix/scalar_bitwise_op.rs | 3 +- .../gpu/server_key/radix/scalar_comparison.rs | 6 +- .../gpu/server_key/radix/scalar_mul.rs | 6 +- 8 files changed, 127 insertions(+), 149 deletions(-) diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index db040514a7..709da6fca4 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -11,7 +11,6 @@ use crate::core_crypto::prelude::{ }; pub use algorithms::*; pub use entities::*; -use rayon::prelude::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; @@ -284,7 +283,7 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( polynomial_size: PolynomialSize, ) { let size = std::mem::size_of_val(src); - streams.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in streams.gpu_indexes.iter() { assert_eq!(dest.len() * std::mem::size_of::(), size); cuda_convert_lwe_programmable_bootstrap_key_64( streams.ptr[gpu_index as usize], @@ -296,7 +295,7 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( l_gadget.0 as u32, polynomial_size.0 as u32, ); - }); + } } /// Convert multi-bit programmable bootstrap key diff --git a/tfhe/src/core_crypto/gpu/slice.rs b/tfhe/src/core_crypto/gpu/slice.rs index 523d71bd54..d5788b07a7 100644 --- a/tfhe/src/core_crypto/gpu/slice.rs +++ b/tfhe/src/core_crypto/gpu/slice.rs @@ -93,20 +93,24 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required. - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_gpu_async( + &mut self, + src: &Self, + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { - assert_eq!(self.len(gpu_index), src.len(gpu_index)); - let size = src.len(gpu_index) * std::mem::size_of::(); + assert_eq!(self.len(stream_index), src.len(stream_index)); + let size = src.len(stream_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), - src.as_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -118,55 +122,55 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required. 
- pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, stream_index: u32) where T: Numeric, { - assert_eq!(self.len(gpu_index), dest.len()); - let size = self.len(gpu_index) * std::mem::size_of::(); + assert_eq!(self.len(stream_index), dest.len()); + let size = self.len(stream_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast::(), - self.as_c_ptr(gpu_index), + self.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } /// Returns the number of elements in the vector, also referred to as its ‘length’. - pub fn len(&self, gpu_index: u32) -> usize { - self.lengths[gpu_index as usize] + pub fn len(&self, index: u32) -> usize { + self.lengths[index as usize] } /// Returns true if the ptr is empty - pub fn is_empty(&self, gpu_index: u32) -> bool { - self.lengths[gpu_index as usize] == 0 + pub fn is_empty(&self, index: u32) -> bool { + self.lengths[index as usize] == 0 } - pub(crate) fn get_mut(&mut self, range: R, gpu_index: u32) -> Option> + pub(crate) fn get_mut(&mut self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, { - let (start, end) = range_bounds_to_start_end(self.len(gpu_index), range).into_inner(); + let (start, end) = range_bounds_to_start_end(self.len(index), range).into_inner(); // Check the range is compatible with the vec - if end <= start || end > self.lengths[gpu_index as usize] - 1 { + if end <= start || end > self.lengths[index as usize] - 1 { None } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptrs[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptrs[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice Some(unsafe { - CudaSliceMut::new(shifted_ptr, new_len, self.gpu_indexes[gpu_index as usize]) + CudaSliceMut::new(shifted_ptr, new_len, self.gpu_indexes[index as usize]) }) } } @@ -174,49 +178,47 @@ where pub(crate) fn split_at_mut( &mut self, mid: usize, - gpu_index: u32, + index: u32, ) -> (Option>, Option>) where T: Numeric, { // Check the index is compatible with the vec - if mid > self.lengths[gpu_index as usize] - 1 { + if mid > self.lengths[index as usize] - 1 { (None, None) } else if mid == 0 { ( None, Some(unsafe { CudaSliceMut::new( - self.ptrs[gpu_index as usize], - self.lengths[gpu_index as usize], - gpu_index, + self.ptrs[index as usize], + self.lengths[index as usize], + index, ) }), ) - } else if mid == self.lengths[gpu_index as usize] - 1 { + } else if mid == self.lengths[index as usize] - 1 { ( Some(unsafe { CudaSliceMut::new( - self.ptrs[gpu_index as usize], - self.lengths[gpu_index as usize], - gpu_index, + self.ptrs[index as usize], + self.lengths[index as usize], + index, ) }), None, ) } else { let new_len_1 = mid; - let new_len_2 = self.lengths[gpu_index as usize] - mid; + let new_len_2 = self.lengths[index as usize] - mid; // Shift ptr let shifted_ptr: *mut c_void = - self.ptrs[gpu_index as usize].wrapping_byte_add(mid * std::mem::size_of::()); + self.ptrs[index as usize].wrapping_byte_add(mid * std::mem::size_of::()); // Create the slice ( - Some(unsafe { - CudaSliceMut::new(self.ptrs[gpu_index as usize], new_len_1, gpu_index) 
- }), - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, gpu_index) }), + Some(unsafe { CudaSliceMut::new(self.ptrs[index as usize], new_len_1, index) }), + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, index) }), ) } } diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs index e86195a9c3..b18b791b3d 100644 --- a/tfhe/src/core_crypto/gpu/vec.rs +++ b/tfhe/src/core_crypto/gpu/vec.rs @@ -1,7 +1,6 @@ use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::{synchronize_device, CudaStreams}; use crate::core_crypto::prelude::Numeric; -use rayon::prelude::*; use std::collections::Bound::{Excluded, Included, Unbounded}; use std::ffi::c_void; use std::marker::PhantomData; @@ -33,33 +32,33 @@ pub struct CudaVec { impl CudaVec { /// This creates a `CudaVec` that holds memory of `len` elements /// on the GPU with index `gpu_index` - pub fn new(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { - let vec = unsafe { Self::new_async(len, streams, gpu_index) }; + pub fn new(len: usize, streams: &CudaStreams, stream_index: u32) -> Self { + let vec = unsafe { Self::new_async(len, streams, stream_index) }; streams.synchronize(); vec } /// # Safety /// /// - `streams` __must__ be synchronized to guarantee computation has finished - pub unsafe fn new_async(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { + pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self { let size = len as u64 * std::mem::size_of::() as u64; let ptr = cuda_malloc_async( size, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); cuda_memset_async( ptr, 0u64, size, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); Self { ptr: vec![ptr; 1], len, - gpu_indexes: vec![streams.gpu_indexes[gpu_index as usize]; 1], + gpu_indexes: vec![streams.gpu_indexes[stream_index as usize]; 1], _phantom: PhantomData, } } @@ -102,11 +101,11 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams, gpu_index: u32) -> Self { - let mut res = Self::new(src.len(), streams, gpu_index); + pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams, stream_index: u32) -> Self { + let mut res = Self::new(src.len(), streams, stream_index); // We have to check that h_data is not empty, because cuda_memset with size 0 is invalid if !src.is_empty() { - res.copy_from_cpu_async(src, streams, gpu_index); + res.copy_from_cpu_async(src, streams, stream_index); } res } @@ -128,7 +127,7 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams, gpu_index: u32) + pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams, stream_index: u32) where T: Into, { @@ -136,11 +135,11 @@ impl CudaVec { // We check that self is not empty to avoid invalid pointers if size > 0 { cuda_memset_async( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), value.into(), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + 
streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -174,8 +173,12 @@ impl CudaVec { /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required - pub unsafe fn copy_from_cpu_async(&mut self, src: &[T], streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_cpu_async( + &mut self, + src: &[T], + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { assert!(self.len() >= src.len()); @@ -185,11 +188,11 @@ impl CudaVec { // invalid pointer being passed to copy_to_gpu_async if size > 0 { cuda_memcpy_async_to_gpu( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), src.as_ptr().cast(), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -204,7 +207,7 @@ impl CudaVec { where T: Numeric, { - self.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in streams.gpu_indexes.iter() { assert!(self.len() >= src.len()); let size = std::mem::size_of_val(src); @@ -219,7 +222,7 @@ impl CudaVec { streams.gpu_indexes[gpu_index as usize], ); } - }); + } } /// Copies data between two `CudaVec` @@ -228,8 +231,12 @@ impl CudaVec { /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_gpu_async( + &mut self, + src: &Self, + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { assert!(self.len() >= src.len()); @@ -237,11 +244,11 @@ impl CudaVec { // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), - src.as_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -257,7 +264,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, - gpu_index: u32, + stream_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -271,15 +278,15 @@ impl CudaVec { assert!(end - start < self.len()); let src_ptr = src - .as_c_ptr(gpu_index) + .as_c_ptr(stream_index) .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), src_ptr, size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } @@ -294,7 +301,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, - gpu_index: u32, + stream_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -308,15 +315,15 @@ impl CudaVec { assert!(end - start < src.len()); let dest_ptr = self - .as_mut_c_ptr(gpu_index) + .as_mut_c_ptr(stream_index) .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( dest_ptr, - src.as_c_ptr(gpu_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } @@ -325,7 +332,7 @@ impl CudaVec { /// # Safety 
/// /// - [CudaStreams::synchronize] __must__ be called as soon as synchronization is required - pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, stream_index: u32) where T: Numeric, { @@ -337,28 +344,28 @@ impl CudaVec { if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast(), - self.as_c_ptr(gpu_index), + self.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void { - self.ptr[gpu_index as usize] + pub(crate) fn as_mut_c_ptr(&mut self, index: u32) -> *mut c_void { + self.ptr[index as usize] } - pub(crate) fn get_mut_c_ptr(&self, gpu_index: u32) -> *mut c_void { - self.ptr[gpu_index as usize] + pub(crate) fn get_mut_c_ptr(&self, index: u32) -> *mut c_void { + self.ptr[index as usize] } - pub(crate) fn as_c_ptr(&self, gpu_index: u32) -> *const c_void { - self.ptr[gpu_index as usize].cast_const() + pub(crate) fn as_c_ptr(&self, index: u32) -> *const c_void { + self.ptr[index as usize].cast_const() } - pub(crate) fn as_slice(&self, range: R, gpu_index: u32) -> Option> + pub(crate) fn as_slice(&self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -371,19 +378,19 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptr[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptr[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSlice::new(shifted_ptr, new_len, gpu_index) }) + Some(unsafe { CudaSlice::new(shifted_ptr, new_len, index) }) } } // clippy complains as we only manipulate pointers, but we want to keep rust semantics #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn as_mut_slice(&mut self, range: R, gpu_index: u32) -> Option> + pub(crate) fn as_mut_slice(&mut self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -396,13 +403,13 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptr[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptr[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, gpu_index) }) + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, index) }) } } @@ -439,11 +446,11 @@ unsafe impl Sync for CudaVec where T: Sync + Numeric {} impl Drop for CudaVec { /// Free memory for pointer `ptr` synchronously fn drop(&mut self) { - self.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in self.gpu_indexes.iter() { // Synchronizes the device to be sure no stream is still using this pointer synchronize_device(gpu_index); unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) }; - }); + } } } diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 198b94df49..5e0d010361 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -81,16 +81,9 @@ impl CudaServerKey { let shift_plaintext = u64::from(scalar) * delta; let 
scalar_vector = vec![shift_plaintext; ct_blocks]; - let mut d_decomposed_scalar = CudaVec::::new_async( - ct.as_ref().d_blocks.lwe_ciphertext_count().0, - streams, - streams.gpu_indexes[0], - ); - d_decomposed_scalar.copy_from_cpu_async( - scalar_vector.as_slice(), - streams, - streams.gpu_indexes[0], - ); + let mut d_decomposed_scalar = + CudaVec::::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0); + d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0); cuda_lwe_ciphertext_plaintext_add_assign( &mut ct.as_mut().d_blocks, diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 0fabd96559..6a64ef19e6 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -409,11 +409,10 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut extended_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut extended_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); + extended_ct_vec.memset_async(0u64, streams, 0); extended_ct_vec.copy_self_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, @@ -480,16 +479,11 @@ impl CudaServerKey { let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); - let mut extended_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut extended_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); - extended_ct_vec.copy_from_gpu_async( - &ct.as_ref().d_blocks.0.d_vec, - streams, - streams.gpu_indexes[0], - ); + extended_ct_vec.memset_async(0u64, streams, 0); + extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, streams, 0); } streams.synchronize(); let extended_ct_list = CudaLweCiphertextList::from_cuda_vec( @@ -551,9 +545,8 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( shift.., @@ -619,9 +612,8 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = new_num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( 0..shift, @@ -695,17 +687,9 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0; // Allocate the necessary amount of memory - let mut output_radix = CudaVec::new( - new_num_ct_blocks * lwe_size, - streams, - streams.gpu_indexes[0], - ); + let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, streams, 0); unsafe { - output_radix.copy_from_gpu_async( - &ct.as_ref().d_blocks.0.d_vec, - streams, - 
streams.gpu_indexes[0], - ); + output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, streams, 0); // Get the last ct block let last_block = ct .as_ref() @@ -779,7 +763,7 @@ impl CudaServerKey { let mut output_block = new_blocks .get_mut(lwe_size * i..lwe_size * (i + 1), streams.gpu_indexes[0]) .unwrap(); - output_block.copy_from_gpu_async(&padding_block, streams, streams.gpu_indexes[0]); + output_block.copy_from_gpu_async(&padding_block, streams, 0); } } streams.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 9afa8ba40c..337abdd928 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -31,8 +31,7 @@ impl CudaServerKey { .map(|x| x as u64) .collect::>(); - let clear_blocks = - CudaVec::from_cpu_async(&h_clear_blocks, streams, streams.gpu_indexes[0]); + let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, streams, 0); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index 5b6639082f..345143d05a 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -153,8 +153,7 @@ impl CudaServerKey { // as we will handle them separately. scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0); - let d_scalar_blocks: CudaVec = - CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); + let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, streams, 0); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); @@ -328,8 +327,7 @@ impl CudaServerKey { .iter_as::() .collect::>(); - let d_scalar_blocks: CudaVec = - CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); + let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, streams, 0); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs index 8d23090fd9..8b80b35b91 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs @@ -74,11 +74,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if scalar == Scalar::ZERO { - ct.as_mut() - .d_blocks - .0 - .d_vec - .memset_async(0, streams, streams.gpu_indexes[0]); + ct.as_mut().d_blocks.0.d_vec.memset_async(0, streams, 0); return; }
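
Note on the pattern applied throughout this patch: the remaining `par_iter` calls over `gpu_indexes` are replaced with plain sequential loops, since each iteration only enqueues asynchronous work on its own CUDA stream and gains nothing from host-side parallelism via rayon. The rename from `gpu_index` to `stream_index` (or `index`) in the `CudaVec`/`CudaSlice` APIs likewise reflects that the `u32` argument selects a position within `streams` (hence the literal `0` at call sites), which is then used to look up both `streams.ptr[...]` and `streams.gpu_indexes[...]`, rather than being a physical GPU id itself. A minimal sketch of the before/after loop shape, where `Streams` and `enqueue_on_gpu` are hypothetical stand-ins for the real `CudaStreams` struct and CUDA FFI calls:

    // Illustrative sketch only -- not part of the patch.
    struct Streams {
        gpu_indexes: Vec<u32>,
    }

    // Placeholder for enqueueing asynchronous work on the stream bound to `gpu_index`.
    fn enqueue_on_gpu(gpu_index: u32) {
        println!("enqueue work on GPU {gpu_index}");
    }

    fn dispatch(streams: &Streams) {
        // Before: streams.gpu_indexes.par_iter().for_each(|&gpu_index| enqueue_on_gpu(gpu_index));
        // After: a plain sequential loop; each call only submits async work to its
        // own stream, so the host-side loop is not the bottleneck.
        for &gpu_index in streams.gpu_indexes.iter() {
            enqueue_on_gpu(gpu_index);
        }
    }

    fn main() {
        let streams = Streams { gpu_indexes: vec![0, 1] };
        dispatch(&streams);
    }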