From 0a2ad8ca729a7762a83aad18f9f5ec21be022dee Mon Sep 17 00:00:00 2001 From: Agnes Leroy Date: Wed, 24 Jul 2024 17:52:06 +0200 Subject: [PATCH] chore(gpu): remove remaining par_iter over gpu_indexes Rename some variables to try and make the code clearer --- tfhe/src/core_crypto/gpu/mod.rs | 5 +- tfhe/src/core_crypto/gpu/slice.rs | 78 +++++------ tfhe/src/core_crypto/gpu/vec.rs | 121 +++++++++--------- .../gpu/server_key/radix/bitwise_op.rs | 13 +- tfhe/src/integer/gpu/server_key/radix/mod.rs | 44 ++----- .../gpu/server_key/radix/scalar_bitwise_op.rs | 3 +- .../gpu/server_key/radix/scalar_comparison.rs | 6 +- .../gpu/server_key/radix/scalar_mul.rs | 6 +- 8 files changed, 127 insertions(+), 149 deletions(-) diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs index db040514a7..709da6fca4 100644 --- a/tfhe/src/core_crypto/gpu/mod.rs +++ b/tfhe/src/core_crypto/gpu/mod.rs @@ -11,7 +11,6 @@ use crate::core_crypto::prelude::{ }; pub use algorithms::*; pub use entities::*; -use rayon::prelude::*; use std::ffi::c_void; pub(crate) use tfhe_cuda_backend::cuda_bind::*; @@ -284,7 +283,7 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( polynomial_size: PolynomialSize, ) { let size = std::mem::size_of_val(src); - streams.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in streams.gpu_indexes.iter() { assert_eq!(dest.len() * std::mem::size_of::(), size); cuda_convert_lwe_programmable_bootstrap_key_64( streams.ptr[gpu_index as usize], @@ -296,7 +295,7 @@ pub unsafe fn convert_lwe_programmable_bootstrap_key_async( l_gadget.0 as u32, polynomial_size.0 as u32, ); - }); + } } /// Convert multi-bit programmable bootstrap key diff --git a/tfhe/src/core_crypto/gpu/slice.rs b/tfhe/src/core_crypto/gpu/slice.rs index 523d71bd54..d5788b07a7 100644 --- a/tfhe/src/core_crypto/gpu/slice.rs +++ b/tfhe/src/core_crypto/gpu/slice.rs @@ -93,20 +93,24 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required. - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_gpu_async( + &mut self, + src: &Self, + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { - assert_eq!(self.len(gpu_index), src.len(gpu_index)); - let size = src.len(gpu_index) * std::mem::size_of::(); + assert_eq!(self.len(stream_index), src.len(stream_index)); + let size = src.len(stream_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), - src.as_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -118,55 +122,55 @@ where /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required. 
- pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, stream_index: u32) where T: Numeric, { - assert_eq!(self.len(gpu_index), dest.len()); - let size = self.len(gpu_index) * std::mem::size_of::(); + assert_eq!(self.len(stream_index), dest.len()); + let size = self.len(stream_index) * std::mem::size_of::(); // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast::(), - self.as_c_ptr(gpu_index), + self.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } /// Returns the number of elements in the vector, also referred to as its ‘length’. - pub fn len(&self, gpu_index: u32) -> usize { - self.lengths[gpu_index as usize] + pub fn len(&self, index: u32) -> usize { + self.lengths[index as usize] } /// Returns true if the ptr is empty - pub fn is_empty(&self, gpu_index: u32) -> bool { - self.lengths[gpu_index as usize] == 0 + pub fn is_empty(&self, index: u32) -> bool { + self.lengths[index as usize] == 0 } - pub(crate) fn get_mut(&mut self, range: R, gpu_index: u32) -> Option> + pub(crate) fn get_mut(&mut self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, { - let (start, end) = range_bounds_to_start_end(self.len(gpu_index), range).into_inner(); + let (start, end) = range_bounds_to_start_end(self.len(index), range).into_inner(); // Check the range is compatible with the vec - if end <= start || end > self.lengths[gpu_index as usize] - 1 { + if end <= start || end > self.lengths[index as usize] - 1 { None } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptrs[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptrs[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice Some(unsafe { - CudaSliceMut::new(shifted_ptr, new_len, self.gpu_indexes[gpu_index as usize]) + CudaSliceMut::new(shifted_ptr, new_len, self.gpu_indexes[index as usize]) }) } } @@ -174,49 +178,47 @@ where pub(crate) fn split_at_mut( &mut self, mid: usize, - gpu_index: u32, + index: u32, ) -> (Option>, Option>) where T: Numeric, { // Check the index is compatible with the vec - if mid > self.lengths[gpu_index as usize] - 1 { + if mid > self.lengths[index as usize] - 1 { (None, None) } else if mid == 0 { ( None, Some(unsafe { CudaSliceMut::new( - self.ptrs[gpu_index as usize], - self.lengths[gpu_index as usize], - gpu_index, + self.ptrs[index as usize], + self.lengths[index as usize], + index, ) }), ) - } else if mid == self.lengths[gpu_index as usize] - 1 { + } else if mid == self.lengths[index as usize] - 1 { ( Some(unsafe { CudaSliceMut::new( - self.ptrs[gpu_index as usize], - self.lengths[gpu_index as usize], - gpu_index, + self.ptrs[index as usize], + self.lengths[index as usize], + index, ) }), None, ) } else { let new_len_1 = mid; - let new_len_2 = self.lengths[gpu_index as usize] - mid; + let new_len_2 = self.lengths[index as usize] - mid; // Shift ptr let shifted_ptr: *mut c_void = - self.ptrs[gpu_index as usize].wrapping_byte_add(mid * std::mem::size_of::()); + self.ptrs[index as usize].wrapping_byte_add(mid * std::mem::size_of::()); // Create the slice ( - Some(unsafe { - CudaSliceMut::new(self.ptrs[gpu_index as usize], new_len_1, gpu_index) 
- }), - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, gpu_index) }), + Some(unsafe { CudaSliceMut::new(self.ptrs[index as usize], new_len_1, index) }), + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len_2, index) }), ) } } diff --git a/tfhe/src/core_crypto/gpu/vec.rs b/tfhe/src/core_crypto/gpu/vec.rs index e86195a9c3..b18b791b3d 100644 --- a/tfhe/src/core_crypto/gpu/vec.rs +++ b/tfhe/src/core_crypto/gpu/vec.rs @@ -1,7 +1,6 @@ use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut}; use crate::core_crypto::gpu::{synchronize_device, CudaStreams}; use crate::core_crypto::prelude::Numeric; -use rayon::prelude::*; use std::collections::Bound::{Excluded, Included, Unbounded}; use std::ffi::c_void; use std::marker::PhantomData; @@ -33,33 +32,33 @@ pub struct CudaVec { impl CudaVec { /// This creates a `CudaVec` that holds memory of `len` elements /// on the GPU with index `gpu_index` - pub fn new(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { - let vec = unsafe { Self::new_async(len, streams, gpu_index) }; + pub fn new(len: usize, streams: &CudaStreams, stream_index: u32) -> Self { + let vec = unsafe { Self::new_async(len, streams, stream_index) }; streams.synchronize(); vec } /// # Safety /// /// - `streams` __must__ be synchronized to guarantee computation has finished - pub unsafe fn new_async(len: usize, streams: &CudaStreams, gpu_index: u32) -> Self { + pub unsafe fn new_async(len: usize, streams: &CudaStreams, stream_index: u32) -> Self { let size = len as u64 * std::mem::size_of::() as u64; let ptr = cuda_malloc_async( size, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); cuda_memset_async( ptr, 0u64, size, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); Self { ptr: vec![ptr; 1], len, - gpu_indexes: vec![streams.gpu_indexes[gpu_index as usize]; 1], + gpu_indexes: vec![streams.gpu_indexes[stream_index as usize]; 1], _phantom: PhantomData, } } @@ -102,11 +101,11 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams, gpu_index: u32) -> Self { - let mut res = Self::new(src.len(), streams, gpu_index); + pub unsafe fn from_cpu_async(src: &[T], streams: &CudaStreams, stream_index: u32) -> Self { + let mut res = Self::new(src.len(), streams, stream_index); // We have to check that h_data is not empty, because cuda_memset with size 0 is invalid if !src.is_empty() { - res.copy_from_cpu_async(src, streams, gpu_index); + res.copy_from_cpu_async(src, streams, stream_index); } res } @@ -128,7 +127,7 @@ impl CudaVec { /// /// - `streams` __must__ be synchronized to guarantee computation has finished, and inputs must /// not be dropped until streams is synchronised - pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams, gpu_index: u32) + pub unsafe fn memset_async(&mut self, value: T, streams: &CudaStreams, stream_index: u32) where T: Into, { @@ -136,11 +135,11 @@ impl CudaVec { // We check that self is not empty to avoid invalid pointers if size > 0 { cuda_memset_async( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), value.into(), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + 
streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -174,8 +173,12 @@ impl CudaVec { /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required - pub unsafe fn copy_from_cpu_async(&mut self, src: &[T], streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_cpu_async( + &mut self, + src: &[T], + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { assert!(self.len() >= src.len()); @@ -185,11 +188,11 @@ impl CudaVec { // invalid pointer being passed to copy_to_gpu_async if size > 0 { cuda_memcpy_async_to_gpu( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), src.as_ptr().cast(), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -204,7 +207,7 @@ impl CudaVec { where T: Numeric, { - self.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in streams.gpu_indexes.iter() { assert!(self.len() >= src.len()); let size = std::mem::size_of_val(src); @@ -219,7 +222,7 @@ impl CudaVec { streams.gpu_indexes[gpu_index as usize], ); } - }); + } } /// Copies data between two `CudaVec` @@ -228,8 +231,12 @@ impl CudaVec { /// /// - [CudaStreams::synchronize] __must__ be called after the copy as soon as synchronization is /// required - pub unsafe fn copy_from_gpu_async(&mut self, src: &Self, streams: &CudaStreams, gpu_index: u32) - where + pub unsafe fn copy_from_gpu_async( + &mut self, + src: &Self, + streams: &CudaStreams, + stream_index: u32, + ) where T: Numeric, { assert!(self.len() >= src.len()); @@ -237,11 +244,11 @@ impl CudaVec { // We check that src is not empty to avoid invalid pointers if size > 0 { cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), - src.as_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } @@ -257,7 +264,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, - gpu_index: u32, + stream_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -271,15 +278,15 @@ impl CudaVec { assert!(end - start < self.len()); let src_ptr = src - .as_c_ptr(gpu_index) + .as_c_ptr(stream_index) .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( - self.as_mut_c_ptr(gpu_index), + self.as_mut_c_ptr(stream_index), src_ptr, size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } @@ -294,7 +301,7 @@ impl CudaVec { range: R, src: &Self, streams: &CudaStreams, - gpu_index: u32, + stream_index: u32, ) where R: std::ops::RangeBounds, T: Numeric, @@ -308,15 +315,15 @@ impl CudaVec { assert!(end - start < src.len()); let dest_ptr = self - .as_mut_c_ptr(gpu_index) + .as_mut_c_ptr(stream_index) .add(start * std::mem::size_of::()); let size = (end - start + 1) * std::mem::size_of::(); cuda_memcpy_async_gpu_to_gpu( dest_ptr, - src.as_c_ptr(gpu_index), + src.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } @@ -325,7 +332,7 @@ impl CudaVec { /// # Safety 
/// /// - [CudaStreams::synchronize] __must__ be called as soon as synchronization is required - pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, gpu_index: u32) + pub unsafe fn copy_to_cpu_async(&self, dest: &mut [T], streams: &CudaStreams, stream_index: u32) where T: Numeric, { @@ -337,28 +344,28 @@ impl CudaVec { if size > 0 { cuda_memcpy_async_to_cpu( dest.as_mut_ptr().cast(), - self.as_c_ptr(gpu_index), + self.as_c_ptr(stream_index), size as u64, - streams.ptr[gpu_index as usize], - streams.gpu_indexes[gpu_index as usize], + streams.ptr[stream_index as usize], + streams.gpu_indexes[stream_index as usize], ); } } #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn as_mut_c_ptr(&mut self, gpu_index: u32) -> *mut c_void { - self.ptr[gpu_index as usize] + pub(crate) fn as_mut_c_ptr(&mut self, index: u32) -> *mut c_void { + self.ptr[index as usize] } - pub(crate) fn get_mut_c_ptr(&self, gpu_index: u32) -> *mut c_void { - self.ptr[gpu_index as usize] + pub(crate) fn get_mut_c_ptr(&self, index: u32) -> *mut c_void { + self.ptr[index as usize] } - pub(crate) fn as_c_ptr(&self, gpu_index: u32) -> *const c_void { - self.ptr[gpu_index as usize].cast_const() + pub(crate) fn as_c_ptr(&self, index: u32) -> *const c_void { + self.ptr[index as usize].cast_const() } - pub(crate) fn as_slice(&self, range: R, gpu_index: u32) -> Option> + pub(crate) fn as_slice(&self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -371,19 +378,19 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptr[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptr[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSlice::new(shifted_ptr, new_len, gpu_index) }) + Some(unsafe { CudaSlice::new(shifted_ptr, new_len, index) }) } } // clippy complains as we only manipulate pointers, but we want to keep rust semantics #[allow(clippy::needless_pass_by_ref_mut)] - pub(crate) fn as_mut_slice(&mut self, range: R, gpu_index: u32) -> Option> + pub(crate) fn as_mut_slice(&mut self, range: R, index: u32) -> Option> where R: std::ops::RangeBounds, T: Numeric, @@ -396,13 +403,13 @@ impl CudaVec { } else { // Shift ptr let shifted_ptr: *mut c_void = - self.ptr[gpu_index as usize].wrapping_byte_add(start * std::mem::size_of::()); + self.ptr[index as usize].wrapping_byte_add(start * std::mem::size_of::()); // Compute the length let new_len = end - start + 1; // Create the slice - Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, gpu_index) }) + Some(unsafe { CudaSliceMut::new(shifted_ptr, new_len, index) }) } } @@ -439,11 +446,11 @@ unsafe impl Sync for CudaVec where T: Sync + Numeric {} impl Drop for CudaVec { /// Free memory for pointer `ptr` synchronously fn drop(&mut self) { - self.gpu_indexes.par_iter().for_each(|&gpu_index| { + for &gpu_index in self.gpu_indexes.iter() { // Synchronizes the device to be sure no stream is still using this pointer synchronize_device(gpu_index); unsafe { cuda_drop(self.get_mut_c_ptr(gpu_index), gpu_index) }; - }); + } } } diff --git a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs index 198b94df49..5e0d010361 100644 --- a/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/bitwise_op.rs @@ -81,16 +81,9 @@ impl CudaServerKey { let shift_plaintext = u64::from(scalar) * delta; let 
scalar_vector = vec![shift_plaintext; ct_blocks]; - let mut d_decomposed_scalar = CudaVec::::new_async( - ct.as_ref().d_blocks.lwe_ciphertext_count().0, - streams, - streams.gpu_indexes[0], - ); - d_decomposed_scalar.copy_from_cpu_async( - scalar_vector.as_slice(), - streams, - streams.gpu_indexes[0], - ); + let mut d_decomposed_scalar = + CudaVec::::new_async(ct.as_ref().d_blocks.lwe_ciphertext_count().0, streams, 0); + d_decomposed_scalar.copy_from_cpu_async(scalar_vector.as_slice(), streams, 0); cuda_lwe_ciphertext_plaintext_add_assign( &mut ct.as_mut().d_blocks, diff --git a/tfhe/src/integer/gpu/server_key/radix/mod.rs b/tfhe/src/integer/gpu/server_key/radix/mod.rs index 0fabd96559..6a64ef19e6 100644 --- a/tfhe/src/integer/gpu/server_key/radix/mod.rs +++ b/tfhe/src/integer/gpu/server_key/radix/mod.rs @@ -409,11 +409,10 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut extended_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut extended_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); + extended_ct_vec.memset_async(0u64, streams, 0); extended_ct_vec.copy_self_range_gpu_to_gpu_async( shift.., &ct.as_ref().d_blocks.0.d_vec, @@ -480,16 +479,11 @@ impl CudaServerKey { let ciphertext_modulus = ct.as_ref().d_blocks.ciphertext_modulus(); let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); - let mut extended_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut extended_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { - extended_ct_vec.memset_async(0u64, streams, streams.gpu_indexes[0]); - extended_ct_vec.copy_from_gpu_async( - &ct.as_ref().d_blocks.0.d_vec, - streams, - streams.gpu_indexes[0], - ); + extended_ct_vec.memset_async(0u64, streams, 0); + extended_ct_vec.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, streams, 0); } streams.synchronize(); let extended_ct_list = CudaLweCiphertextList::from_cuda_vec( @@ -551,9 +545,8 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( shift.., @@ -619,9 +612,8 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.lwe_dimension().to_lwe_size(); let shift = new_num_blocks * lwe_size.0; - let mut trimmed_ct_vec = unsafe { - CudaVec::new_async(new_num_blocks * lwe_size.0, streams, streams.gpu_indexes[0]) - }; + let mut trimmed_ct_vec = + unsafe { CudaVec::new_async(new_num_blocks * lwe_size.0, streams, 0) }; unsafe { trimmed_ct_vec.copy_src_range_gpu_to_gpu_async( 0..shift, @@ -695,17 +687,9 @@ impl CudaServerKey { let lwe_size = ct.as_ref().d_blocks.0.lwe_dimension.to_lwe_size().0; // Allocate the necessary amount of memory - let mut output_radix = CudaVec::new( - new_num_ct_blocks * lwe_size, - streams, - streams.gpu_indexes[0], - ); + let mut output_radix = CudaVec::new(new_num_ct_blocks * lwe_size, streams, 0); unsafe { - output_radix.copy_from_gpu_async( - &ct.as_ref().d_blocks.0.d_vec, - streams, - 
streams.gpu_indexes[0], - ); + output_radix.copy_from_gpu_async(&ct.as_ref().d_blocks.0.d_vec, streams, 0); // Get the last ct block let last_block = ct .as_ref() @@ -779,7 +763,7 @@ impl CudaServerKey { let mut output_block = new_blocks .get_mut(lwe_size * i..lwe_size * (i + 1), streams.gpu_indexes[0]) .unwrap(); - output_block.copy_from_gpu_async(&padding_block, streams, streams.gpu_indexes[0]); + output_block.copy_from_gpu_async(&padding_block, streams, 0); } } streams.synchronize(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs index 9afa8ba40c..337abdd928 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_bitwise_op.rs @@ -31,8 +31,7 @@ impl CudaServerKey { .map(|x| x as u64) .collect::>(); - let clear_blocks = - CudaVec::from_cpu_async(&h_clear_blocks, streams, streams.gpu_indexes[0]); + let clear_blocks = CudaVec::from_cpu_async(&h_clear_blocks, streams, 0); match &self.bootstrapping_key { CudaBootstrappingKey::Classic(d_bsk) => { diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs index 5b6639082f..345143d05a 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_comparison.rs @@ -153,8 +153,7 @@ impl CudaServerKey { // as we will handle them separately. scalar_blocks.truncate(ct.as_ref().d_blocks.lwe_ciphertext_count().0); - let d_scalar_blocks: CudaVec = - CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); + let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, streams, 0); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); @@ -328,8 +327,7 @@ impl CudaServerKey { .iter_as::() .collect::>(); - let d_scalar_blocks: CudaVec = - CudaVec::from_cpu_async(&scalar_blocks, streams, streams.gpu_indexes[0]); + let d_scalar_blocks: CudaVec = CudaVec::from_cpu_async(&scalar_blocks, streams, 0); let lwe_ciphertext_count = ct.as_ref().d_blocks.lwe_ciphertext_count(); diff --git a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs index 8d23090fd9..8b80b35b91 100644 --- a/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs +++ b/tfhe/src/integer/gpu/server_key/radix/scalar_mul.rs @@ -74,11 +74,7 @@ impl CudaServerKey { T: CudaIntegerRadixCiphertext, { if scalar == Scalar::ZERO { - ct.as_mut() - .d_blocks - .0 - .d_vec - .memset_async(0, streams, streams.gpu_indexes[0]); + ct.as_mut().d_blocks.0.d_vec.memset_async(0, streams, 0); return; }
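
Note on the pattern applied throughout this patch: the remaining `par_iter` calls over `gpu_indexes` are replaced with plain sequential loops, since each iteration only enqueues asynchronous work on its own CUDA stream and gains nothing from host-side parallelism via rayon. The rename from `gpu_index` to `stream_index` (or `index`) in the `CudaVec`/`CudaSlice` APIs likewise reflects that the `u32` argument selects a position within `streams` (hence the literal `0` at call sites), which is then used to look up both `streams.ptr[...]` and `streams.gpu_indexes[...]`, rather than being a physical GPU id itself. A minimal sketch of the before/after loop shape, where `Streams` and `enqueue_on_gpu` are hypothetical stand-ins for the real `CudaStreams` struct and CUDA FFI calls:

    // Illustrative sketch only -- not part of the patch.
    struct Streams {
        gpu_indexes: Vec<u32>,
    }

    // Placeholder for enqueueing asynchronous work on the stream bound to `gpu_index`.
    fn enqueue_on_gpu(gpu_index: u32) {
        println!("enqueue work on GPU {gpu_index}");
    }

    fn dispatch(streams: &Streams) {
        // Before: streams.gpu_indexes.par_iter().for_each(|&gpu_index| enqueue_on_gpu(gpu_index));
        // After: a plain sequential loop; each call only submits async work to its
        // own stream, so the host-side loop is not the bottleneck.
        for &gpu_index in streams.gpu_indexes.iter() {
            enqueue_on_gpu(gpu_index);
        }
    }

    fn main() {
        let streams = Streams { gpu_indexes: vec![0, 1] };
        dispatch(&streams);
    }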