diff --git a/backends/tfhe-cuda-backend/cuda/include/integer.h b/backends/tfhe-cuda-backend/cuda/include/integer.h index 33d862cd29..b6e09d7184 100644 --- a/backends/tfhe-cuda-backend/cuda/include/integer.h +++ b/backends/tfhe-cuda-backend/cuda/include/integer.h @@ -102,7 +102,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor, uint32_t num_blocks, PBS_TYPE pbs_type, - uint32_t max_shared_memory, bool allocate_gpu_memory); + bool allocate_gpu_memory); void cuda_integer_mult_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, @@ -526,8 +526,7 @@ template struct int_radix_lut { execute_scratch_pbs( streams[i], gpu_indexes[i], &gpu_pbs_buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_level, - params.grouping_factor, num_blocks_on_gpu, - cuda_get_max_shared_memory(gpu_indexes[i]), params.pbs_type, + params.grouping_factor, num_blocks_on_gpu, params.pbs_type, allocate_gpu_memory); cuda_synchronize_stream(streams[i], gpu_indexes[i]); buffer.push_back(gpu_pbs_buffer); diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h index bdfdabdd5d..8865af9f63 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap.h @@ -26,14 +26,12 @@ void cuda_convert_lwe_programmable_bootstrap_key_64( void scratch_cuda_programmable_bootstrap_amortized_32( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void scratch_cuda_programmable_bootstrap_amortized_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -41,7 +39,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -49,7 +47,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cleanup_cuda_programmable_bootstrap_amortized(void *stream, uint32_t gpu_index, @@ -58,14 +56,12 @@ void cleanup_cuda_programmable_bootstrap_amortized(void *stream, void scratch_cuda_programmable_bootstrap_32( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t 
polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void scratch_cuda_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -73,7 +69,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -81,7 +77,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory); + uint32_t num_samples); void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index, int8_t **pbs_buffer); @@ -349,7 +345,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -358,7 +354,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); #if (CUDA_ARCH >= 900) template @@ -368,29 +364,26 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); #endif template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template 
void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc(uint32_t num_samples, diff --git a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h index 6d5d30781f..181457d2e1 100644 --- a/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h +++ b/backends/tfhe-cuda-backend/cuda/include/programmable_bootstrap_multibit.h @@ -19,8 +19,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t chunk_size = 0); void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *stream, uint32_t gpu_index, void *lwe_array_out, @@ -28,8 +28,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, - uint32_t lwe_chunk_size = 0); + uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size = 0); void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, uint32_t gpu_index, @@ -52,8 +51,8 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -63,7 +62,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif template @@ -71,15 +70,15 @@ void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t 
input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -89,15 +88,15 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template void scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0); template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -107,7 +106,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0); + uint32_t lwe_chunk_size = 0); template __host__ __device__ uint64_t @@ -314,7 +313,6 @@ template struct pbs_buffer { template __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size, - uint32_t max_shared_memory); + uint32_t polynomial_size); #endif // CUDA_MULTI_BIT_H diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh index 4469e71f3b..13642d9ee0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh @@ -177,8 +177,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + grouping_factor, num_radix_blocks, pbs_type); } else { /// Make sure all data that should be on GPU 0 is indeed there cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -204,8 +203,7 @@ __host__ void integer_radix_apply_univariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + pbs_level, grouping_factor, num_radix_blocks, pbs_type); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, @@ -270,8 +268,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec[0], lwe_trivial_indexes_vec[0], bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, pbs_level, - grouping_factor, num_radix_blocks, - 
cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + grouping_factor, num_radix_blocks, pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); multi_gpu_scatter_lwe_async( @@ -293,8 +290,7 @@ __host__ void integer_radix_apply_bivariate_lookup_table_kb( lwe_trivial_indexes_vec, lut->lut_vec, lut->lut_indexes_vec, lwe_after_ks_vec, lwe_trivial_indexes_vec, bsks, lut->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, pbs_base_log, - pbs_level, grouping_factor, num_radix_blocks, - cuda_get_max_shared_memory(gpu_indexes[0]), pbs_type); + pbs_level, grouping_factor, num_radix_blocks, pbs_type); /// Copy data back to GPU 0 and release vecs multi_gpu_gather_lwe_async(streams, gpu_indexes, active_gpu_count, @@ -696,8 +692,7 @@ void host_full_propagate_inplace(cudaStream_t *streams, uint32_t *gpu_indexes, mem_ptr->lut->lwe_trivial_indexes, bsks, mem_ptr->lut->buffer, params.glwe_dimension, params.small_lwe_dimension, params.polynomial_size, params.pbs_base_log, params.pbs_level, - params.grouping_factor, 2, cuda_get_max_shared_memory(gpu_indexes[0]), - params.pbs_type); + params.grouping_factor, 2, params.pbs_type); cuda_memcpy_async_gpu_to_gpu(cur_input_block, mem_ptr->tmp_big_lwe_vector, big_lwe_size * sizeof(Torus), streams[0], diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu index d0eeae2817..49bb1e4dca 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu @@ -71,7 +71,7 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t pbs_base_log, uint32_t pbs_level, uint32_t ks_base_log, uint32_t ks_level, uint32_t grouping_factor, uint32_t num_radix_blocks, PBS_TYPE pbs_type, - uint32_t max_shared_memory, bool allocate_gpu_memory) { + bool allocate_gpu_memory) { int_radix_params params(pbs_type, glwe_dimension, polynomial_size, polynomial_size * glwe_dimension, lwe_dimension, @@ -123,7 +123,6 @@ void scratch_cuda_integer_mult_radix_ciphertext_kb_64( * - 'num_blocks' is the number of big lwe ciphertext blocks inside radix * ciphertext * - 'pbs_type' selects which PBS implementation should be used - * - 'max_shared_memory' maximum shared memory per cuda block */ void cuda_integer_mult_radix_ciphertext_kb_64( void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, diff --git a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh index 00b5e23a8d..97959dd333 100644 --- a/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cuh @@ -234,7 +234,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( int32_t h_smart_copy_in[r * num_blocks]; int32_t h_smart_copy_out[r * num_blocks]; - auto max_shared_memory = cuda_get_max_shared_memory(gpu_indexes[0]); + auto max_shared_memory = cuda_get_max_shared_memory(0); // create lut object for message and carry // we allocate luts_message_carry in the host function (instead of scratch) @@ -357,7 +357,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( small_lwe_vector, lwe_indexes_in, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, 
mem_ptr->params.pbs_type); } else { cuda_synchronize_stream(streams[0], gpu_indexes[0]); @@ -405,7 +405,7 @@ __host__ void host_integer_sum_ciphertexts_vec_kb( lwe_trivial_indexes_vec, bsks, luts_message_carry->buffer, glwe_dimension, small_lwe_dimension, polynomial_size, mem_ptr->params.pbs_base_log, mem_ptr->params.pbs_level, - mem_ptr->params.grouping_factor, total_count, max_shared_memory, + mem_ptr->params.grouping_factor, total_count, mem_ptr->params.pbs_type); multi_gpu_gather_lwe_async( diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh index 8ee59ff3a9..966be0464d 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/bootstrapping_key.cuh @@ -115,7 +115,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, double2 *buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -133,7 +133,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -151,7 +151,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -169,7 +169,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -187,7 +187,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -205,7 +205,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, shared_memory_size)); @@ -223,7 +223,7 @@ void cuda_convert_lwe_programmable_bootstrap_key(cudaStream_t stream, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { check_cuda_error(cudaFuncSetAttribute( batch_NSMFFT, ForwardFFT>, FULLSM>, cudaFuncAttributeMaxDynamicSharedMemorySize, 
shared_memory_size)); @@ -268,7 +268,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, double2 *buffer; switch (polynomial_size) { case 256: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -289,7 +289,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 512: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -310,7 +310,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 1024: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -331,7 +331,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 2048: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -352,7 +352,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 4096: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -373,7 +373,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 8192: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, @@ -394,7 +394,7 @@ void cuda_fourier_polynomial_mul(cudaStream_t stream, uint32_t gpu_index, } break; case 16384: - if (shared_memory_size <= cuda_get_max_shared_memory(gpu_index)) { + if (shared_memory_size <= cuda_get_max_shared_memory(0)) { buffer = (double2 *)cuda_malloc_async(0, stream, gpu_index); check_cuda_error(cudaFuncSetAttribute( batch_polynomial_mul, ForwardFFT>, diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh index dacddb7bcc..f8f12d4403 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap.cuh @@ -127,8 +127,7 @@ void execute_pbs_async( std::vector pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - PBS_TYPE pbs_type) { + uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type) { switch (sizeof(Torus)) { case sizeof(uint32_t): // 32 bits @@ -160,8 +159,7 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], 
d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, - max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu); } break; default: @@ -200,7 +198,7 @@ void execute_pbs_async( current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_inputs_on_gpu, max_shared_memory); + num_inputs_on_gpu); } break; case CLASSICAL: @@ -228,8 +226,7 @@ void execute_pbs_async( current_lwe_output_indexes, lut_vec[i], d_lut_vector_indexes, current_lwe_array_in, current_lwe_input_indexes, bootstrapping_keys[i], pbs_buffer[i], lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, num_inputs_on_gpu, - max_shared_memory); + polynomial_size, base_log, level_count, num_inputs_on_gpu); } break; default: @@ -247,8 +244,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, PBS_TYPE pbs_type, + uint32_t input_lwe_ciphertext_count, PBS_TYPE pbs_type, bool allocate_gpu_memory) { switch (sizeof(Torus)) { case sizeof(uint32_t): @@ -259,8 +255,7 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, case CLASSICAL: scratch_cuda_programmable_bootstrap_32( stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size, - level_count, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory); + level_count, input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Error: unsupported cuda PBS type.") @@ -275,13 +270,12 @@ void execute_scratch_pbs(cudaStream_t stream, uint32_t gpu_index, scratch_cuda_multi_bit_programmable_bootstrap_64( stream, gpu_index, pbs_buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, grouping_factor, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case CLASSICAL: scratch_cuda_programmable_bootstrap_64( stream, gpu_index, pbs_buffer, glwe_dimension, polynomial_size, - level_count, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory); + level_count, input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Error: unsupported cuda PBS type.") diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu index 084756a374..8101e8abb5 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cu @@ -20,51 +20,50 @@ uint64_t get_buffer_size_programmable_bootstrap_amortized_64( void scratch_cuda_programmable_bootstrap_amortized_32( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, 
allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -81,51 +80,50 @@ void scratch_cuda_programmable_bootstrap_amortized_32( void scratch_cuda_programmable_bootstrap_amortized_64( void *stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_amortized>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, input_lwe_ciphertext_count, - max_shared_memory, 
allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. Supported " @@ -143,7 +141,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 32) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -157,7 +155,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 512: host_programmable_bootstrap_amortized>( @@ -166,7 +164,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 1024: host_programmable_bootstrap_amortized>( @@ -175,7 +173,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 2048: host_programmable_bootstrap_amortized>( @@ -184,7 +182,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 4096: host_programmable_bootstrap_amortized>( @@ -193,7 +191,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 8192: host_programmable_bootstrap_amortized>( @@ -202,7 +200,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 16384: host_programmable_bootstrap_amortized>( @@ -211,7 +209,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_32( (uint32_t *)lut_vector_indexes, (uint32_t *)lwe_array_in, (uint32_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. 
Supported " @@ -283,7 +281,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 64) PANIC("Cuda error (amortized PBS): base log should be > number of bits in " @@ -297,7 +295,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 512: host_programmable_bootstrap_amortized>( @@ -306,7 +304,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 1024: host_programmable_bootstrap_amortized>( @@ -315,7 +313,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 2048: host_programmable_bootstrap_amortized>( @@ -324,7 +322,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 4096: host_programmable_bootstrap_amortized>( @@ -333,7 +331,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 8192: host_programmable_bootstrap_amortized>( @@ -342,7 +340,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; case 16384: host_programmable_bootstrap_amortized>( @@ -351,7 +349,7 @@ void cuda_programmable_bootstrap_amortized_lwe_ciphertext_vector_64( (uint64_t *)lut_vector_indexes, (uint64_t *)lwe_array_in, (uint64_t *)lwe_input_indexes, (double2 *)bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, base_log, level_count, - num_samples, max_shared_memory); + num_samples); break; default: PANIC("Cuda error (amortized PBS): unsupported polynomial size. 
Supported " diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh index ee9fb4e1b9..34f8ebdb58 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_amortized.cuh @@ -257,10 +257,10 @@ template __host__ void scratch_programmable_bootstrap_amortized( cudaStream_t stream, uint32_t gpu_index, int8_t **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); @@ -299,9 +299,10 @@ __host__ void host_programmable_bootstrap_amortized( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, int8_t *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory) { + uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t SM_FULL = get_buffer_size_full_sm_programmable_bootstrap_amortized( polynomial_size, glwe_dimension); diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh index 537eb5c66d..e9d55a94b8 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_classic.cuh @@ -161,10 +161,10 @@ __host__ void scratch_programmable_bootstrap_cg( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm = get_buffer_size_full_sm_programmable_bootstrap_cg(polynomial_size); uint64_t partial_sm = @@ -203,9 +203,9 @@ __host__ void host_programmable_bootstrap_cg( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); // With SM each block corresponds to either the mask or body, no need to // duplicate data for each diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh index 2da2e4581c..1edf63da73 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_cg_multibit.cuh @@ -177,8 +177,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( cudaStream_t stream, uint32_t 
gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size = 0) { uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( @@ -190,6 +190,7 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( get_buffer_size_partial_sm_cg_multibit_programmable_bootstrap( polynomial_size); + int max_shared_memory = cuda_get_max_shared_memory(0); if (max_shared_memory < full_sm_keybundle) { check_cuda_error(cudaFuncSetAttribute( device_multi_bit_programmable_bootstrap_keybundle, @@ -243,9 +244,8 @@ __host__ void scratch_cg_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::CG, @@ -260,9 +260,10 @@ __host__ void execute_cg_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_dm = get_buffer_size_full_sm_cg_multibit_programmable_bootstrap( polynomial_size); @@ -336,12 +337,12 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -350,16 +351,14 @@ __host__ void host_cg_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate execute_cg_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu index 09a689405c..8bf0534247 100644 --- 
a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cu @@ -71,51 +71,50 @@ template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -131,7 +130,7 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -139,56 +138,49 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap_tbc>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -221,51 +213,50 @@ template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap_cg>( static_cast(stream), gpu_index, pbs_buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -278,51 +269,50 @@ template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { switch (polynomial_size) { case 256: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 512: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 1024: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 2048: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 4096: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 8192: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; case 16384: scratch_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory); + allocate_gpu_memory); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -340,9 +330,9 @@ void scratch_cuda_programmable_bootstrap( void scratch_cuda_programmable_bootstrap_32( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(0); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, @@ -350,7 +340,7 @@ void scratch_cuda_programmable_bootstrap_32( scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (has_support_to_cuda_programmable_bootstrap_cg( @@ -359,12 +349,12 @@ void scratch_cuda_programmable_bootstrap_32( scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); } /* @@ -375,9 +365,9 @@ void scratch_cuda_programmable_bootstrap_32( void scratch_cuda_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { + int max_shared_memory = cuda_get_max_shared_memory(0); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, @@ -385,7 +375,7 @@ void scratch_cuda_programmable_bootstrap_64( scratch_cuda_programmable_bootstrap_tbc( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else #endif if (has_support_to_cuda_programmable_bootstrap_cg( @@ -394,12 +384,12 @@ void scratch_cuda_programmable_bootstrap_64( scratch_cuda_programmable_bootstrap_cg( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); else scratch_cuda_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory); + input_lwe_ciphertext_count, allocate_gpu_memory); } template @@ -409,7 +399,7 @@ void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -417,56 +407,49 @@ void 
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap_cg>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -482,7 +465,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t level_count, uint32_t num_samples) { switch (polynomial_size) { case 256: @@ -490,56 +473,49 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 512: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 1024: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 2048: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 4096: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 8192: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case 16384: host_programmable_bootstrap>( static_cast(stream), gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, glwe_dimension, - lwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + lwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (classical PBS): unsupported polynomial size. 
" @@ -556,7 +532,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 32) PANIC("Cuda error (classical PBS): base log should be > number of bits " @@ -576,8 +552,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -591,8 +566,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -603,8 +577,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_32( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -680,7 +653,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, uint32_t level_count, - uint32_t num_samples, uint32_t max_shared_memory) { + uint32_t num_samples) { if (base_log > 64) PANIC("Cuda error (classical PBS): base log should be > number of bits " "in the ciphertext representation (64)"); @@ -699,8 +672,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; #else PANIC("Cuda error (PBS): TBC pbs is not supported.") @@ -714,8 +686,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; case PBS_VARIANT::DEFAULT: cuda_programmable_bootstrap_lwe_ciphertext_vector( @@ -726,8 +697,7 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_array_in), static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, num_samples, - max_shared_memory); + glwe_dimension, polynomial_size, base_log, level_count, num_samples); break; default: PANIC("Cuda error (PBS): unknown pbs variant.") @@ -755,7 +725,7 @@ template void 
cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, @@ -764,20 +734,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -786,7 +754,7 @@ template void cuda_programmable_bootstrap_cg_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint32_t *lwe_array_out, @@ -795,20 +763,18 @@ template void cuda_programmable_bootstrap_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_cg( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template bool has_support_to_cuda_programmable_bootstrap_tbc( uint32_t num_samples, uint32_t glwe_dimension, uint32_t polynomial_size, @@ -825,7 +791,7 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint32_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer 
*buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( void *stream, uint32_t gpu_index, uint64_t *lwe_array_out, uint64_t *lwe_output_indexes, uint64_t *lut_vector, @@ -833,17 +799,15 @@ template void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector( uint64_t *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory); + uint32_t level_count, uint32_t num_samples); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); template void scratch_cuda_programmable_bootstrap_tbc( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh index da33298a2f..c90059efab 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_classic.cuh @@ -273,10 +273,10 @@ __host__ void scratch_programmable_bootstrap( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm_step_one = get_buffer_size_full_sm_programmable_bootstrap_step_one( polynomial_size); @@ -330,17 +330,19 @@ __host__ void scratch_programmable_bootstrap( } template -__host__ void execute_step_one( - cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { +__host__ void +execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, + Torus *lut_vector_indexes, Torus *lwe_array_in, + Torus *lwe_input_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, + 
uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); int thds = polynomial_size / params::opt; dim3 grid(level_count, glwe_dimension + 1, input_lwe_ciphertext_count); @@ -370,17 +372,19 @@ __host__ void execute_step_one( } template -__host__ void execute_step_two( - cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, Torus *lut_vector, Torus *lut_vector_indexes, - double2 *bootstrapping_key, Torus *global_accumulator, - double2 *global_accumulator_fft, uint32_t input_lwe_ciphertext_count, - uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, int8_t *d_mem, - uint32_t max_shared_memory, int lwe_iteration, uint64_t partial_sm, - uint64_t partial_dm, uint64_t full_sm, uint64_t full_dm) { +__host__ void +execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, + Torus *lwe_output_indexes, Torus *lut_vector, + Torus *lut_vector_indexes, double2 *bootstrapping_key, + Torus *global_accumulator, double2 *global_accumulator_fft, + uint32_t input_lwe_ciphertext_count, uint32_t lwe_dimension, + uint32_t glwe_dimension, uint32_t polynomial_size, + uint32_t base_log, uint32_t level_count, int8_t *d_mem, + int lwe_iteration, uint64_t partial_sm, uint64_t partial_dm, + uint64_t full_sm, uint64_t full_dm) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); int thds = polynomial_size / params::opt; dim3 grid(input_lwe_ciphertext_count, glwe_dimension + 1); @@ -418,8 +422,7 @@ __host__ void host_programmable_bootstrap( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *pbs_buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); // With SM each block corresponds to either the mask or body, no need to @@ -448,16 +451,14 @@ __host__ void host_programmable_bootstrap( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, d_mem, - max_shared_memory, i, partial_sm, partial_dm_step_one, full_sm_step_one, - full_dm_step_one); + glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, + partial_sm, partial_dm_step_one, full_sm_step_one, full_dm_step_one); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, lut_vector, lut_vector_indexes, bootstrapping_key, global_accumulator, global_accumulator_fft, input_lwe_ciphertext_count, lwe_dimension, - glwe_dimension, polynomial_size, base_log, level_count, d_mem, - max_shared_memory, i, partial_sm, partial_dm_step_two, full_sm_step_two, - full_dm_step_two); + glwe_dimension, polynomial_size, base_log, level_count, d_mem, i, + partial_sm, partial_dm_step_two, full_sm_step_two, full_dm_step_two); } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu index 4cc025c139..61ca9ee30e 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cu @@ -74,7 +74,7 @@ 
void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -87,7 +87,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_cg_multi_bit_programmable_bootstrap>( @@ -95,7 +95,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_cg_multi_bit_programmable_bootstrap>( @@ -103,7 +103,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_cg_multi_bit_programmable_bootstrap>( @@ -111,7 +111,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_cg_multi_bit_programmable_bootstrap>( @@ -119,7 +119,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_cg_multi_bit_programmable_bootstrap>( @@ -127,7 +127,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_cg_multi_bit_programmable_bootstrap>( @@ -135,7 +135,7 @@ void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -152,7 +152,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -165,7 +165,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_multi_bit_programmable_bootstrap>( @@ -173,7 +173,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_multi_bit_programmable_bootstrap>( @@ -181,7 +181,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_multi_bit_programmable_bootstrap>( @@ -189,7 +189,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_multi_bit_programmable_bootstrap>( @@ -197,7 +197,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_multi_bit_programmable_bootstrap>( @@ -205,7 +205,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_multi_bit_programmable_bootstrap>( @@ -213,7 +213,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. 
Supported " @@ -228,8 +228,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( void *lwe_array_in, void *lwe_input_indexes, void *bootstrapping_key, int8_t *mem_ptr, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, - uint32_t level_count, uint32_t num_samples, uint32_t max_shared_memory, - uint32_t lwe_chunk_size) { + uint32_t level_count, uint32_t num_samples, uint32_t lwe_chunk_size) { pbs_buffer *buffer = (pbs_buffer *)mem_ptr; @@ -246,7 +245,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; #else PANIC("Cuda error (multi-bit PBS): TBC pbs is not supported.") @@ -261,7 +260,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case PBS_VARIANT::DEFAULT: cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -273,7 +272,7 @@ void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64( static_cast(lwe_input_indexes), static_cast(bootstrapping_key), buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported implementation variant.") @@ -284,51 +283,51 @@ template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 512: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 1024: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 2048: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 4096: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, 
allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 8192: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; case 16384: scratch_cg_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, - max_shared_memory, allocate_gpu_memory, lwe_chunk_size); + allocate_gpu_memory, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -342,58 +341,58 @@ void scratch_cuda_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 512: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 1024: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 2048: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 4096: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 8192: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 16384: scratch_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, 
max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -406,9 +405,9 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( void *stream, uint32_t gpu_index, int8_t **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size) { + bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + int max_shared_memory = cuda_get_max_shared_memory(0); #if (CUDA_ARCH >= 900) if (has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, @@ -416,8 +415,8 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory, + lwe_chunk_size); else #endif if (supports_cooperative_groups_on_multibit_programmable_bootstrap< @@ -426,14 +425,13 @@ void scratch_cuda_multi_bit_programmable_bootstrap_64( scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, max_shared_memory, allocate_gpu_memory, - lwe_chunk_size); + input_lwe_ciphertext_count, allocate_gpu_memory, lwe_chunk_size); else scratch_cuda_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - grouping_factor, input_lwe_ciphertext_count, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + grouping_factor, input_lwe_ciphertext_count, allocate_gpu_memory, + lwe_chunk_size); } void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, @@ -456,9 +454,9 @@ void cleanup_cuda_multi_bit_programmable_bootstrap(void *stream, */ template __host__ uint32_t get_lwe_chunk_size(uint32_t gpu_index, uint32_t max_num_pbs, - uint32_t polynomial_size, - uint32_t max_shared_memory) { + uint32_t polynomial_size) { + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm_keybundle = get_buffer_size_full_sm_multibit_programmable_bootstrap_keybundle( polynomial_size); @@ -513,8 +511,7 @@ template void scratch_cuda_multi_bit_programmable_bootstrap( pbs_buffer **pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size); + bool allocate_gpu_memory, uint32_t lwe_chunk_size); template void cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -525,14 +522,14 @@ cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template void scratch_cuda_cg_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **pbs_buffer, 
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -543,7 +540,7 @@ cuda_cg_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); template bool has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( @@ -556,58 +553,58 @@ void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size) { switch (polynomial_size) { case 256: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 512: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 1024: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 2048: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 4096: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 8192: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; case 16384: scratch_tbc_multi_bit_programmable_bootstrap>( static_cast(stream), gpu_index, buffer, 
lwe_dimension, glwe_dimension, polynomial_size, level_count, - input_lwe_ciphertext_count, grouping_factor, max_shared_memory, - allocate_gpu_memory, lwe_chunk_size); + input_lwe_ciphertext_count, grouping_factor, allocate_gpu_memory, + lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -623,7 +620,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size) { + uint32_t lwe_chunk_size) { if (base_log > 64) PANIC("Cuda error (multi-bit PBS): base log should be > number of bits in " @@ -636,7 +633,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 512: host_tbc_multi_bit_programmable_bootstrap>( @@ -644,7 +641,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 1024: host_tbc_multi_bit_programmable_bootstrap>( @@ -652,7 +649,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 2048: host_tbc_multi_bit_programmable_bootstrap>( @@ -660,7 +657,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 4096: host_tbc_multi_bit_programmable_bootstrap>( @@ -668,7 +665,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 8192: host_tbc_multi_bit_programmable_bootstrap>( @@ -676,7 +673,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; case 16384: host_tbc_multi_bit_programmable_bootstrap>( @@ -684,7 +681,7 @@ void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( lwe_output_indexes, 
lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, bootstrapping_key, pbs_buffer, glwe_dimension, lwe_dimension, polynomial_size, grouping_factor, base_log, level_count, - num_samples, max_shared_memory, lwe_chunk_size); + num_samples, lwe_chunk_size); break; default: PANIC("Cuda error (multi-bit PBS): unsupported polynomial size. Supported " @@ -697,8 +694,8 @@ template void scratch_cuda_tbc_multi_bit_programmable_bootstrap( void *stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t grouping_factor, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory, uint32_t lwe_chunk_size); + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory, + uint32_t lwe_chunk_size); template void cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( @@ -709,5 +706,5 @@ cuda_tbc_multi_bit_programmable_bootstrap_lwe_ciphertext_vector( pbs_buffer *pbs_buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size); + uint32_t lwe_chunk_size); #endif diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh index b409f935ec..47e0b58983 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_multibit.cuh @@ -470,9 +470,8 @@ __host__ void scratch_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::DEFAULT, @@ -486,9 +485,10 @@ __host__ void execute_compute_keybundle( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, uint32_t lwe_chunk_size, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint32_t chunk_size = std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -526,16 +526,17 @@ __host__ void execute_compute_keybundle( } template -__host__ void -execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, - Torus *lut_vector_indexes, Torus *lwe_array_in, - Torus *lwe_input_indexes, pbs_buffer *buffer, - uint32_t num_samples, uint32_t lwe_dimension, - uint32_t glwe_dimension, uint32_t polynomial_size, - uint32_t base_log, uint32_t level_count, - uint32_t max_shared_memory, int j, int lwe_offset) { +__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index, + Torus *lut_vector, Torus *lut_vector_indexes, + Torus *lwe_array_in, Torus *lwe_input_indexes, + pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, + uint32_t polynomial_size, uint32_t base_log, + uint32_t level_count, int j, int lwe_offset) { cudaSetDevice(gpu_index); + int max_shared_memory = 
cuda_get_max_shared_memory(0); uint64_t full_sm_accumulate_step_one = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one( polynomial_size); @@ -581,16 +582,17 @@ execute_step_one(cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector, } template -__host__ void -execute_step_two(cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out, - Torus *lwe_output_indexes, - pbs_buffer *buffer, uint32_t num_samples, - uint32_t lwe_dimension, uint32_t glwe_dimension, - uint32_t polynomial_size, int32_t grouping_factor, - uint32_t level_count, uint32_t max_shared_memory, int j, - int lwe_offset, uint32_t lwe_chunk_size) { +__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index, + Torus *lwe_array_out, Torus *lwe_output_indexes, + pbs_buffer *buffer, + uint32_t num_samples, uint32_t lwe_dimension, + uint32_t glwe_dimension, + uint32_t polynomial_size, + int32_t grouping_factor, uint32_t level_count, + int j, int lwe_offset, uint32_t lwe_chunk_size) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); uint64_t full_sm_accumulate_step_two = get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two( polynomial_size); @@ -630,13 +632,13 @@ __host__ void host_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); // If a chunk size is not passed to this function, select one. if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -645,8 +647,7 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate uint32_t chunk_size = std::min( lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset); @@ -654,14 +655,12 @@ __host__ void host_multi_bit_programmable_bootstrap( execute_step_one( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, - polynomial_size, base_log, level_count, max_shared_memory, j, - lwe_offset); + polynomial_size, base_log, level_count, j, lwe_offset); execute_step_two( stream, gpu_index, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, level_count, max_shared_memory, j, lwe_offset, - lwe_chunk_size); + grouping_factor, level_count, j, lwe_offset, lwe_chunk_size); } } } diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh index f63f22748e..0c8c9ada04 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_classic.cuh @@ -165,9 +165,9 @@ __host__ void 
scratch_programmable_bootstrap_tbc( cudaStream_t stream, uint32_t gpu_index, pbs_buffer **buffer, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, - uint32_t input_lwe_ciphertext_count, uint32_t max_shared_memory, - bool allocate_gpu_memory) { + uint32_t input_lwe_ciphertext_count, bool allocate_gpu_memory) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); bool supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< @@ -226,9 +226,9 @@ __host__ void host_programmable_bootstrap_tbc( Torus *lwe_array_in, Torus *lwe_input_indexes, double2 *bootstrapping_key, pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log, - uint32_t level_count, uint32_t input_lwe_ciphertext_count, - uint32_t max_shared_memory) { + uint32_t level_count, uint32_t input_lwe_ciphertext_count) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); auto supports_dsm = supports_distributed_shared_memory_on_classic_programmable_bootstrap< diff --git a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh index 57378b5127..dfbefe43f0 100644 --- a/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh +++ b/backends/tfhe-cuda-backend/cuda/src/pbs/programmable_bootstrap_tbc_multibit.cuh @@ -172,10 +172,10 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( pbs_buffer **buffer, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count, uint32_t input_lwe_ciphertext_count, uint32_t grouping_factor, - uint32_t max_shared_memory, bool allocate_gpu_memory, - uint32_t lwe_chunk_size = 0) { + bool allocate_gpu_memory, uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); bool supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< @@ -254,9 +254,8 @@ __host__ void scratch_tbc_multi_bit_programmable_bootstrap( } if (!lwe_chunk_size) - lwe_chunk_size = - get_lwe_chunk_size(gpu_index, input_lwe_ciphertext_count, - polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size( + gpu_index, input_lwe_ciphertext_count, polynomial_size); *buffer = new pbs_buffer( stream, gpu_index, glwe_dimension, polynomial_size, level_count, input_lwe_ciphertext_count, lwe_chunk_size, PBS_VARIANT::TBC, @@ -271,9 +270,10 @@ __host__ void execute_tbc_external_product_loop( pbs_buffer *buffer, uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, - uint32_t lwe_chunk_size, uint32_t max_shared_memory, int lwe_offset) { + uint32_t lwe_chunk_size, int lwe_offset) { cudaSetDevice(gpu_index); + int max_shared_memory = cuda_get_max_shared_memory(0); auto supports_dsm = supports_distributed_shared_memory_on_multibit_programmable_bootstrap< Torus>(polynomial_size, max_shared_memory); @@ -365,12 +365,13 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( pbs_buffer *buffer, uint32_t glwe_dimension, uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t grouping_factor, uint32_t base_log, uint32_t level_count, uint32_t num_samples, - uint32_t max_shared_memory, uint32_t lwe_chunk_size = 0) { + uint32_t lwe_chunk_size = 0) { cudaSetDevice(gpu_index); + int max_shared_memory = 
cuda_get_max_shared_memory(0); if (!lwe_chunk_size) - lwe_chunk_size = get_lwe_chunk_size( - gpu_index, num_samples, polynomial_size, max_shared_memory); + lwe_chunk_size = get_lwe_chunk_size(gpu_index, num_samples, + polynomial_size); for (uint32_t lwe_offset = 0; lwe_offset < (lwe_dimension / grouping_factor); lwe_offset += lwe_chunk_size) { @@ -379,16 +380,14 @@ __host__ void host_tbc_multi_bit_programmable_bootstrap( execute_compute_keybundle( stream, gpu_index, lwe_array_in, lwe_input_indexes, bootstrapping_key, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, max_shared_memory, - lwe_chunk_size, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); // Accumulate execute_tbc_external_product_loop( stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in, lwe_input_indexes, lwe_array_out, lwe_output_indexes, buffer, num_samples, lwe_dimension, glwe_dimension, polynomial_size, - grouping_factor, base_log, level_count, lwe_chunk_size, - max_shared_memory, lwe_offset); + grouping_factor, base_log, level_count, lwe_chunk_size, lwe_offset); } } diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp index 6fc06098da..86cf18700d 100644 --- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp +++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp @@ -174,7 +174,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) (benchmark::State &st) { if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit( input_lwe_ciphertext_count, glwe_dimension, polynomial_size, - pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) { + pbs_level, cuda_get_max_shared_memory(gpu_index))) { st.SkipWithError("Configuration not supported for tbc operation"); return; } @@ -182,8 +182,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) scratch_cuda_tbc_multi_bit_programmable_bootstrap( stream, (pbs_buffer **)&buffer, lwe_dimension, glwe_dimension, polynomial_size, pbs_level, grouping_factor, - input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index), - true, chunk_size); + input_lwe_ciphertext_count, true, chunk_size); for (auto _ : st) { // Execute PBS @@ -192,8 +191,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit) d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, pbs_base_log, - pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(stream->gpu_index), chunk_size); + pbs_level, input_lwe_ciphertext_count, chunk_size); cuda_synchronize_stream(stream); } @@ -213,7 +211,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) scratch_cuda_cg_multi_bit_programmable_bootstrap( stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension, polynomial_size, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), true, chunk_size); + true, chunk_size); for (auto _ : st) { // Execute PBS @@ -222,8 +220,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit) d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer, lwe_dimension, glwe_dimension, polynomial_size, grouping_factor, - pbs_base_log, pbs_level, input_lwe_ciphertext_count, - cuda_get_max_shared_memory(gpu_index), chunk_size); + pbs_base_log, 
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
index 6fc06098da..86cf18700d 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/benchmarks/benchmark_pbs.cpp
@@ -174,7 +174,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
 (benchmark::State &st) {
   if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
           input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
-          pbs_level, cuda_get_max_shared_memory(stream->gpu_index))) {
+          pbs_level, cuda_get_max_shared_memory(gpu_index))) {
     st.SkipWithError("Configuration not supported for tbc operation");
     return;
   }
@@ -182,8 +182,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
   scratch_cuda_tbc_multi_bit_programmable_bootstrap(
       stream, (pbs_buffer **)&buffer, lwe_dimension, glwe_dimension,
       polynomial_size, pbs_level, grouping_factor,
-      input_lwe_ciphertext_count, cuda_get_max_shared_memory(stream->gpu_index),
-      true, chunk_size);
+      input_lwe_ciphertext_count, true, chunk_size);
   for (auto _ : st) {
     // Execute PBS
@@ -192,8 +191,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, TbcMultiBit)
         d_lut_pbs_indexes, d_lwe_ct_in_array, d_lwe_input_indexes, d_bsk,
         (pbs_buffer *)buffer, lwe_dimension, glwe_dimension,
         polynomial_size, grouping_factor, pbs_base_log,
-        pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(stream->gpu_index), chunk_size);
+        pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream);
   }
@@ -213,7 +211,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
   scratch_cuda_cg_multi_bit_programmable_bootstrap(
       stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension,
       polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true, chunk_size);
+      true, chunk_size);
   for (auto _ : st) {
     // Execute PBS
@@ -222,8 +220,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, CgMultiBit)
         d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
         d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(gpu_index), chunk_size);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream, gpu_index);
   }
@@ -235,8 +232,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
   scratch_cuda_multi_bit_programmable_bootstrap(
       stream, gpu_index, (pbs_buffer **)&buffer, lwe_dimension,
       glwe_dimension, polynomial_size, pbs_level,
-      grouping_factor, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true, chunk_size);
+      grouping_factor, input_lwe_ciphertext_count, true, chunk_size);
   for (auto _ : st) {
     // Execute PBS
@@ -245,8 +241,7 @@ BENCHMARK_DEFINE_F(MultiBitBootstrap_u64, DefaultMultiBit)
         d_lut_pbs_identity, d_lut_pbs_indexes, d_lwe_ct_in_array,
         d_lwe_input_indexes, d_bsk, (pbs_buffer *)buffer,
         lwe_dimension, glwe_dimension, polynomial_size, grouping_factor,
-        pbs_base_log, pbs_level, input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(gpu_index), chunk_size);
+        pbs_base_log, pbs_level, input_lwe_ciphertext_count, chunk_size);
     cuda_synchronize_stream(stream, gpu_index);
   }
@@ -265,8 +260,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
   scratch_cuda_programmable_bootstrap_tbc(
       stream, (pbs_buffer **)&buffer, glwe_dimension,
-      polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(stream->gpu_index), true);
+      polynomial_size, pbs_level, input_lwe_ciphertext_count, true);
   for (auto _ : st) {
     // Execute PBS
@@ -277,8 +271,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, TbcPBC)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer *)buffer, lwe_dimension, glwe_dimension,
         polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count,
-        cuda_get_max_shared_memory(stream->gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream);
   }
@@ -298,7 +291,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
   scratch_cuda_programmable_bootstrap_cg(
       stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension,
       polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true);
+      true);
   for (auto _ : st) {
     // Execute PBS
@@ -309,7 +302,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, CgPBS)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer *)buffer, lwe_dimension, glwe_dimension,
         polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
@@ -322,7 +315,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
   scratch_cuda_programmable_bootstrap(
       stream, gpu_index, (pbs_buffer **)&buffer, glwe_dimension,
       polynomial_size, pbs_level, input_lwe_ciphertext_count,
-      cuda_get_max_shared_memory(gpu_index), true);
+      true);
   for (auto _ : st) {
     // Execute PBS
@@ -333,7 +326,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, DefaultPBS)
         (uint64_t *)d_lwe_input_indexes, (double2 *)d_fourier_bsk,
         (pbs_buffer *)buffer, lwe_dimension, glwe_dimension,
         polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
@@ -345,7 +338,7 @@ BENCHMARK_DEFINE_F(ClassicalBootstrap_u64, AmortizedPBS)
   scratch_cuda_programmable_bootstrap_amortized_64(
       stream, gpu_index, &buffer, glwe_dimension, polynomial_size,
-      input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index), true);
+      input_lwe_ciphertext_count, true);
   for (auto _ : st) {
     // Execute PBS
@@ -355,7 +348,7 @@
         (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in_array,
         (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, buffer,
         lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log, pbs_level,
-        input_lwe_ciphertext_count, cuda_get_max_shared_memory(gpu_index));
+        input_lwe_ciphertext_count);
     cuda_synchronize_stream(stream, gpu_index);
   }
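One call in the benchmark keeps an explicit shared-memory argument: the TBC support check. Code that still needs the device limit now queries it directly instead of having it threaded through every PBS entry point. A hedged sketch of that pattern, assuming gpu_index and the PBS parameters are set up by the surrounding fixture as in the benchmark above:

    // Sketch: the variables are assumed to come from the benchmark/test
    // fixture; only the query-then-check pattern is the point here.
    int max_shared_memory = cuda_get_max_shared_memory(gpu_index);
    if (!has_support_to_cuda_programmable_bootstrap_tbc_multi_bit(
            input_lwe_ciphertext_count, glwe_dimension, polynomial_size,
            pbs_level, max_shared_memory)) {
      // Fall back to the cg or default multi-bit variant instead.
    }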
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
index f084244d04..cc6b11ba37 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_classical_pbs.cpp
@@ -107,7 +107,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
   int8_t *pbs_buffer;
   scratch_cuda_programmable_bootstrap_amortized_64(
       stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
+      number_of_inputs, true);
   int bsk_size = (glwe_dimension + 1) * (glwe_dimension + 1) * pbs_level *
                  polynomial_size * (lwe_dimension + 1);
@@ -128,7 +128,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
       (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
       (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
       lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
+      pbs_level, number_of_inputs);
   // Copy result back
   cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                            (glwe_dimension * polynomial_size + 1) *
@@ -165,9 +165,9 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, amortized_bootstrap) {
 TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
   int8_t *pbs_buffer;
-  scratch_cuda_programmable_bootstrap_64(
-      stream, gpu_index, &pbs_buffer, glwe_dimension, polynomial_size,
-      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index), true);
+  scratch_cuda_programmable_bootstrap_64(stream, gpu_index, &pbs_buffer,
+                                         glwe_dimension, polynomial_size,
+                                         pbs_level, number_of_inputs, true);
   int number_of_sm = 0;
   cudaDeviceGetAttribute(&number_of_sm, cudaDevAttrMultiProcessorCount, 0);
@@ -190,7 +190,7 @@ TEST_P(ClassicalProgrammableBootstrapTestPrimitives_u64, bootstrap) {
       (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
       (void *)d_lwe_input_indexes, (void *)d_fourier_bsk, pbs_buffer,
       lwe_dimension, glwe_dimension, polynomial_size, pbs_base_log,
-      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index));
+      pbs_level, number_of_inputs);
   // Copy result back
   cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
                            (glwe_dimension * polynomial_size + 1) *
diff --git a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
index b116a777e4..82e3bbb193 100644
--- a/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
+++ b/backends/tfhe-cuda-backend/cuda/tests_and_benchmarks/tests/test_multibit_pbs.cpp
@@ -93,8 +93,7 @@ class MultiBitProgrammableBootstrapTestPrimitives_u64
     scratch_cuda_multi_bit_programmable_bootstrap_64(
         stream, gpu_index, &pbs_buffer, lwe_dimension, glwe_dimension,
-        polynomial_size, pbs_level, grouping_factor, number_of_inputs,
-        cuda_get_max_shared_memory(gpu_index), true);
+        polynomial_size, pbs_level, grouping_factor, number_of_inputs, true);
     lwe_ct_out_array =
         (uint64_t *)malloc((glwe_dimension * polynomial_size + 1) *
@@ -136,8 +135,7 @@ TEST_P(MultiBitProgrammableBootstrapTestPrimitives_u64,
       (void *)d_lut_pbs_indexes, (void *)d_lwe_ct_in,
       (void *)d_lwe_input_indexes, (void *)d_bsk, pbs_buffer, lwe_dimension,
       glwe_dimension, polynomial_size, grouping_factor, pbs_base_log,
-      pbs_level, number_of_inputs, cuda_get_max_shared_memory(gpu_index),
-      0);
+      pbs_level, number_of_inputs, 0);
   // Copy result to the host memory
   cuda_memcpy_async_to_cpu(lwe_ct_out_array, d_lwe_ct_out_array,
diff --git a/backends/tfhe-cuda-backend/src/cuda_bind.rs b/backends/tfhe-cuda-backend/src/cuda_bind.rs
index 8d84b14a63..0865caf222 100644
--- a/backends/tfhe-cuda-backend/src/cuda_bind.rs
+++ b/backends/tfhe-cuda-backend/src/cuda_bind.rs
@@ -136,7 +136,6 @@ extern "C" {
         polynomial_size: u32,
         level_count: u32,
         input_lwe_ciphertext_count: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
     );
@@ -168,7 +167,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
    /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     ///
     /// This function calls a wrapper to a device kernel that performs the
     /// bootstrapping:
@@ -205,7 +203,6 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        max_shared_memory: u32,
     );
     /// This cleanup function frees the data for the low latency PBS on GPU
@@ -228,7 +225,6 @@ extern "C" {
         level_count: u32,
         grouping_factor: u32,
         input_lwe_ciphertext_count: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
         lwe_chunk_size: u32,
     );
@@ -259,7 +255,6 @@ extern "C" {
     /// - `base_log`: log base used for the gadget matrix - B = 2^base_log (~8)
     /// - `level_count`: number of decomposition levels in the gadget matrix (~4)
     /// - `num_samples`: number of encrypted input messages
-    /// - `max_shared_memory` maximum amount of shared memory to be used inside device functions
     pub fn cuda_multi_bit_programmable_bootstrap_lwe_ciphertext_vector_64(
         stream: *mut c_void,
         gpu_index: u32,
@@ -278,7 +273,6 @@ extern "C" {
         base_log: u32,
         level: u32,
         num_samples: u32,
-        max_shared_memory: u32,
         lwe_chunk_size: u32,
     );
@@ -471,7 +465,6 @@ extern "C" {
         grouping_factor: u32,
         num_blocks: u32,
         pbs_type: u32,
-        max_shared_memory: u32,
         allocate_gpu_memory: bool,
     );
diff --git a/tfhe/src/core_crypto/gpu/mod.rs b/tfhe/src/core_crypto/gpu/mod.rs
index 709da6fca4..b7b115dae1 100644
--- a/tfhe/src/core_crypto/gpu/mod.rs
+++ b/tfhe/src/core_crypto/gpu/mod.rs
@@ -119,7 +119,6 @@ pub unsafe fn programmable_bootstrap_async(
         polynomial_size.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         true,
     );
     cuda_programmable_bootstrap_lwe_ciphertext_vector_64(
@@ -139,7 +138,6 @@ pub unsafe fn programmable_bootstrap_async(
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(streams.gpu_indexes[0]) as u32,
     );
     cleanup_cuda_programmable_bootstrap(
         streams.ptr[0],
@@ -183,7 +181,6 @@ pub unsafe fn programmable_bootstrap_multi_bit_async(
         level.0 as u32,
         grouping_factor.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         true,
         0u32,
     );
@@ -205,7 +202,6 @@
         base_log.0 as u32,
         level.0 as u32,
         num_samples,
-        get_max_shared_memory(0) as u32,
         0,
     );
     cleanup_cuda_multi_bit_programmable_bootstrap(
diff --git a/tfhe/src/integer/gpu/mod.rs b/tfhe/src/integer/gpu/mod.rs
index 654ddce257..553b18d1a3 100644
--- a/tfhe/src/integer/gpu/mod.rs
+++ b/tfhe/src/integer/gpu/mod.rs
@@ -3,7 +3,7 @@ pub mod server_key;
 use crate::core_crypto::gpu::slice::{CudaSlice, CudaSliceMut};
 use crate::core_crypto::gpu::vec::CudaVec;
-use crate::core_crypto::gpu::{get_max_shared_memory, CudaStreams};
+use crate::core_crypto::gpu::CudaStreams;
 use crate::core_crypto::prelude::{
     DecompositionBaseLog, DecompositionLevelCount, GlweDimension, LweBskGroupingFactor,
     LweDimension, Numeric, PolynomialSize, UnsignedInteger,
@@ -368,7 +368,6 @@ pub unsafe fn unchecked_mul_integer_radix_kb_assign_async