chore(gpu): return if chunk_size is 0
agnesLeroy committed Sep 12, 2024
1 parent 9dca245 commit 8314e7d
Showing 6 changed files with 22 additions and 57 deletions.
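
The change itself is small: the multi-bit PBS processes the LWE key in chunks of at most lwe_chunk_size out of lwe_dimension / grouping_factor in total, computing each call's chunk_size as a min against what remains past lwe_offset. At the boundary that min comes out to 0, and the execute_* loops below now return early instead of continuing with zero-sized work; lwe_offset and the loop index j also change from int to uint32_t, and two unused get_buffer_size_* helpers are dropped. A self-contained sketch of the chunk arithmetic, with hypothetical parameter values chosen only to hit the boundary (compute_chunk_size is an illustrative helper, not a repo function):

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Mirrors the chunk-size computation that appears in each hunk below.
static uint32_t compute_chunk_size(uint32_t lwe_chunk_size,
                                   uint32_t lwe_dimension,
                                   uint32_t grouping_factor,
                                   uint32_t lwe_offset) {
  return std::min(lwe_chunk_size,
                  (lwe_dimension / grouping_factor) - lwe_offset);
}

int main() {
  // Hypothetical parameters: 512 / 2 = 256 key chunks, visited 64 at a time.
  const uint32_t lwe_dimension = 512, grouping_factor = 2, lwe_chunk_size = 64;

  // Interior offset: a full chunk is available.
  std::printf("%u\n", compute_chunk_size(lwe_chunk_size, lwe_dimension,
                                         grouping_factor, 192)); // prints 64
  // Boundary offset: nothing remains; this is the 0 the new guards catch.
  std::printf("%u\n", compute_chunk_size(lwe_chunk_size, lwe_dimension,
                                         grouping_factor, 256)); // prints 0
  // Past the boundary the uint32_t subtraction would wrap, so callers still
  // must keep lwe_offset <= lwe_dimension / grouping_factor.
  return 0;
}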
@@ -81,14 +81,6 @@ void cuda_programmable_bootstrap_lwe_ciphertext_vector_64(

 void cleanup_cuda_programmable_bootstrap(void *stream, uint32_t gpu_index,
                                          int8_t **pbs_buffer);
-
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count);
-
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count);
 }

 template <typename Torus>
@@ -1,15 +1,5 @@
 #include "programmable_bootstrap_amortized.cuh"

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_amortized_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size,
-    uint32_t input_lwe_ciphertext_count) {
-  return get_buffer_size_programmable_bootstrap_amortized<uint64_t>(
-      glwe_dimension, polynomial_size, input_lwe_ciphertext_count);
-}
-
 /*
  * This scratch function allocates the necessary amount of data on the GPU for
  * the amortized PBS on 32 bits inputs, into `buffer`. It also
@@ -256,7 +256,7 @@ __host__ void execute_cg_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {

   uint64_t full_dm =
       get_buffer_size_full_sm_cg_multibit_programmable_bootstrap<Torus>(
@@ -275,6 +275,8 @@

   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

   auto d_mem = buffer->d_mem_acc_cg;
   auto keybundle_fft = buffer->keybundle_fft;
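
The same two-line guard lands in each of the product/keybundle loops. The commit message does not spell out the failure mode; one concrete hazard of omitting it, offered here as a reading rather than a statement from the source, is that a CUDA kernel launched with a zero grid dimension is rejected at launch time. A minimal sketch with a hypothetical no-op kernel:

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void noop() {} // hypothetical stand-in for the real PBS kernels

int main() {
  uint32_t chunk_size = 0; // the degenerate case the new guard filters out

  noop<<<chunk_size, 32>>>(); // grid dimension 0: the launch is rejected
  std::printf("%s\n", cudaGetErrorString(cudaGetLastError()));
  // prints "invalid configuration argument"

  if (chunk_size == 0) // with the guard in place, no 0-sized launch happens
    return 0;
  noop<<<chunk_size, 32>>>(); // never reached in this sketch
  return 0;
}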
@@ -182,25 +182,6 @@ void cuda_programmable_bootstrap_tbc_lwe_ciphertext_vector(
 }
 #endif

-/*
- * Returns the buffer size for 64 bits executions
- */
-uint64_t get_buffer_size_programmable_bootstrap_64(
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t input_lwe_ciphertext_count) {
-
-  if (has_support_to_cuda_programmable_bootstrap_cg<uint64_t>(
-          glwe_dimension, polynomial_size, level_count,
-          input_lwe_ciphertext_count))
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-  else
-    return get_buffer_size_programmable_bootstrap_cg<uint64_t>(
-        glwe_dimension, polynomial_size, level_count,
-        input_lwe_ciphertext_count);
-}
-
 template <typename Torus>
 void scratch_cuda_programmable_bootstrap_cg(
     void *stream, uint32_t gpu_index, pbs_buffer<Torus, CLASSICAL> **pbs_buffer,
@@ -465,10 +465,12 @@ __host__ void execute_compute_keybundle(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {

   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

   uint32_t keybundle_size_per_input =
       lwe_chunk_size * level_count * (glwe_dimension + 1) *
@@ -506,14 +508,12 @@
 }

 template <typename Torus, class params>
-__host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lut_vector, Torus *lut_vector_indexes,
-                               Torus *lwe_array_in, Torus *lwe_input_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size, uint32_t base_log,
-                               uint32_t level_count, int j, int lwe_offset) {
+__host__ void execute_step_one(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lut_vector,
+    Torus *lut_vector_indexes, Torus *lwe_array_in, Torus *lwe_input_indexes,
+    pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
+    uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
+    uint32_t base_log, uint32_t level_count, uint32_t j, uint32_t lwe_offset) {

   uint64_t full_sm_accumulate_step_one =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_one<Torus>(
@@ -562,14 +562,12 @@ __host__ void execute_step_one(cudaStream_t stream, uint32_t gpu_index,
 }

 template <typename Torus, class params>
-__host__ void execute_step_two(cudaStream_t stream, uint32_t gpu_index,
-                               Torus *lwe_array_out, Torus *lwe_output_indexes,
-                               pbs_buffer<Torus, MULTI_BIT> *buffer,
-                               uint32_t num_samples, uint32_t lwe_dimension,
-                               uint32_t glwe_dimension,
-                               uint32_t polynomial_size,
-                               int32_t grouping_factor, uint32_t level_count,
-                               int j, int lwe_offset, uint32_t lwe_chunk_size) {
+__host__ void execute_step_two(
+    cudaStream_t stream, uint32_t gpu_index, Torus *lwe_array_out,
+    Torus *lwe_output_indexes, pbs_buffer<Torus, MULTI_BIT> *buffer,
+    uint32_t num_samples, uint32_t lwe_dimension, uint32_t glwe_dimension,
+    uint32_t polynomial_size, int32_t grouping_factor, uint32_t level_count,
+    uint32_t j, uint32_t lwe_offset, uint32_t lwe_chunk_size) {

   uint64_t full_sm_accumulate_step_two =
       get_buffer_size_full_sm_multibit_programmable_bootstrap_step_two<Torus>(
@@ -627,7 +625,7 @@ __host__ void host_multi_bit_programmable_bootstrap(
     // Accumulate
     uint32_t chunk_size = std::min(
         lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
-    for (int j = 0; j < chunk_size; j++) {
+    for (uint32_t j = 0; j < chunk_size; j++) {
       execute_step_one<Torus, params>(
           stream, gpu_index, lut_vector, lut_vector_indexes, lwe_array_in,
           lwe_input_indexes, buffer, num_samples, lwe_dimension, glwe_dimension,
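
Alongside the guard, this file also moves j and lwe_offset from int to uint32_t so they match chunk_size. The commit does not state why; a plausible reading is type consistency plus silencing signed/unsigned comparison warnings, sketched below (illustrative code, not from the repo):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t chunk_size = 4;

  // Old style: int index against a uint32_t bound. It compiles, but j is
  // implicitly converted to unsigned for the test and GCC/Clang flag the
  // comparison under -Wsign-compare.
  for (int j = 0; j < chunk_size; j++)
    std::printf("signed   j=%d\n", j);

  // New style, as in this commit: the index type matches the bound.
  for (uint32_t j = 0; j < chunk_size; j++)
    std::printf("unsigned j=%u\n", j);
  return 0;
}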
@@ -267,7 +267,7 @@ __host__ void execute_tbc_external_product_loop(
     pbs_buffer<Torus, MULTI_BIT> *buffer, uint32_t num_samples,
     uint32_t lwe_dimension, uint32_t glwe_dimension, uint32_t polynomial_size,
     uint32_t grouping_factor, uint32_t base_log, uint32_t level_count,
-    uint32_t lwe_chunk_size, int lwe_offset) {
+    uint32_t lwe_chunk_size, uint32_t lwe_offset) {

   auto supports_dsm =
       supports_distributed_shared_memory_on_multibit_programmable_bootstrap<
@@ -294,6 +294,8 @@

   uint32_t chunk_size =
       std::min(lwe_chunk_size, (lwe_dimension / grouping_factor) - lwe_offset);
+  if (chunk_size == 0)
+    return;

   auto d_mem = buffer->d_mem_acc_tbc;
   auto keybundle_fft = buffer->keybundle_fft;
