Skip to content

Commit

Permalink
feat(gpu): implement ilog2, trailing and leading zeros and ones on GPU
Browse files Browse the repository at this point in the history
  • Loading branch information
guillermo-oyarzun committed Aug 9, 2024
1 parent 5340859 commit 267ddd8
Show file tree
Hide file tree
Showing 25 changed files with 2,763 additions and 1,111 deletions.
35 changes: 29 additions & 6 deletions backends/tfhe-cuda-backend/cuda/include/integer.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,7 +283,7 @@ void cleanup_cuda_propagate_single_carry(void **streams, uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
Expand All @@ -292,15 +292,14 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
uint32_t message_modulus, uint32_t carry_modulus, PBS_TYPE pbs_type,
bool allocate_gpu_memory);

void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix);

void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_radix_overflowing_sub_kb_64(
void **stream, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
Expand Down Expand Up @@ -375,6 +374,30 @@ void cleanup_signed_overflowing_add_or_sub(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void);

void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
PBS_TYPE pbs_type, bool allocate_gpu_memory);

void cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
void **bsks, uint32_t num_blocks, uint32_t shift);

void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void);

void cuda_integer_reverse_blocks_64_inplace(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count, void *lwe_array,
uint32_t num_blocks,
uint32_t lwe_size);

} // extern C

template <typename Torus>
Expand Down
52 changes: 52 additions & 0 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,3 +173,55 @@ void cleanup_cuda_apply_bivariate_lut_kb_64(void **streams,
int_radix_lut<uint64_t> *mem_ptr = (int_radix_lut<uint64_t> *)(*mem_ptr_void);
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

/// Allocates and initializes the scratch memory (LUT buffers) needed by
/// cuda_integer_compute_prefix_sum_hillis_steele_64.
/// On return, *mem_ptr points at an int_radix_lut<uint64_t> built from
/// `input_lut`; free it with the matching cleanup_* function.
void scratch_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
    void *input_lut, uint32_t lwe_dimension, uint32_t glwe_dimension,
    uint32_t polynomial_size, uint32_t ks_level, uint32_t ks_base_log,
    uint32_t pbs_level, uint32_t pbs_base_log, uint32_t grouping_factor,
    uint32_t num_radix_blocks, uint32_t message_modulus, uint32_t carry_modulus,
    PBS_TYPE pbs_type, bool allocate_gpu_memory) {

  // The big-LWE dimension is derived from the GLWE parameters.
  uint32_t big_lwe_dimension = glwe_dimension * polynomial_size;
  int_radix_params params(pbs_type, glwe_dimension, polynomial_size,
                          big_lwe_dimension, lwe_dimension, ks_level,
                          ks_base_log, pbs_level, pbs_base_log, grouping_factor,
                          message_modulus, carry_modulus);

  auto typed_streams = (cudaStream_t *)(streams);
  auto lut_mem = (int_radix_lut<uint64_t> **)mem_ptr;
  auto lut_values = static_cast<uint64_t *>(input_lut);

  // Each Hillis-Steele step applies a bivariate LUT, so the scratch layout
  // is exactly the one used by the generic bivariate-LUT machinery.
  scratch_cuda_apply_bivariate_lut_kb<uint64_t>(
      typed_streams, gpu_indexes, gpu_count, lut_mem, lut_values,
      num_radix_blocks, params, allocate_gpu_memory);
}

/// Computes an inclusive prefix sum over the radix blocks of
/// `input_radix_lwe` using the Hillis-Steele scan, writing the result into
/// `output_radix_lwe`. `mem_ptr` must come from the matching scratch_* call.
void cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    void *output_radix_lwe, void *input_radix_lwe, int8_t *mem_ptr, void **ksks,
    void **bsks, uint32_t num_blocks, uint32_t shift) {

  // NOTE(review): `shift` is part of the FFI signature but is not forwarded
  // to the host implementation - confirm whether it can be dropped on both
  // sides of the interface.
  auto lut = (int_radix_lut<uint64_t> *)mem_ptr;

  host_compute_prefix_sum_hillis_steele<uint64_t>(
      (cudaStream_t *)(streams), gpu_indexes, gpu_count,
      static_cast<uint64_t *>(output_radix_lwe),
      static_cast<uint64_t *>(input_radix_lwe), lut->params, lut, bsks,
      (uint64_t **)(ksks), num_blocks);
}

/// Releases the LUT scratch memory allocated by
/// scratch_cuda_integer_compute_prefix_sum_hillis_steele_64.
void cleanup_cuda_integer_compute_prefix_sum_hillis_steele_64(
    void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
    int8_t **mem_ptr_void) {
  auto lut = reinterpret_cast<int_radix_lut<uint64_t> *>(*mem_ptr_void);
  lut->release(reinterpret_cast<cudaStream_t *>(streams), gpu_indexes,
               gpu_count);
}

/// Reverses, in place, the order of the `num_blocks` radix blocks stored
/// contiguously in `lwe_array` (each block is `lwe_size` u64 coefficients).
void cuda_integer_reverse_blocks_64_inplace(void **streams,
                                            uint32_t *gpu_indexes,
                                            uint32_t gpu_count, void *lwe_array,
                                            uint32_t num_blocks,
                                            uint32_t lwe_size) {

  // The reversal runs entirely on the first stream/GPU; gpu_count is not
  // needed by the host implementation.
  auto blocks = static_cast<uint64_t *>(lwe_array);
  host_radix_blocks_reverse_inplace<uint64_t>((cudaStream_t *)(streams),
                                              gpu_indexes, blocks, num_blocks,
                                              lwe_size);
}
114 changes: 68 additions & 46 deletions backends/tfhe-cuda-backend/cuda/src/integer/integer.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,35 @@ host_radix_blocks_rotate_left(cudaStream_t *streams, uint32_t *gpu_indexes,
dst, src, value, blocks_count, lwe_size);
}

// Reverses a list of radix blocks in place.
// Each CUDA block swaps one pair of radix blocks: block `blockIdx.x` with
// its mirror `blocks_count - 1 - blockIdx.x`; the blockDim.x threads of a
// CUDA block cooperatively copy the `lwe_size` coefficients of the pair.
// Expected launch: gridDim.x == blocks_count / 2 (the middle block of an
// odd count needs no swap).
template <typename Torus>
__global__ void radix_blocks_reverse_lwe_inplace(Torus *src,
                                                 uint32_t blocks_count,
                                                 uint32_t lwe_size) {

  size_t idx = blockIdx.x;
  size_t rev_idx = blocks_count - 1 - idx;

  // Guard against oversized grids: only indices in the first half perform a
  // swap. Without this, idx >= rev_idx would swap pairs a second time (or a
  // block with itself past the midpoint) and corrupt the reversal.
  if (idx >= rev_idx)
    return;

  for (int j = threadIdx.x; j < lwe_size; j += blockDim.x) {
    Torus back_element = src[rev_idx * lwe_size + j];
    Torus front_element = src[idx * lwe_size + j];
    src[idx * lwe_size + j] = back_element;
    src[rev_idx * lwe_size + j] = front_element;
  }
}

// Host wrapper: reverses the order of `blocks_count` radix blocks (each of
// `lwe_size` Torus elements) stored contiguously at `src`, in place, on the
// first GPU/stream. Asynchronous with respect to the host (enqueued on
// streams[0]).
template <typename Torus>
__host__ void
host_radix_blocks_reverse_inplace(cudaStream_t *streams, uint32_t *gpu_indexes,
                                  Torus *src, uint32_t blocks_count,
                                  uint32_t lwe_size) {
  // Nothing to reverse for 0 or 1 blocks. Without this guard the grid size
  // below would be 0 and the launch would fail with
  // cudaErrorInvalidConfiguration.
  if (blocks_count < 2)
    return;
  cudaSetDevice(gpu_indexes[0]);
  // One CUDA block per swapped pair; the middle block of an odd count is
  // already in place and gets no CUDA block.
  int num_blocks = blocks_count / 2, num_threads = 1024;
  radix_blocks_reverse_lwe_inplace<<<num_blocks, num_threads, 0, streams[0]>>>(
      src, blocks_count, lwe_size);
}

// polynomial_size threads
template <typename Torus>
__global__ void
Expand Down Expand Up @@ -501,30 +530,17 @@ void scratch_cuda_propagate_single_carry_kb_inplace(
}

template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out, Torus *input_carries,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
auto params = mem->params;
void host_compute_prefix_sum_hillis_steele(
cudaStream_t *streams, uint32_t *gpu_indexes, uint32_t gpu_count,
Torus *step_output, Torus *generates_or_propagates, int_radix_params params,
int_radix_lut<Torus> *luts, void **bsks, Torus **ksks,
uint32_t num_blocks) {

auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;

auto luts_array = mem->luts_array;
auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
auto message_acc = mem->message_acc;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
ksks, num_blocks, luts_array);

// compute prefix sum with hillis&steele

int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
Expand All @@ -541,15 +557,42 @@ void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
luts_carry_propagation_sum->params.message_modulus);
bsks, ksks, cur_total_blocks, luts, luts->params.message_modulus);

cuda_synchronize_stream(streams[0], gpu_indexes[0]);
cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
space *= 2;
}
}

template <typename Torus>
void host_propagate_single_carry(cudaStream_t *streams, uint32_t *gpu_indexes,
uint32_t gpu_count, Torus *lwe_array,
Torus *carry_out, Torus *input_carries,
int_sc_prop_memory<Torus> *mem, void **bsks,
Torus **ksks, uint32_t num_blocks) {
auto params = mem->params;
auto glwe_dimension = params.glwe_dimension;
auto polynomial_size = params.polynomial_size;
auto big_lwe_size = glwe_dimension * polynomial_size + 1;
auto big_lwe_size_bytes = big_lwe_size * sizeof(Torus);

auto generates_or_propagates = mem->generates_or_propagates;
auto step_output = mem->step_output;

auto luts_array = mem->luts_array;
auto luts_carry_propagation_sum = mem->luts_carry_propagation_sum;
auto message_acc = mem->message_acc;

integer_radix_apply_univariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, generates_or_propagates, lwe_array, bsks,
ksks, num_blocks, luts_array);

// compute prefix sum with hillis&steele
host_compute_prefix_sum_hillis_steele(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

host_radix_blocks_rotate_right(streams, gpu_indexes, gpu_count, step_output,
generates_or_propagates, 1, num_blocks,
Expand Down Expand Up @@ -613,30 +656,9 @@ void host_propagate_single_sub_borrow(cudaStream_t *streams,
ksks, num_blocks, luts_array);

// compute prefix sum with hillis&steele
int num_steps = ceil(log2((double)num_blocks));
int space = 1;
cuda_memcpy_async_gpu_to_gpu(step_output, generates_or_propagates,
big_lwe_size_bytes * num_blocks, streams[0],
gpu_indexes[0]);

for (int step = 0; step < num_steps; step++) {
if (space > num_blocks - 1)
PANIC("Cuda error: step output is going out of bounds in Hillis Steele "
"propagation")
auto cur_blocks = &step_output[space * big_lwe_size];
auto prev_blocks = generates_or_propagates;
int cur_total_blocks = num_blocks - space;

integer_radix_apply_bivariate_lookup_table_kb<Torus>(
streams, gpu_indexes, gpu_count, cur_blocks, cur_blocks, prev_blocks,
bsks, ksks, cur_total_blocks, luts_carry_propagation_sum,
luts_carry_propagation_sum->params.message_modulus);

cuda_memcpy_async_gpu_to_gpu(
&generates_or_propagates[space * big_lwe_size], cur_blocks,
big_lwe_size_bytes * cur_total_blocks, streams[0], gpu_indexes[0]);
space *= 2;
}
host_compute_prefix_sum_hillis_steele<Torus>(
streams, gpu_indexes, gpu_count, step_output, generates_or_propagates,
params, luts_carry_propagation_sum, bsks, ksks, num_blocks);

cuda_memcpy_async_gpu_to_gpu(
overflowed, &generates_or_propagates[big_lwe_size * (num_blocks - 1)],
Expand Down
30 changes: 17 additions & 13 deletions backends/tfhe-cuda-backend/cuda/src/integer/multiplication.cu
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,7 @@ void cleanup_cuda_integer_mult(void **streams, uint32_t *gpu_indexes,
mem_ptr->release((cudaStream_t *)(streams), gpu_indexes, gpu_count);
}

void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void scratch_cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count, int8_t **mem_ptr,
uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t lwe_dimension,
uint32_t ks_level, uint32_t ks_base_log, uint32_t pbs_level,
Expand All @@ -215,13 +215,13 @@ void scratch_cuda_integer_radix_sum_ciphertexts_vec_kb_64(
glwe_dimension * polynomial_size, lwe_dimension,
ks_level, ks_base_log, pbs_level, pbs_base_log,
grouping_factor, message_modulus, carry_modulus);
scratch_cuda_integer_sum_ciphertexts_vec_kb<uint64_t>(
scratch_cuda_integer_partial_sum_ciphertexts_vec_kb<uint64_t>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
(int_sum_ciphertexts_vec_memory<uint64_t> **)mem_ptr, num_blocks_in_radix,
max_num_radix_in_vec, params, allocate_gpu_memory);
}

void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
void cuda_integer_radix_partial_sum_ciphertexts_vec_kb_64(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
void *radix_lwe_out, void *radix_lwe_vec, uint32_t num_radix_in_vec,
int8_t *mem_ptr, void **bsks, void **ksks, uint32_t num_blocks_in_radix) {
Expand All @@ -237,42 +237,47 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(

switch (mem->params.polynomial_size) {
case 512:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<512>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 1024:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<1024>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<1024>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 2048:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<2048>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<2048>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 4096:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<4096>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<4096>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 8192:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<8192>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<8192>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
(uint64_t **)(ksks), mem, num_blocks_in_radix, num_radix_in_vec);
break;
case 16384:
host_integer_sum_ciphertexts_vec_kb<uint64_t, AmortizedDegree<16384>>(
host_integer_partial_sum_ciphertexts_vec_kb<uint64_t,
AmortizedDegree<16384>>(
(cudaStream_t *)(streams), gpu_indexes, gpu_count,
static_cast<uint64_t *>(radix_lwe_out),
static_cast<uint64_t *>(radix_lwe_vec), terms_degree, bsks,
Expand All @@ -286,10 +291,9 @@ void cuda_integer_radix_sum_ciphertexts_vec_kb_64(
free(terms_degree);
}

void cleanup_cuda_integer_radix_sum_ciphertexts_vec(void **streams,
uint32_t *gpu_indexes,
uint32_t gpu_count,
int8_t **mem_ptr_void) {
void cleanup_cuda_integer_radix_partial_sum_ciphertexts_vec(
void **streams, uint32_t *gpu_indexes, uint32_t gpu_count,
int8_t **mem_ptr_void) {
int_sum_ciphertexts_vec_memory<uint64_t> *mem_ptr =
(int_sum_ciphertexts_vec_memory<uint64_t> *)(*mem_ptr_void);

Expand Down
Loading

0 comments on commit 267ddd8

Please sign in to comment.