refactor(gpu): Specify launch bounds on kernels #1456

Merged · 1 commit · Aug 5, 2024
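Every hunk below applies the same one-line pattern: the kernel gains a `__launch_bounds__(params::degree / params::opt)` qualifier, which promises the compiler that no launch will use more threads per block than that bound. The compiler can then budget registers against the declared bound instead of the 1024-thread worst case, which often reduces spilling and can raise occupancy (the qualifier also accepts an optional second argument, minBlocksPerMultiprocessor, not used here). A minimal sketch of the qualifier on a toy kernel; the kernel and launch below are illustrative, not from the PR:

#include <cuda_runtime.h>

constexpr int MAX_THREADS = 256; // stands in for params::degree / params::opt

// The qualifier goes between "__global__ void" and the kernel name,
// exactly as in the diff below.
__global__ void __launch_bounds__(MAX_THREADS) scale(float *out, int n) {
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    out[i] *= 2.0f;
}

int main() {
  float *out;
  int n = 1 << 20;
  cudaMalloc(&out, n * sizeof(float));
  // The bound is a contract: launching with a block larger than
  // MAX_THREADS fails with an invalid-configuration error.
  scale<<<(n + MAX_THREADS - 1) / MAX_THREADS, MAX_THREADS>>>(out, n);
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}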
@@ -18,18 +18,19 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_cg_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_cg_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   grid_group grid = this_grid();
 
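Note that this first kernel is cooperative (`grid_group grid = this_grid();` requires launching via `cudaLaunchCooperativeKernel`, with every block resident on the device at once). One plausible reason the bound matters here, beyond register budgeting: it feeds the occupancy query that typically caps a cooperative grid's size. A hedged, self-contained sketch of that launch pattern, not code from this repository:

#include <cooperative_groups.h>
#include <cuda_runtime.h>
namespace cg = cooperative_groups;

constexpr int BLOCK = 256; // stands in for params::degree / params::opt

__global__ void __launch_bounds__(BLOCK) coop_demo(int *data) {
  cg::grid_group grid = cg::this_grid();
  data[grid.thread_rank()] += 1;
  grid.sync(); // only valid under a cooperative launch
}

int main() {
  // A cooperative grid must fit on the device in one wave, so its size is
  // capped by an occupancy query; __launch_bounds__ influences the register
  // count and therefore the result of this query.
  int blocks_per_sm = 0, num_sms = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocks_per_sm, coop_demo,
                                                BLOCK, /*dynamicSmem=*/0);
  cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, 0);

  dim3 grid_dim(blocks_per_sm * num_sms), block_dim(BLOCK);
  int *data;
  cudaMalloc(&data, (size_t)grid_dim.x * BLOCK * sizeof(int));
  cudaMemset(data, 0, (size_t)grid_dim.x * BLOCK * sizeof(int));

  void *args[] = {&data};
  cudaLaunchCooperativeKernel((void *)coop_demo, grid_dim, block_dim, args,
                              0, 0);
  cudaDeviceSynchronize();
  cudaFree(data);
  return 0;
}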
@@ -17,16 +17,17 @@
#include "types/complex/operations.cuh"

template <typename Torus, class params, sharedMemDegree SMD>
__global__ void device_programmable_bootstrap_step_one(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
double2 *global_accumulator_fft, uint32_t lwe_iteration,
uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
uint32_t level_count, int8_t *device_mem,
uint64_t device_memory_size_per_block) {
__global__ void __launch_bounds__(params::degree / params::opt)
device_programmable_bootstrap_step_one(
const Torus *__restrict__ lut_vector,
const Torus *__restrict__ lut_vector_indexes,
const Torus *__restrict__ lwe_array_in,
const Torus *__restrict__ lwe_input_indexes,
const double2 *__restrict__ bootstrapping_key,
Torus *global_accumulator, double2 *global_accumulator_fft,
uint32_t lwe_iteration, uint32_t lwe_dimension,
uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
int8_t *device_mem, uint64_t device_memory_size_per_block) {

// We use shared memory for the polynomials that are used often during the
// bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -131,15 +132,16 @@ __global__ void device_programmable_bootstrap_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_programmable_bootstrap_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const double2 *__restrict__ bootstrapping_key, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_iteration,
-    uint32_t lwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_programmable_bootstrap_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const double2 *__restrict__ bootstrapping_key,
+        Torus *global_accumulator, double2 *global_accumulator_fft,
+        uint32_t lwe_iteration, uint32_t lwe_dimension,
+        uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
+        int8_t *device_mem, uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
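The recurring comment about shared memory, together with the `sharedMemDegree SMD` template parameter and the trailing `device_mem` / `device_memory_size_per_block` arguments, suggests a compile-time choice of scratch space: shared memory when the working set fits, a per-block slice of a preallocated global buffer otherwise. A sketch of that idiom under those assumptions; the enumerators and the indexing are illustrative, not checked against the full sources:

#include <cstdint>

// Mirrors the SMD template parameter seen in the signatures; the exact
// enumerators in the repository may differ.
enum sharedMemDegree { NOSM, PARTIALSM, FULLSM };

template <sharedMemDegree SMD>
__global__ void __launch_bounds__(256) scratch_demo(
    int8_t *device_mem, uint64_t device_memory_size_per_block) {
  // Sized via the dynamic shared memory argument of the launch.
  extern __shared__ int8_t sharedmem[];

  int8_t *selected_memory;
  if (SMD == NOSM) {
    // Not enough shared memory: fall back to this block's slice of a
    // preallocated global scratch buffer.
    selected_memory = &device_mem[blockIdx.x * device_memory_size_per_block];
  } else {
    // Shared memory sits in the L1/SMEM partition, so polynomials reused by
    // the whole block are much cheaper to access here than in global memory.
    selected_memory = sharedmem;
  }
  selected_memory[threadIdx.x] = 0; // placeholder for real accumulator work
}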
@@ -145,15 +145,16 @@ __global__ void device_multi_bit_programmable_bootstrap_keybundle(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
-    uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_one(
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t lwe_iteration, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
 
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
@@ -242,14 +243,15 @@ __global__ void device_multi_bit_programmable_bootstrap_accumulate_step_one(
 }
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_accumulate_step_two(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
-    double2 *global_accumulator_fft, uint32_t lwe_dimension,
-    uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
-    uint32_t lwe_chunk_size, int8_t *device_mem,
-    uint64_t device_memory_size_per_block) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_accumulate_step_two(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const double2 *__restrict__ keybundle_array, Torus *global_accumulator,
+        double2 *global_accumulator_fft, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t level_count,
+        uint32_t grouping_factor, uint32_t iteration, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, int8_t *device_mem,
+        uint64_t device_memory_size_per_block) {
   // We use shared memory for the polynomials that are used often during the
   // bootstrap, since shared memory is kept in L1 cache and accessing it is
   // much faster than global memory
@@ -18,18 +18,20 @@
 #include <vector>
 
 template <typename Torus, class params, sharedMemDegree SMD>
-__global__ void device_multi_bit_programmable_bootstrap_tbc_accumulate(
-    Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
-    const Torus *__restrict__ lut_vector,
-    const Torus *__restrict__ lut_vector_indexes,
-    const Torus *__restrict__ lwe_array_in,
-    const Torus *__restrict__ lwe_input_indexes,
-    const double2 *__restrict__ keybundle_array, double2 *join_buffer,
-    Torus *global_accumulator, uint32_t lwe_dimension, uint32_t glwe_dimension,
-    uint32_t polynomial_size, uint32_t base_log, uint32_t level_count,
-    uint32_t grouping_factor, uint32_t lwe_offset, uint32_t lwe_chunk_size,
-    uint32_t keybundle_size_per_input, int8_t *device_mem,
-    uint64_t device_memory_size_per_block, bool support_dsm) {
+__global__ void __launch_bounds__(params::degree / params::opt)
+    device_multi_bit_programmable_bootstrap_tbc_accumulate(
+        Torus *lwe_array_out, const Torus *__restrict__ lwe_output_indexes,
+        const Torus *__restrict__ lut_vector,
+        const Torus *__restrict__ lut_vector_indexes,
+        const Torus *__restrict__ lwe_array_in,
+        const Torus *__restrict__ lwe_input_indexes,
+        const double2 *__restrict__ keybundle_array, double2 *join_buffer,
+        Torus *global_accumulator, uint32_t lwe_dimension,
+        uint32_t glwe_dimension, uint32_t polynomial_size, uint32_t base_log,
+        uint32_t level_count, uint32_t grouping_factor, uint32_t lwe_offset,
+        uint32_t lwe_chunk_size, uint32_t keybundle_size_per_input,
+        int8_t *device_mem, uint64_t device_memory_size_per_block,
+        bool support_dsm) {
 
   cluster_group cluster = this_cluster();
 
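Finally, the `tbc` variant uses thread block clusters (`cluster_group cluster = this_cluster();`, with `support_dsm` hinting at distributed shared memory), a compute-capability-9.0 feature. Clusters add a launch-time attribute on top of the declared bounds; a hedged sketch of such a launch with the CUDA extensible launch API, not taken from this repository:

#include <cooperative_groups.h>
#include <cuda_runtime.h>
namespace cg = cooperative_groups;

constexpr int BLOCK = 256; // stands in for params::degree / params::opt

__global__ void __launch_bounds__(BLOCK) tbc_demo(float *out) {
  // Blocks within a cluster can synchronize and, on sm_90+, read each
  // other's shared memory (distributed shared memory).
  cg::cluster_group cluster = cg::this_cluster();
  cluster.sync();
  out[blockIdx.x * blockDim.x + threadIdx.x] = (float)cluster.block_rank();
}

int main() {
  float *out;
  cudaMalloc(&out, 64 * BLOCK * sizeof(float));

  cudaLaunchConfig_t config = {};
  config.gridDim = dim3(64); // must be a multiple of the cluster size
  config.blockDim = dim3(BLOCK);

  cudaLaunchAttribute attr = {};
  attr.id = cudaLaunchAttributeClusterDimension;
  attr.val.clusterDim = {2, 1, 1}; // 2 blocks per cluster
  config.attrs = &attr;
  config.numAttrs = 1;

  cudaLaunchKernelEx(&config, tbc_demo, out); // requires sm_90 or newer
  cudaDeviceSynchronize();
  cudaFree(out);
  return 0;
}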