[GPU/OpenCL] Broadcasting support added for addition_cl kernel.
Added support for the case where the number of batches differs between input A and input B.

Self evaluation:

    Build test: [X]Passed [ ]Failed [ ]Skipped
    Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal <[email protected]>
niket-agarwal committed Oct 22, 2024
1 parent 72bdf54 commit 507f943
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 48 deletions.
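In effect, the change lets the in-place addition reuse a batch-1 operand across every batch of the result tensor. A minimal host-side C++ sketch of the intended semantics (illustrative only, not nntrainer code; addition_ref and the sample sizes are hypothetical):

#include <vector>

// Mirrors the kernel's broadcast rule: each of the size_res result
// elements accumulates input[idx % size_input], so a batch-1 tensor
// is reapplied to every batch of the result.
void addition_ref(const float *input, float *res,
                  unsigned int size_input, unsigned int size_res) {
  for (unsigned int idx = 0; idx < size_res; ++idx)
    res[idx] += input[idx % size_input];
}

int main() {
  std::vector<float> a = {1, 2, 3, 4, 5, 6}; // inputA: batch 2, 3 elements per batch
  std::vector<float> b = {10, 20, 30};       // inputB: batch 1, 3 elements
  addition_ref(b.data(), a.data(), 3, 6);
  // a is now {11, 22, 33, 14, 25, 36}
  return 0;
}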
63 changes: 36 additions & 27 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -200,38 +200,47 @@ void multiplyCl(Tensor &input, float const &value) {
   }
 }
 
-void add_i_cl(Tensor const &input, Tensor &result) {
-
-  CREATE_IF_EMPTY_DIMS(result, result.getDim());
-
-  NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument)
-    << result.getName() << " is not allocated";
-  NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument)
-    << input.getName() << " is not allocated";
-
-  if (input.getDim() != result.getDim()) {
-    throw std::invalid_argument(
-      "Error: Dimensions does not match for addition");
-  }
-
-  if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
-    unsigned int size = input.size();
-    const float *data = input.getData();
-    float *rdata = result.getData();
-
-    addition_cl(data, rdata, size);
-
-  } else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
+void add_i_cl(Tensor &inputA, Tensor const &inputB) {
+
+  NNTR_THROW_IF(inputB.getData() == nullptr, std::invalid_argument)
+    << inputB.getName() << " is not allocated";
+  NNTR_THROW_IF(inputA.getData() == nullptr, std::invalid_argument)
+    << inputA.getName() << " is not allocated";
+
+  // Broadcasting is applied when the batch sizes of the two inputs differ;
+  // in that case the batch size of inputB must be 1
+  if ((inputA.getDim() == inputB.getDim()) ||
+      (inputA.getDim() != inputB.getDim() && inputB.batch() == 1 &&
+       inputA.channel() == inputB.channel() &&
+       inputA.height() == inputB.height() &&
+       inputA.width() == inputB.width())) {
+
+    if (inputA.getDataType() == ml::train::TensorDim::DataType::FP32) {
+      unsigned int sizeA = inputA.size();
+      unsigned int sizeB = inputB.size();
+      float *dataA = inputA.getData();
+      const float *dataB = inputB.getData();
+
+      addition_cl(dataB, dataA, sizeB, sizeA);
+
+    } else if (inputA.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
-    unsigned int size = input.size();
-    const _FP16 *data = input.getData<_FP16>();
-    _FP16 *rdata = result.getData<_FP16>();
+      unsigned int sizeA = inputA.size();
+      unsigned int sizeB = inputB.size();
+      _FP16 *dataA = inputA.getData<_FP16>();
+      const _FP16 *dataB = inputB.getData<_FP16>();
 
-    addition_cl(data, rdata, size);
+      addition_cl(dataB, dataA, sizeB, sizeA);
 
 #else
-    throw std::invalid_argument("Error: enable-fp16 is not enabled");
+      throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
+    }
   }
+
+  else {
+    throw std::invalid_argument(
+      "Error: Broadcasting not supported for these dimensions!");
+  }
 }
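For reference, the acceptance check above reduces to the following predicate (a sketch assuming dims are ordered {batch, channel, height, width}; broadcastable is a hypothetical helper, not part of the codebase):

#include <array>

// Equal dims always pass; otherwise inputB must be batch-1 and match
// inputA on channel, height, and width.
bool broadcastable(const std::array<unsigned, 4> &a,
                   const std::array<unsigned, 4> &b) {
  return a == b ||
         (b[0] == 1 && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]);
}

// broadcastable({4, 1, 2, 3}, {1, 1, 2, 3}) -> true  (batch broadcast)
// broadcastable({4, 1, 2, 3}, {1, 1, 2, 4}) -> false (add_i_cl throws)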

6 changes: 3 additions & 3 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
@@ -64,11 +64,11 @@ void multiplyCl(Tensor &input, float const &value);
 
 /**
  * @brief Process data and dimensions for add operation
- * @param[in] input Tensor
- * @param[in] result Tensor
+ * @param[in] inputA Tensor
+ * @param[in] inputB Tensor
  * @param[in] RunLayerContext reference
  */
-void add_i_cl(Tensor const &input, Tensor &result);
+void add_i_cl(Tensor &inputA, Tensor const &inputB);
 
 } // namespace nntrainer
 #endif /* __BLAS_KERNEL_INTERFACE_H__ */
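Note the swap in qualifiers relative to the old signature: the sum is now accumulated in place into inputA (hence non-const), while inputB is the read-only operand that may be broadcast.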
12 changes: 6 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_strings.h
@@ -106,11 +106,11 @@ static const std::string sgemm_cl_transAB_kernel_ =
 })";
 
 static const std::string addition_cl_kernel_ =
-  R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
+  R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
 #pragma printf_support
   size_t idx = get_global_id(0);
-  if (idx < size) {
-    output[idx] = output[idx] + input[idx];
+  if (idx < size_res) {
+    output[idx] = output[idx] + input[idx % size_input];
   }
 })";

@@ -228,10 +228,10 @@ static const std::string addition_cl_kernel_fp16_ =
 R"(
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
+__kernel void addition_cl_fp16(const __global half* input, __global half* output, unsigned int size_input, unsigned int size_res) {
   size_t idx = get_global_id(0);
-  if (idx < size) {
-    output[idx] = output[idx] + input[idx];
+  if (idx < size_res) {
+    output[idx] = output[idx] + input[idx % size_input];
   }
 })";

19 changes: 14 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -280,7 +280,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
   } while (false);
 }
 
-void addition_cl(const float *input, float *res, unsigned int size) {
+void addition_cl(const float *input, float *res, unsigned int size_input,
+                 unsigned int size_res) {
 
   bool result = false;

@@ -291,11 +292,13 @@ void addition_cl(const float *input, float *res, unsigned int size) {
     break;
   }
 
-  size_t dim1_size = sizeof(float) * size;
+  size_t dim1_size = sizeof(float) * size_input;
+  size_t dim2_size = sizeof(float) * size_res;
 
   opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
                         nullptr);
 
-  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
+  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
                           nullptr);
 
   result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
@@ -320,12 +323,18 @@ void addition_cl(const float *input, float *res, unsigned int size) {
     break;
   }
 
-  result = kernel_addition_ptr->SetKernelArguments(2, &size, sizeof(int));
+  result =
+    kernel_addition_ptr->SetKernelArguments(2, &size_input, sizeof(int));
   if (!result) {
     break;
   }
 
-  const int work_groups_count[3] = {(int)size, 1, 1};
+  result = kernel_addition_ptr->SetKernelArguments(3, &size_res, sizeof(int));
+  if (!result) {
+    break;
+  }
+
+  const int work_groups_count[3] = {(int)size_res, 1, 1};
   const int work_group_size[3] = {32, 32, 1}; // test-value
   result = cl_context_ref.command_queue_inst_.DispatchCommand(
     kernel_addition_ptr, work_groups_count, work_group_size);
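Note that the global work size is now derived from size_res rather than the input size, so one work item is dispatched per result element, while size_input is passed as a separate kernel argument to drive the modulo indexing.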
6 changes: 4 additions & 2 deletions nntrainer/tensor/cl_operations/blas_kernels.h
@@ -74,7 +74,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
  * @param[in] size number of elements in input vector
  * @param[in] context RunLayerContext reference
  */
-void addition_cl(const float *input, float *res, unsigned int size);
+void addition_cl(const float *input, float *res, unsigned int size_input,
+                 unsigned int size_res);
 
 /**
  * @brief sscal value element by element immediately
@@ -138,7 +139,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
  * @param[in] size number of elements in input vector
  * @param[in] context RunLayerContext reference
  */
-void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size);
+void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input,
+                 unsigned int size_res);
 
 /**
  * @brief fp16 sscal value element by element immediately
18 changes: 13 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
@@ -291,7 +291,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
   } while (false);
 }
 
-void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
+void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input,
+                 unsigned int size_res) {
 
   bool result = false;

@@ -303,11 +304,12 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
     break;
   }
 
-  size_t dim1_size = sizeof(cl_half) * size;
+  size_t dim1_size = sizeof(cl_half) * size_input;
+  size_t dim2_size = sizeof(cl_half) * size_res;
   opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
                         nullptr);
 
-  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
+  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
                           nullptr);
 
   result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
@@ -333,12 +335,18 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
   }
 
   result =
-    kernel_addition_fp16_ptr->SetKernelArguments(2, &size, sizeof(int));
+    kernel_addition_fp16_ptr->SetKernelArguments(2, &size_input, sizeof(int));
   if (!result) {
     break;
   }
 
-  const int work_groups_count[3] = {(int)size, 1, 1};
+  result =
+    kernel_addition_fp16_ptr->SetKernelArguments(3, &size_res, sizeof(int));
+  if (!result) {
+    break;
+  }
+
+  const int work_groups_count[3] = {(int)size_res, 1, 1};
   const int work_group_size[3] = {32, 32, 1}; // test-value
   result = cl_context_ref.command_queue_inst_.DispatchCommand(
     kernel_addition_fp16_ptr, work_groups_count, work_group_size);
