Skip to content

Commit

Permalink
[GPU/OpenCL] Broadcasting support added for addition_cl kernel.
Browse files Browse the repository at this point in the history
Added support where number of batches vary for input A and input B.

Self evaluation:

    Build test: [X]Passed [ ]Failed [ ]Skipped
    Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal <[email protected]>
  • Loading branch information
niket-agarwal committed Nov 13, 2024
1 parent ae8567d commit fcaad63
Show file tree
Hide file tree
Showing 10 changed files with 79 additions and 52 deletions.
6 changes: 3 additions & 3 deletions nntrainer/cl_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@ static void add_default_object(ClContext &cc) {
FullyConnectedLayerCl::type,
ml::train::LayerType::LAYER_FC);

// cc.registerFactory(nntrainer::createLayer<AdditionLayerCL>,
// AdditionLayerCL::type,
// ml::train::LayerType::LAYER_ADDITION);
cc.registerFactory(nntrainer::createLayer<AdditionLayerCL>,
AdditionLayerCL::type,
ml::train::LayerType::LAYER_ADDITION);

cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>, SwiGLULayerCl::type,
ml::train::LayerType::LAYER_SWIGLU);
Expand Down
4 changes: 2 additions & 2 deletions nntrainer/layers/cl_layers/addition_layer_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void AdditionLayerCL::forwarding(RunLayerContext &context, bool training) {
if (!idx) {
hidden_.copy(input_);
} else {
add_i_cl(input_, hidden_, context);
add_i_cl(hidden_, input_);
}
}
}
Expand Down Expand Up @@ -77,7 +77,7 @@ void AdditionLayerCL::incremental_forwarding(RunLayerContext &context,
if (!idx) {
hidden_step.copy(input_step);
} else {
add_i_cl(input_step, hidden_step, context);
add_i_cl(hidden_step, input_step);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions nntrainer/layers/cl_layers/addition_layer_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#define __ADDITION_LAYER_CL_H__
#ifdef __cplusplus

#include <cl_context.h>
#include <common_properties.h>
#include <layer_devel.h>

Expand Down
4 changes: 2 additions & 2 deletions nntrainer/layers/cl_layers/meson.build
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cl_layer_sources = [
'fc_layer_cl.cpp',
# 'addition_layer_cl.cpp',
'fc_layer_cl.cpp',
'addition_layer_cl.cpp',
'swiglu_cl.cpp',
'reshape_cl.cpp',
'rmsnorm_layer_cl.cpp',
Expand Down
48 changes: 28 additions & 20 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,38 +200,46 @@ void multiplyCl(Tensor &input, float const &value) {
}
}

void add_i_cl(Tensor const &input, Tensor &result) {
void add_i_cl(Tensor &result, Tensor const &input) {

CREATE_IF_EMPTY_DIMS(result, result.getDim());

NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument)
<< result.getName() << " is not allocated";
NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument)
<< input.getName() << " is not allocated";
NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument)
<< result.getName() << " is not allocated";

if (input.getDim() != result.getDim()) {
throw std::invalid_argument(
"Error: Dimensions does not match for addition");
}
// Broadcasting done for the case where batch size vary for both inputs
// If batch size vary, batch size of input must be 1
if ((result.getDim() == input.getDim()) ||
(result.getDim() != input.getDim() && input.batch() == 1 &&
result.channel() == input.channel() &&
result.height() == input.height() && result.width() == input.width())) {

if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
unsigned int size = input.size();
const float *data = input.getData();
float *rdata = result.getData();
if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
unsigned int size_res = result.size();
unsigned int size_input = input.size();
float *data_res = result.getData();
const float *data_input = input.getData();

addition_cl(data, rdata, size);
addition_cl(data_input, data_res, size_input, size_res);

} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
} else if (result.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
unsigned int size = input.size();
const _FP16 *data = input.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
unsigned int size_res = result.size();
unsigned int size_input = input.size();
_FP16 *data_res = result.getData<_FP16>();
const _FP16 *data_input = input.getData<_FP16>();

addition_cl(data, rdata, size);
addition_cl(data_input, data_res, size_input, size_res);

#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

else {
throw std::invalid_argument(
"Error: Broadcasting not supported for these dimensions!");
}
}

Expand Down
5 changes: 2 additions & 3 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,10 @@ void multiplyCl(Tensor &input, float const &value);

/**
* @brief Process data and dimensions for add operation
* @param[in] input Tensor
* @param[in] result Tensor
* @param[in] RunLayerContext reference
* @param[in] input Tensor
*/
void add_i_cl(Tensor const &input, Tensor &result);
void add_i_cl(Tensor &result, Tensor const &input);

} // namespace nntrainer
#endif /* __BLAS_KERNEL_INTERFACE_H__ */
12 changes: 6 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,11 +106,11 @@ static const std::string sgemm_cl_transAB_kernel_ =
})";

static const std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
#pragma printf_support
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
if (idx < size_res) {
output[idx] = output[idx] + input[idx % size_input];
}
})";

Expand Down Expand Up @@ -228,10 +228,10 @@ static const std::string addition_cl_kernel_fp16_ =
R"(
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
__kernel void addition_cl_fp16(const __global half* input, __global half* output, unsigned int size_input, unsigned int size_res) {
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
if (idx < size_res) {
output[idx] = output[idx] + input[idx % size_input];
}
})";

Expand Down
19 changes: 14 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -280,7 +280,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
} while (false);
}

void addition_cl(const float *input, float *res, unsigned int size) {
void addition_cl(const float *input, float *res, unsigned int size_input,
unsigned int size_res) {

bool result = false;

Expand All @@ -291,11 +292,13 @@ void addition_cl(const float *input, float *res, unsigned int size) {
break;
}

size_t dim1_size = sizeof(float) * size;
size_t dim1_size = sizeof(float) * size_input;
size_t dim2_size = sizeof(float) * size_res;

opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
Expand All @@ -320,12 +323,18 @@ void addition_cl(const float *input, float *res, unsigned int size) {
break;
}

result = kernel_addition_ptr->SetKernelArguments(2, &size, sizeof(int));
result =
kernel_addition_ptr->SetKernelArguments(2, &size_input, sizeof(int));
if (!result) {
break;
}

result = kernel_addition_ptr->SetKernelArguments(3, &size_res, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
const int work_groups_count[3] = {(int)size_res, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = cl_context_ref.command_queue_inst_.DispatchCommand(
kernel_addition_ptr, work_groups_count, work_group_size);
Expand Down
14 changes: 8 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,11 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
* @brief addition : sum of all input vectors
* @param[in] input float * for input
* @param[in] res float * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
* @param[in] size_input number of elements in input vector
* @param[in] size_res number of elements in result vector
*/
void addition_cl(const float *input, float *res, unsigned int size);
void addition_cl(const float *input, float *res, unsigned int size_input,
unsigned int size_res);

/**
* @brief sscal value element by element immediately
Expand Down Expand Up @@ -135,10 +136,11 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
* @brief fp16 addition : sum of all input vectors
* @param[in] input fp16 * for input
* @param[in] res fp16 * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
* @param[in] size_input number of elements in input vector
* @param[in] size_res number of elements in result vector
*/
void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size);
void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,
unsigned int size_res);

/**
* @brief fp16 sscal value element by element immediately
Expand Down
18 changes: 13 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -291,7 +291,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
} while (false);
}

void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,
unsigned int size_res) {

bool result = false;

Expand All @@ -303,11 +304,12 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
break;
}

size_t dim1_size = sizeof(cl_half) * size;
size_t dim1_size = sizeof(cl_half) * size_input;
size_t dim2_size = sizeof(cl_half) * size_res;
opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
Expand All @@ -333,12 +335,18 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
}

result =
kernel_addition_fp16_ptr->SetKernelArguments(2, &size, sizeof(int));
kernel_addition_fp16_ptr->SetKernelArguments(2, &size_input, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
result =
kernel_addition_fp16_ptr->SetKernelArguments(3, &size_res, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size_res, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = cl_context_ref.command_queue_inst_.DispatchCommand(
kernel_addition_fp16_ptr, work_groups_count, work_group_size);
Expand Down

0 comments on commit fcaad63

Please sign in to comment.