diff --git a/nntrainer/cl_context.cpp b/nntrainer/cl_context.cpp index 10e3ecdbb7..981249afe1 100644 --- a/nntrainer/cl_context.cpp +++ b/nntrainer/cl_context.cpp @@ -36,9 +36,9 @@ static void add_default_object(ClContext &cc) { FullyConnectedLayerCl::type, ml::train::LayerType::LAYER_FC); - // cc.registerFactory(nntrainer::createLayer, - // AdditionLayerCL::type, - // ml::train::LayerType::LAYER_ADDITION); + cc.registerFactory(nntrainer::createLayer, + AdditionLayerCL::type, + ml::train::LayerType::LAYER_ADDITION); cc.registerFactory(nntrainer::createLayer, SwiGLULayerCl::type, ml::train::LayerType::LAYER_SWIGLU); diff --git a/nntrainer/layers/cl_layers/addition_layer_cl.cpp b/nntrainer/layers/cl_layers/addition_layer_cl.cpp index dda2101645..b6788f5461 100644 --- a/nntrainer/layers/cl_layers/addition_layer_cl.cpp +++ b/nntrainer/layers/cl_layers/addition_layer_cl.cpp @@ -37,7 +37,7 @@ void AdditionLayerCL::forwarding(RunLayerContext &context, bool training) { if (!idx) { hidden_.copy(input_); } else { - add_i_cl(input_, hidden_, context); + add_i_cl(hidden_, input_); } } } @@ -77,7 +77,7 @@ void AdditionLayerCL::incremental_forwarding(RunLayerContext &context, if (!idx) { hidden_step.copy(input_step); } else { - add_i_cl(input_step, hidden_step, context); + add_i_cl(hidden_step, input_step); } } } diff --git a/nntrainer/layers/cl_layers/addition_layer_cl.h b/nntrainer/layers/cl_layers/addition_layer_cl.h index e24354e8d0..d20c2c0115 100644 --- a/nntrainer/layers/cl_layers/addition_layer_cl.h +++ b/nntrainer/layers/cl_layers/addition_layer_cl.h @@ -15,6 +15,7 @@ #define __ADDITION_LAYER_CL_H__ #ifdef __cplusplus +#include #include #include diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build index fbfd46961b..519e607ee5 100644 --- a/nntrainer/layers/cl_layers/meson.build +++ b/nntrainer/layers/cl_layers/meson.build @@ -1,6 +1,6 @@ cl_layer_sources = [ - 'fc_layer_cl.cpp', - # 'addition_layer_cl.cpp', + 'fc_layer_cl.cpp', + 'addition_layer_cl.cpp', 'swiglu_cl.cpp', 'reshape_cl.cpp', 'rmsnorm_layer_cl.cpp', diff --git a/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp b/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp index 23af3f9799..73c3f769ee 100644 --- a/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp @@ -200,38 +200,47 @@ void multiplyCl(Tensor &input, float const &value) { } } -void add_i_cl(Tensor const &input, Tensor &result) { - - CREATE_IF_EMPTY_DIMS(result, result.getDim()); - - NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument) - << result.getName() << " is not allocated"; - NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument) - << input.getName() << " is not allocated"; - - if (input.getDim() != result.getDim()) { - throw std::invalid_argument( - "Error: Dimensions does not match for addition"); - } - - if (input.getDataType() == ml::train::TensorDim::DataType::FP32) { - unsigned int size = input.size(); - const float *data = input.getData(); - float *rdata = result.getData(); - - addition_cl(data, rdata, size); - - } else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) { +void add_i_cl(Tensor &inputA, Tensor const &inputB) { + + NNTR_THROW_IF(inputB.getData() == nullptr, std::invalid_argument) + << inputB.getName() << " is not allocated"; + NNTR_THROW_IF(inputA.getData() == nullptr, std::invalid_argument) + << inputA.getName() << " is not allocated"; + + // Broadcasting done for the case where batch size vary for both inputs + // If batch size vary, batch size of inputB must be 1 + if ((inputA.getDim() == inputB.getDim()) || + (inputA.getDim() != inputB.getDim() && inputB.batch() == 1 && + inputA.channel() == inputB.channel() && + inputA.height() == inputB.height() && + inputA.width() == inputB.width())) { + + if (inputA.getDataType() == ml::train::TensorDim::DataType::FP32) { + unsigned int sizeA = inputA.size(); + unsigned int sizeB = inputB.size(); + float *dataA = inputA.getData(); + const float *dataB = inputB.getData(); + + addition_cl(dataB, dataA, sizeB, sizeA); + + } else if (inputA.getDataType() == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 - unsigned int size = input.size(); - const _FP16 *data = input.getData<_FP16>(); - _FP16 *rdata = result.getData<_FP16>(); + unsigned int sizeA = inputA.size(); + unsigned int sizeB = inputB.size(); + _FP16 *dataA = inputA.getData<_FP16>(); + const _FP16 *dataB = inputB.getData<_FP16>(); - addition_cl(data, rdata, size); + addition_cl(dataB, dataA, sizeB, sizeA); #else - throw std::invalid_argument("Error: enable-fp16 is not enabled"); + throw std::invalid_argument("Error: enable-fp16 is not enabled"); #endif + } + } + + else { + throw std::invalid_argument( + "Error: Broadcasting not supported for these dimensions!"); } } diff --git a/nntrainer/tensor/cl_operations/blas_kernel_interface.h b/nntrainer/tensor/cl_operations/blas_kernel_interface.h index 0b8d29a53c..91029016c4 100644 --- a/nntrainer/tensor/cl_operations/blas_kernel_interface.h +++ b/nntrainer/tensor/cl_operations/blas_kernel_interface.h @@ -64,11 +64,10 @@ void multiplyCl(Tensor &input, float const &value); /** * @brief Process data and dimensions for add operation - * @param[in] input Tensor - * @param[in] result Tensor - * @param[in] RunLayerContext reference + * @param[in] inputA Tensor + * @param[in] inputB Tensor */ -void add_i_cl(Tensor const &input, Tensor &result); +void add_i_cl(Tensor &inputA, Tensor const &inputB); } // namespace nntrainer #endif /* __BLAS_KERNEL_INTERFACE_H__ */ diff --git a/nntrainer/tensor/cl_operations/blas_kernel_strings.h b/nntrainer/tensor/cl_operations/blas_kernel_strings.h index 616900b719..b09536028a 100644 --- a/nntrainer/tensor/cl_operations/blas_kernel_strings.h +++ b/nntrainer/tensor/cl_operations/blas_kernel_strings.h @@ -106,11 +106,11 @@ static const std::string sgemm_cl_transAB_kernel_ = })"; static const std::string addition_cl_kernel_ = - R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) { + R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) { #pragma printf_support size_t idx = get_global_id(0); - if (idx < size) { - output[idx] = output[idx] + input[idx]; + if (idx < size_res) { + output[idx] = output[idx] + input[idx % size_input]; } })"; @@ -228,10 +228,10 @@ static const std::string addition_cl_kernel_fp16_ = R"( #pragma OPENCL EXTENSION cl_khr_fp16 : enable - __kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) { + __kernel void addition_cl_fp16(const __global half* input, __global half* output, unsigned int size_input, unsigned int size_res) { size_t idx = get_global_id(0); - if (idx < size) { - output[idx] = output[idx] + input[idx]; + if (idx < size_res) { + output[idx] = output[idx] + input[idx % size_input]; } })"; diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp index a8236988ad..21b6b1e29a 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels.cpp @@ -280,7 +280,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B, } while (false); } -void addition_cl(const float *input, float *res, unsigned int size) { +void addition_cl(const float *input, float *res, unsigned int size_input, + unsigned int size_res) { bool result = false; @@ -291,11 +292,13 @@ void addition_cl(const float *input, float *res, unsigned int size) { break; } - size_t dim1_size = sizeof(float) * size; + size_t dim1_size = sizeof(float) * size_input; + size_t dim2_size = sizeof(float) * size_res; + opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, nullptr); - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true, + opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); @@ -320,12 +323,18 @@ void addition_cl(const float *input, float *res, unsigned int size) { break; } - result = kernel_addition_ptr->SetKernelArguments(2, &size, sizeof(int)); + result = + kernel_addition_ptr->SetKernelArguments(2, &size_input, sizeof(int)); + if (!result) { + break; + } + + result = kernel_addition_ptr->SetKernelArguments(3, &size_res, sizeof(int)); if (!result) { break; } - const int work_groups_count[3] = {(int)size, 1, 1}; + const int work_groups_count[3] = {(int)size_res, 1, 1}; const int work_group_size[3] = {32, 32, 1}; // test-value result = cl_context_ref.command_queue_inst_.DispatchCommand( kernel_addition_ptr, work_groups_count, work_group_size); diff --git a/nntrainer/tensor/cl_operations/blas_kernels.h b/nntrainer/tensor/cl_operations/blas_kernels.h index 247314740a..f26972a592 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels.h +++ b/nntrainer/tensor/cl_operations/blas_kernels.h @@ -71,10 +71,11 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B, * @brief addition : sum of all input vectors * @param[in] input float * for input * @param[in] res float * for result/output - * @param[in] size number of elements in input vector - * @param[in] context RunLayerContext reference + * @param[in] size_input number of elements in input vector + * @param[in] size_res number of elements in result vector */ -void addition_cl(const float *input, float *res, unsigned int size); +void addition_cl(const float *input, float *res, unsigned int size_input, + unsigned int size_res); /** * @brief sscal value element by element immediately @@ -135,10 +136,11 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B, * @brief fp16 addition : sum of all input vectors * @param[in] input fp16 * for input * @param[in] res fp16 * for result/output - * @param[in] size number of elements in input vector - * @param[in] context RunLayerContext reference + * @param[in] size_input number of elements in input vector + * @param[in] size_res number of elements in result vector */ -void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size); +void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input, + unsigned int size_res); /** * @brief fp16 sscal value element by element immediately diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp index 6aa7ccb6e2..adde9b11cb 100644 --- a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp +++ b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp @@ -291,7 +291,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B, } while (false); } -void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) { +void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input, + unsigned int size_res) { bool result = false; @@ -303,11 +304,12 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) { break; } - size_t dim1_size = sizeof(cl_half) * size; + size_t dim1_size = sizeof(cl_half) * size_input; + size_t dim2_size = sizeof(cl_half) * size_res; opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true, nullptr); - opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true, + opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true, nullptr); result = inputA.WriteData(cl_context_ref.command_queue_inst_, input); @@ -333,12 +335,18 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) { } result = - kernel_addition_fp16_ptr->SetKernelArguments(2, &size, sizeof(int)); + kernel_addition_fp16_ptr->SetKernelArguments(2, &size_input, sizeof(int)); if (!result) { break; } - const int work_groups_count[3] = {(int)size, 1, 1}; + result = + kernel_addition_fp16_ptr->SetKernelArguments(3, &size_res, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)size_res, 1, 1}; const int work_group_size[3] = {32, 32, 1}; // test-value result = cl_context_ref.command_queue_inst_.DispatchCommand( kernel_addition_fp16_ptr, work_groups_count, work_group_size);