Skip to content

Commit

Permalink
[GPU/OpenCL] Broadcasting support added for addition_cl kernel.
Browse files Browse the repository at this point in the history
Added support for the case where the number of batches varies between input A and input B.
Added unit test case for new feature in unittest_blas_kernels_cl.cpp

Self evaluation:

    Build test: [X]Passed [ ]Failed [ ]Skipped
    Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal <[email protected]>
  • Loading branch information
niket-agarwal committed Nov 19, 2024
1 parent a80a6e1 commit 4c30910
Show file tree
Hide file tree
Showing 13 changed files with 146 additions and 53 deletions.
6 changes: 3 additions & 3 deletions nntrainer/cl_context.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,9 @@ static void add_default_object(ClContext &cc) {
FullyConnectedLayerCl::type,
ml::train::LayerType::LAYER_FC);

// cc.registerFactory(nntrainer::createLayer<AdditionLayerCL>,
// AdditionLayerCL::type,
// ml::train::LayerType::LAYER_ADDITION);
cc.registerFactory(nntrainer::createLayer<AdditionLayerCL>,
AdditionLayerCL::type,
ml::train::LayerType::LAYER_ADDITION);

cc.registerFactory(nntrainer::createLayer<SwiGLULayerCl>, SwiGLULayerCl::type,
ml::train::LayerType::LAYER_SWIGLU);
Expand Down
4 changes: 2 additions & 2 deletions nntrainer/layers/cl_layers/addition_layer_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ void AdditionLayerCL::forwarding(RunLayerContext &context, bool training) {
if (!idx) {
hidden_.copy(input_);
} else {
add_i_cl(input_, hidden_, context);
add_i_cl(hidden_, input_);
}
}
}
Expand Down Expand Up @@ -77,7 +77,7 @@ void AdditionLayerCL::incremental_forwarding(RunLayerContext &context,
if (!idx) {
hidden_step.copy(input_step);
} else {
add_i_cl(input_step, hidden_step, context);
add_i_cl(hidden_step, input_step);
}
}
}
Expand Down
1 change: 1 addition & 0 deletions nntrainer/layers/cl_layers/addition_layer_cl.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
#define __ADDITION_LAYER_CL_H__
#ifdef __cplusplus

#include <cl_context.h>
#include <common_properties.h>
#include <layer_devel.h>

Expand Down
4 changes: 2 additions & 2 deletions nntrainer/layers/cl_layers/meson.build
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
cl_layer_sources = [
'fc_layer_cl.cpp',
# 'addition_layer_cl.cpp',
'fc_layer_cl.cpp',
'addition_layer_cl.cpp',
'swiglu_cl.cpp',
'reshape_cl.cpp',
'rmsnorm_layer_cl.cpp',
Expand Down
48 changes: 28 additions & 20 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -200,38 +200,46 @@ void multiplyCl(Tensor &input, float const &value) {
}
}

/**
 * @brief In-place element-wise addition on the GPU: result += input.
 *
 * Supports limited broadcasting: when the batch sizes differ, the batch of
 * @a input must be 1 and the remaining dimensions (channel/height/width)
 * must match @a result. The single input batch is then added to every batch
 * of @a result (the kernel indexes input with idx % size_input).
 *
 * @param[in,out] result accumulator tensor; must be allocated
 * @param[in] input tensor to add; must be allocated
 * @throws std::invalid_argument if either tensor is unallocated, if the
 *         dimensions are not broadcast-compatible, or if an FP16 tensor is
 *         passed without enable-fp16 support
 */
void add_i_cl(Tensor &result, Tensor const &input) {

  CREATE_IF_EMPTY_DIMS(result, result.getDim());

  NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument)
    << result.getName() << " is not allocated";
  NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument)
    << input.getName() << " is not allocated";

  // Broadcasting is done for the case where the batch size differs between
  // the two tensors; in that case the batch size of input must be 1 and all
  // non-batch dimensions must match.
  if ((result.getDim() == input.getDim()) ||
      (result.getDim() != input.getDim() && input.batch() == 1 &&
       result.channel() == input.channel() &&
       result.height() == input.height() && result.width() == input.width())) {

    if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
      unsigned int size_res = result.size();
      unsigned int size_input = input.size();
      float *data_res = result.getData();
      const float *data_input = input.getData();

      addition_cl(data_input, data_res, size_input, size_res);

    } else if (result.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
      unsigned int size_res = result.size();
      unsigned int size_input = input.size();
      _FP16 *data_res = result.getData<_FP16>();
      const _FP16 *data_input = input.getData<_FP16>();

      addition_cl(data_input, data_res, size_input, size_res);
#else
      throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
    }
  } else {
    throw std::invalid_argument(
      "Error: Broadcasting not supported for these dimensions!");
  }
}

Expand Down
5 changes: 2 additions & 3 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,11 +64,10 @@ void multiplyCl(Tensor &input, float const &value);

/**
* @brief Process data and dimensions for add operation
* @param[in] input Tensor
* @param[in] result Tensor
* @param[in] RunLayerContext reference
* @param[in] input Tensor
*/
void add_i_cl(Tensor const &input, Tensor &result);
void add_i_cl(Tensor &result, Tensor const &input);

/**
* @brief Process data and dimensions for transpose operation
Expand Down
12 changes: 6 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_strings.h
Original file line number Diff line number Diff line change
Expand Up @@ -118,11 +118,11 @@ static const std::string sgemm_cl_transAB_kernel_ =
})";

// OpenCL kernel source for in-place FP32 addition with batch broadcasting:
// output[idx] += input[idx % size_input], so an input whose size divides
// size_res is tiled across the whole result buffer.
static const std::string addition_cl_kernel_ =
  R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
  #pragma printf_support
  size_t idx = get_global_id(0);
  if (idx < size_res) {
    output[idx] = output[idx] + input[idx % size_input];
  }
})";

// OpenCL kernel source for in-place FP16 addition with batch broadcasting;
// mirrors addition_cl but on half-precision data (requires the device
// extension cl_khr_fp16).
static const std::string addition_cl_kernel_fp16_ =
  R"(
    #pragma OPENCL EXTENSION cl_khr_fp16 : enable
    __kernel void addition_cl_fp16(const __global half* input, __global half* output, unsigned int size_input, unsigned int size_res) {
    size_t idx = get_global_id(0);
    if (idx < size_res) {
      output[idx] = output[idx] + input[idx % size_input];
    }
})";

Expand Down
19 changes: 14 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -289,7 +289,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
} while (false);
}

void addition_cl(const float *input, float *res, unsigned int size) {
void addition_cl(const float *input, float *res, unsigned int size_input,
unsigned int size_res) {

bool result = false;

Expand All @@ -300,11 +301,13 @@ void addition_cl(const float *input, float *res, unsigned int size) {
break;
}

size_t dim1_size = sizeof(float) * size;
size_t dim1_size = sizeof(float) * size_input;
size_t dim2_size = sizeof(float) * size_res;

opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
Expand All @@ -329,12 +332,18 @@ void addition_cl(const float *input, float *res, unsigned int size) {
break;
}

result = kernel_addition_ptr->SetKernelArguments(2, &size, sizeof(int));
result =
kernel_addition_ptr->SetKernelArguments(2, &size_input, sizeof(int));
if (!result) {
break;
}

result = kernel_addition_ptr->SetKernelArguments(3, &size_res, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
const int work_groups_count[3] = {(int)size_res, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = cl_context_ref.command_queue_inst_.DispatchCommand(
kernel_addition_ptr, work_groups_count, work_group_size);
Expand Down
14 changes: 8 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernels.h
Original file line number Diff line number Diff line change
Expand Up @@ -73,10 +73,11 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
* @brief addition : sum of all input vectors
* @param[in] input float * for input
* @param[in] res float * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
* @param[in] size_input number of elements in input vector
* @param[in] size_res number of elements in result vector
*/
void addition_cl(const float *input, float *res, unsigned int size);
void addition_cl(const float *input, float *res, unsigned int size_input,
unsigned int size_res);

/**
* @brief sscal value element by element immediately
Expand Down Expand Up @@ -156,10 +157,11 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
* @brief fp16 addition : sum of all input vectors
* @param[in] input fp16 * for input
* @param[in] res fp16 * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
* @param[in] size_input number of elements in input vector
* @param[in] size_res number of elements in result vector
*/
void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size);
void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,
unsigned int size_res);

/**
* @brief fp16 sscal value element by element immediately
Expand Down
18 changes: 13 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
} while (false);
}

void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,
unsigned int size_res) {

bool result = false;

Expand All @@ -312,11 +313,12 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
break;
}

size_t dim1_size = sizeof(cl_half) * size;
size_t dim1_size = sizeof(cl_half) * size_input;
size_t dim2_size = sizeof(cl_half) * size_res;
opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
nullptr);

opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
nullptr);

result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
Expand All @@ -342,12 +344,18 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
}

result =
kernel_addition_fp16_ptr->SetKernelArguments(2, &size, sizeof(int));
kernel_addition_fp16_ptr->SetKernelArguments(2, &size_input, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
result =
kernel_addition_fp16_ptr->SetKernelArguments(3, &size_res, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size_res, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = cl_context_ref.command_queue_inst_.DispatchCommand(
kernel_addition_fp16_ptr, work_groups_count, work_group_size);
Expand Down
14 changes: 14 additions & 0 deletions test/include/nntrainer_test_util.h
Original file line number Diff line number Diff line change
Expand Up @@ -177,6 +177,20 @@ class ScopedIni {
} \
} while (0)

/**
 * @brief Fill @p input with values computed from indices (i, j, k, l).
 *
 * Variant of GEN_TEST_INPUT whose batch axis iterates over the caller-scope
 * variable `batch_b` instead of `batch`; `channel`, `height` and `width` are
 * likewise captured from the calling scope. Intended for generating data for
 * tensors whose batch size differs from the main test batch (broadcast
 * operands). Note: comments must not be placed inside the macro body —
 * line splicing happens before comment removal.
 */
#define GEN_TEST_INPUT_C(input, equation_i_j_k_l) \
do { \
for (int i = 0; i < batch_b; ++i) { \
for (int j = 0; j < channel; ++j) { \
for (int k = 0; k < height; ++k) { \
for (int l = 0; l < width; ++l) { \
float val = (equation_i_j_k_l); \
input.setValue(i, j, k, l, val); \
} \
} \
} \
} \
} while (0)

/**
* @brief return a tensor filled with contant value with dimension
*/
Expand Down
2 changes: 1 addition & 1 deletion test/jni/Android.mk
Original file line number Diff line number Diff line change
Expand Up @@ -476,7 +476,7 @@ LOCAL_SRC_FILES := \
../unittest/layers/unittest_layers_reshape.cpp \
../unittest/layers/unittest_layers_multi_head_attention.cpp \
../unittest/layers/unittest_layers_positional_encoding.cpp \
# ../unittest/layers/unittest_layers_addition_cl.cpp \
../unittest/layers/unittest_layers_addition_cl.cpp \

LOCAL_C_INCLUDES += $(NNTRAINER_INCLUDES)

Expand Down
52 changes: 52 additions & 0 deletions test/unittest/unittest_blas_kernels_cl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -530,6 +530,58 @@ TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transAB) {
EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
}

/**
 * @brief Compare CPU Tensor::add_i against the OpenCL add_i_cl path when the
 *        second operand has batch size 1 (broadcast over the batch axis).
 */
TEST(blas_kernels, addition_i) {
  // Full-batch dimensions; batch_b is the broadcast operand's batch size.
  const int batch = 12;
  const int channel = 1;
  const int height = 26;
  const int width = 26;
  const int batch_b = 1;

  const float alpha = 1e-1;
  const int MOD = 10;

  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};

  // CPU reference pair and GPU pair, filled with identical data.
  nntrainer::Tensor ref_full(batch, channel, height, width, t_type_nchw_fp32);
  nntrainer::Tensor ref_bcast(batch_b, channel, height, width,
                              t_type_nchw_fp32);
  nntrainer::Tensor cl_full(batch, channel, height, width, t_type_nchw_fp32);
  nntrainer::Tensor cl_bcast(batch_b, channel, height, width,
                             t_type_nchw_fp32);

  GEN_TEST_INPUT(ref_full, ((i * (batch * height * channel) +
                             j * (batch * height) + k * (width) + l + 1) %
                            MOD) *
                             alpha);
  GEN_TEST_INPUT(cl_full, ((i * (batch * height * channel) +
                            j * (batch * height) + k * (width) + l + 1) %
                           MOD) *
                            alpha);
  GEN_TEST_INPUT_C(ref_bcast, ((i * (batch_b * height * channel) +
                                j * (batch_b * height) + k * (width) + l + 1) %
                               MOD) *
                                alpha);
  GEN_TEST_INPUT_C(cl_bcast, ((i * (batch_b * height * channel) +
                               j * (batch_b * height) + k * (width) + l + 1) %
                              MOD) *
                               alpha);

  ref_full.add_i(ref_bcast); // CPU reference result
  add_i_cl(cl_full, cl_bcast); // OpenCL result under test

  const float mse_error = mse<float>(ref_full.getData<float>(),
                                     cl_full.getData<float>(), ref_full.size());
  const double cos_sim = cosine_similarity<float>(
    ref_full.getData<float>(), cl_full.getData<float>(), ref_full.size());

  const float epsilon = 1e-3 * width;

  EXPECT_IN_RANGE(mse_error, 0, epsilon);
  EXPECT_IN_RANGE((float)cos_sim, 0.99, 1);
}

GTEST_API_ int main(int argc, char **argv) {
int result = -1;

Expand Down

0 comments on commit 4c30910

Please sign in to comment.