[GPU/OpenCL] Addition Kernel added in reusable blas OpenCL kernels
Added addition kernel to enhance reusability of the common blas kernels.
Used AdditionLayer interface for both CPU and GPU calls.

Signed-off-by: yash.singh <[email protected]>

[GPU/OpenCL] Initial version of Addition Layer with OpenCL ops

Added naive version of OpenCL implementation for Addition Layer.
Incorporated kernel for ops used.
Added unit test for addition_layer_cl.

Signed-off-by: yash.singh <[email protected]>

[GPU/OpenCL] Addition Kernel added in reusable blas OpenCL kernels

Added addition kernel to enhance reusability of the common blas kernels.
Used AdditionLayer interface for both CPU and GPU calls.

Signed-off-by: yash.singh <[email protected]>
yashSingh0723 committed Jun 6, 2024
1 parent 9faf6dc commit a98c244
Showing 5 changed files with 85 additions and 110 deletions.
16 changes: 3 additions & 13 deletions api/ccapi/include/layer.h
@@ -354,21 +354,11 @@ Reshape(const std::vector<std::string> &properties = {}) {
/**
* @brief Helper function to create addition layer
*/
inline std::unique_ptr<Layer>
Addition(const std::vector<std::string> &properties = {}) {
return createLayer(LayerType::LAYER_ADDITION, properties);
}

#ifdef ENABLE_OPENCL
/**
* @brief Helper function to create Addition layer for GPU
*/
inline std::unique_ptr<Layer>
AdditionCL(const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
inline std::unique_ptr<Layer> Addition(
const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
return createLayer(LayerType::LAYER_ADDITION, properties, compute_engine);
}
#endif

/**
* @brief Helper function to create concat layer
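The hunk above folds the old GPU-only AdditionCL() helper into a single Addition() factory whose compute_engine argument selects the backend. A minimal usage sketch, assuming the ml::train::layer namespace and LayerComputeEngine enum this ccapi header belongs to (not re-verified against the rest of the API):

// Hypothetical call sites: one addition layer on the CPU, one on the GPU.
auto add_cpu = ml::train::layer::Addition({"name=add_cpu"});
auto add_gpu = ml::train::layer::Addition(
  {"name=add_gpu"}, ml::train::LayerComputeEngine::GPU);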
81 changes: 2 additions & 79 deletions nntrainer/layers/cl_layers/addition_layer_cl.cpp
@@ -3,14 +3,15 @@
* Copyright (C) 2024 Yash Singh <[email protected]>
*
* @file addition_layer_cl.cpp
* @date 17 May 2024
* @date 28 May 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Yash Singh <[email protected]>
* @bug No known bugs except for NYI items
* @brief This is the Addition Layer class for Neural Network with OpenCL
* implementation
*/

#include <blas_kernels.h>
#include <addition_layer_cl.h>
#include <nntrainer_error.h>
#include <nntrainer_log.h>
@@ -19,15 +20,6 @@

#include <layer_context.h>

std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
#pragma printf_support
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
}
})";

namespace nntrainer {

static constexpr size_t SINGLE_INOUT_IDX = 0;
@@ -45,18 +37,11 @@ void AdditionLayerCL::forwarding(RunLayerContext &context, bool training) {
if (!idx) {
hidden_.copy(input_);
} else {
// hidden_.add_i(input_);
AddProcess(input_, hidden_, context);
}
}
}

/**
* @brief declaring static kernel objects
*
*/
opencl::Kernel AdditionLayerCL::kernel_addition;

void AdditionLayerCL::AddProcess(Tensor const &input, Tensor &result,
RunLayerContext &context) {

@@ -83,67 +68,6 @@ void AdditionLayerCL::AddProcess(Tensor const &input, Tensor &result,
throw std::invalid_argument("Error: OpenCL fp16 is not supported yet.");
}

void AdditionLayerCL::addition_cl(const float *input, float *res,
unsigned int size, RunLayerContext &context) {

bool result = false;
do {
result =
context.clCreateKernel(addition_cl_kernel_, context.LayerKernel::ADD,
AdditionLayerCL::kernel_addition);
if (!result) {
break;
}

size_t dim1_size = sizeof(float) * size;
opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr);

opencl::Buffer inOutRes(context.context_inst_, dim1_size, true, nullptr);

result = inputA.WriteData(context.command_queue_inst_, input);
if (!result) {
break;
}

result = inOutRes.WriteData(context.command_queue_inst_, res);
if (!result) {
break;
}

result = AdditionLayerCL::kernel_addition.SetKernelArguments(
0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = AdditionLayerCL::kernel_addition.SetKernelArguments(
1, &inOutRes, sizeof(cl_mem));
if (!result) {
break;
}

result = AdditionLayerCL::kernel_addition.SetKernelArguments(2, &size,
sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
AdditionLayerCL::kernel_addition, work_groups_count, work_group_size);
if (!result) {
break;
}

result = inOutRes.ReadData(context.command_queue_inst_, res);
if (!result) {
break;
}

} while (false);
}

void AdditionLayerCL::incremental_forwarding(RunLayerContext &context,
unsigned int from, unsigned int to,
bool training) {
@@ -179,7 +103,6 @@ void AdditionLayerCL::incremental_forwarding(RunLayerContext &context,
if (!idx) {
hidden_step.copy(input_step);
} else {
// hidden_step.add_i(input_step);
AddProcess(input_step, hidden_step, context);
}
}
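Both forwarding() and incremental_forwarding() above reduce their inputs with the same copy-then-accumulate pattern: the first input initializes the output, every later one is added in place. A standalone CPU sketch of that loop, with plain vectors standing in for Tensor (names hypothetical, not part of this diff):

#include <vector>

// First input initializes out; the rest are accumulated elementwise,
// mirroring hidden_.copy(input_) followed by AddProcess(input_, hidden_, ...).
void accumulate(const std::vector<std::vector<float>> &inputs,
                std::vector<float> &out) {
  for (size_t idx = 0; idx < inputs.size(); ++idx) {
    if (idx == 0) {
      out = inputs[idx];
    } else {
      for (size_t i = 0; i < out.size(); ++i)
        out[i] += inputs[idx][i];
    }
  }
}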
19 changes: 1 addition & 18 deletions nntrainer/layers/cl_layers/addition_layer_cl.h
@@ -3,7 +3,7 @@
* Copyright (C) 2024 Yash Singh <[email protected]>
*
* @file addition_layer_cl.h
* @date 17 May 2024
* @date 28 May 2024
* @see https://github.com/nnstreamer/nntrainer
* @author Yash Singh <[email protected]>
* @bug No known bugs except for NYI items
@@ -17,8 +17,6 @@

#include <common_properties.h>
#include <layer_devel.h>
#include <opencl_buffer.h>
#include <opencl_kernel.h>

#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
do { \
@@ -78,11 +76,6 @@ class AdditionLayerCL : public Layer {
*/
void calcDerivative(RunLayerContext &context) override;

/**
* @brief declaring static kernel objects
*/
static opencl::Kernel kernel_addition;

/**
* @brief Process data and dimensions for add operation used in addition layer
* @param[in] input Tensor
@@ -92,16 +85,6 @@
void AddProcess(Tensor const &input, Tensor &result,
RunLayerContext &context);

/**
* @brief addition : sum of all input vectors
* @param[in] input float * for input
* @param[in] res float * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
*/
void addition_cl(const float *input, float *res, unsigned int size,
RunLayerContext &context);

/**
* @copydoc bool supportBackwarding() const
*/
68 changes: 68 additions & 0 deletions nntrainer/layers/cl_layers/blas_kernels.cpp
@@ -51,12 +51,22 @@ std::string sgemm_cl_kernel_ =
C[m * ldc + n] = c;
})";

std::string addition_cl_kernel_ =
R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
#pragma printf_support
size_t idx = get_global_id(0);
if (idx < size) {
output[idx] = output[idx] + input[idx];
}
})";

/**
* @brief defining global kernel objects
*/
opencl::Kernel kernel_sgemv;
opencl::Kernel kernel_sgemm;
opencl::Kernel kernel_dot;
opencl::Kernel kernel_addition;

void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
unsigned int dim1, unsigned int dim2, unsigned int lda,
@@ -299,4 +309,62 @@ void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
} while (false);
}

void addition_cl(const float *input, float *res,
unsigned int size, RunLayerContext &context) {

bool result = false;

do {
result =
context.clCreateKernel(addition_cl_kernel_, context.LayerKernel::ADD,
kernel_addition);
if (!result) {
break;
}

size_t dim1_size = sizeof(float) * size;
opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr);

opencl::Buffer inOutRes(context.context_inst_, dim1_size, true, nullptr);

result = inputA.WriteData(context.command_queue_inst_, input);
if (!result) {
break;
}

result = inOutRes.WriteData(context.command_queue_inst_, res);
if (!result) {
break;
}

result = kernel_addition.SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_addition.SetKernelArguments(1, &inOutRes, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_addition.SetKernelArguments(2, &size, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)size, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value
result = context.command_queue_inst_.DispatchCommand(
kernel_addition, work_groups_count, work_group_size);
if (!result) {
break;
}

result = inOutRes.ReadData(context.command_queue_inst_, res);
if (!result) {
break;
}

} while (false);
}
} // namespace nntrainer
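addition_cl() stages both host buffers on the device, dispatches the kernel, and reads the result back, so its observable contract is an in-place elementwise accumulation. A CPU reference of that contract, handy as a test oracle for the addition_layer_cl unit test (a sketch, not part of this diff):

// Mirrors the OpenCL kernel: output[idx] += input[idx] for every element.
void addition_ref(const float *input, float *output, unsigned int size) {
  for (unsigned int idx = 0; idx < size; ++idx)
    output[idx] += input[idx];
}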
11 changes: 11 additions & 0 deletions nntrainer/layers/cl_layers/blas_kernels.h
@@ -30,6 +30,7 @@ extern opencl::Kernel kernel_sgemm;
extern opencl::Kernel kernel_sgemm_fp16;
extern opencl::Kernel kernel_dot;
extern opencl::Kernel kernel_dot_fp16;
extern opencl::Kernel kernel_addition;

/**
* @brief sgemv computation : Y = A*X + Y
@@ -117,5 +118,15 @@ void sgemm_cl(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context);

/**
* @brief addition : sum of all input vectors
* @param[in] input float * for input
* @param[in] res float * for result/output
* @param[in] size number of elements in input vector
* @param[in] context RunLayerContext reference
*/
void addition_cl(const float *input, float *res, unsigned int size,
RunLayerContext &context);

} // namespace nntrainer
#endif /* __BLAS_KERNELS_H__ */
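With this declaration, a layer can drive the shared kernel directly from host pointers. A hedged sketch of the call site, assuming Tensor exposes getData() and size() (element count, not bytes) as used elsewhere in nntrainer:

const float *in = input.getData();
float *out = result.getData();
// Accumulates in into out on the GPU: out[i] += in[i].
nntrainer::addition_cl(in, out, input.size(), context);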
