[GPU/OpenCL] Broadcasting support added for addition_cl kernel.
Added support for the case where the number of batches differs between input A and input B.

Self evaluation:

    Build test: [X]Passed [ ]Failed [ ]Skipped
    Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Niket Agarwal <[email protected]>
niket-agarwal committed Oct 22, 2024
1 parent 72bdf54 commit 507f943
Show file tree
Hide file tree
Showing 6 changed files with 76 additions and 48 deletions.
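In effect, the change lets the in-place addition reuse a batch-1 operand across every batch of the result tensor. A minimal host-side C++ sketch of the intended semantics (illustrative only, not nntrainer code; addition_ref and the sample sizes are hypothetical):

#include <vector>

// Mirrors the kernel's broadcast rule: each of the size_res result
// elements accumulates input[idx % size_input], so a batch-1 tensor
// is reapplied to every batch of the result.
void addition_ref(const float *input, float *res,
                  unsigned int size_input, unsigned int size_res) {
  for (unsigned int idx = 0; idx < size_res; ++idx)
    res[idx] += input[idx % size_input];
}

int main() {
  std::vector<float> a = {1, 2, 3, 4, 5, 6}; // inputA: batch 2, 3 elements per batch
  std::vector<float> b = {10, 20, 30};       // inputB: batch 1, 3 elements
  addition_ref(b.data(), a.data(), 3, 6);
  // a is now {11, 22, 33, 14, 25, 36}
  return 0;
}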
63 changes: 36 additions & 27 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -200,38 +200,47 @@ void multiplyCl(Tensor &input, float const &value) {
   }
 }
 
-void add_i_cl(Tensor const &input, Tensor &result) {
-
-  CREATE_IF_EMPTY_DIMS(result, result.getDim());
-
-  NNTR_THROW_IF(result.getData() == nullptr, std::invalid_argument)
-    << result.getName() << " is not allocated";
-  NNTR_THROW_IF(input.getData() == nullptr, std::invalid_argument)
-    << input.getName() << " is not allocated";
-
-  if (input.getDim() != result.getDim()) {
-    throw std::invalid_argument(
-      "Error: Dimensions does not match for addition");
-  }
-
-  if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
-    unsigned int size = input.size();
-    const float *data = input.getData();
-    float *rdata = result.getData();
-
-    addition_cl(data, rdata, size);
-
-  } else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
+void add_i_cl(Tensor &inputA, Tensor const &inputB) {
+
+  NNTR_THROW_IF(inputB.getData() == nullptr, std::invalid_argument)
+    << inputB.getName() << " is not allocated";
+  NNTR_THROW_IF(inputA.getData() == nullptr, std::invalid_argument)
+    << inputA.getName() << " is not allocated";
+
+  // Broadcasting is applied when the batch sizes of the two inputs differ;
+  // in that case the batch size of inputB must be 1
+  if ((inputA.getDim() == inputB.getDim()) ||
+      (inputA.getDim() != inputB.getDim() && inputB.batch() == 1 &&
+       inputA.channel() == inputB.channel() &&
+       inputA.height() == inputB.height() &&
+       inputA.width() == inputB.width())) {
+
+    if (inputA.getDataType() == ml::train::TensorDim::DataType::FP32) {
+      unsigned int sizeA = inputA.size();
+      unsigned int sizeB = inputB.size();
+      float *dataA = inputA.getData();
+      const float *dataB = inputB.getData();
+
+      addition_cl(dataB, dataA, sizeB, sizeA);
+
+    } else if (inputA.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
-    unsigned int size = input.size();
-    const _FP16 *data = input.getData<_FP16>();
-    _FP16 *rdata = result.getData<_FP16>();
+      unsigned int sizeA = inputA.size();
+      unsigned int sizeB = inputB.size();
+      _FP16 *dataA = inputA.getData<_FP16>();
+      const _FP16 *dataB = inputB.getData<_FP16>();
 
-    addition_cl(data, rdata, size);
+      addition_cl(dataB, dataA, sizeB, sizeA);
 
 #else
-    throw std::invalid_argument("Error: enable-fp16 is not enabled");
+      throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
+    }
   }
+
+  else {
+    throw std::invalid_argument(
+      "Error: Broadcasting not supported for these dimensions!");
+  }
 }
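For reference, the acceptance check above reduces to the following predicate (a sketch assuming dims are ordered {batch, channel, height, width}; broadcastable is a hypothetical helper, not part of the codebase):

#include <array>

// Equal dims always pass; otherwise inputB must be batch-1 and match
// inputA on channel, height, and width.
bool broadcastable(const std::array<unsigned, 4> &a,
                   const std::array<unsigned, 4> &b) {
  return a == b ||
         (b[0] == 1 && a[1] == b[1] && a[2] == b[2] && a[3] == b[3]);
}

// broadcastable({4, 1, 2, 3}, {1, 1, 2, 3}) -> true  (batch broadcast)
// broadcastable({4, 1, 2, 3}, {1, 1, 2, 4}) -> false (add_i_cl throws)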

6 changes: 3 additions & 3 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.h
@@ -64,11 +64,11 @@ void multiplyCl(Tensor &input, float const &value);
 
 /**
  * @brief Process data and dimensions for add operation
- * @param[in] input Tensor
- * @param[in] result Tensor
+ * @param[in] inputA Tensor
+ * @param[in] inputB Tensor
  * @param[in] RunLayerContext reference
  */
-void add_i_cl(Tensor const &input, Tensor &result);
+void add_i_cl(Tensor &inputA, Tensor const &inputB);
 
 } // namespace nntrainer
 #endif /* __BLAS_KERNEL_INTERFACE_H__ */
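Note the swap in qualifiers relative to the old signature: the sum is now accumulated in place into inputA (hence non-const), while inputB is the read-only operand that may be broadcast.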
12 changes: 6 additions & 6 deletions nntrainer/tensor/cl_operations/blas_kernel_strings.h
@@ -106,11 +106,11 @@ static const std::string sgemm_cl_transAB_kernel_ =
 })";
 
 static const std::string addition_cl_kernel_ =
-  R"(__kernel void addition_cl(__global const float* input, __global float* output, const unsigned int size) {
+  R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
 #pragma printf_support
   size_t idx = get_global_id(0);
-  if (idx < size) {
-    output[idx] = output[idx] + input[idx];
+  if (idx < size_res) {
+    output[idx] = output[idx] + input[idx % size_input];
   }
 })";

@@ -228,10 +228,10 @@ static const std::string addition_cl_kernel_fp16_ =
 R"(
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
-__kernel void addition_cl_fp16(__global const half* input, __global half* output, const unsigned int size) {
+__kernel void addition_cl_fp16(const __global half* input, __global half* output, unsigned int size_input, unsigned int size_res) {
   size_t idx = get_global_id(0);
-  if (idx < size) {
-    output[idx] = output[idx] + input[idx];
+  if (idx < size_res) {
+    output[idx] = output[idx] + input[idx % size_input];
   }
 })";

19 changes: 14 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -280,7 +280,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
   } while (false);
 }
 
-void addition_cl(const float *input, float *res, unsigned int size) {
+void addition_cl(const float *input, float *res, unsigned int size_input,
+                 unsigned int size_res) {
 
   bool result = false;

@@ -291,11 +292,13 @@ void addition_cl(const float *input, float *res, unsigned int size) {
     break;
   }
 
-  size_t dim1_size = sizeof(float) * size;
+  size_t dim1_size = sizeof(float) * size_input;
+  size_t dim2_size = sizeof(float) * size_res;
 
   opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
                         nullptr);
 
-  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
+  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
                           nullptr);
 
   result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
@@ -320,12 +323,18 @@ void addition_cl(const float *input, float *res, unsigned int size) {
     break;
   }
 
-  result = kernel_addition_ptr->SetKernelArguments(2, &size, sizeof(int));
+  result =
+    kernel_addition_ptr->SetKernelArguments(2, &size_input, sizeof(int));
   if (!result) {
     break;
   }
 
-  const int work_groups_count[3] = {(int)size, 1, 1};
+  result = kernel_addition_ptr->SetKernelArguments(3, &size_res, sizeof(int));
+  if (!result) {
+    break;
+  }
+
+  const int work_groups_count[3] = {(int)size_res, 1, 1};
   const int work_group_size[3] = {32, 32, 1}; // test-value
   result = cl_context_ref.command_queue_inst_.DispatchCommand(
     kernel_addition_ptr, work_groups_count, work_group_size);
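Note that the global work size is now derived from size_res rather than the input size, so one work item is dispatched per result element, while size_input is passed as a separate kernel argument to drive the modulo indexing.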
6 changes: 4 additions & 2 deletions nntrainer/tensor/cl_operations/blas_kernels.h
@@ -74,7 +74,8 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
  * @param[in] size number of elements in input vector
  * @param[in] context RunLayerContext reference
  */
-void addition_cl(const float *input, float *res, unsigned int size);
+void addition_cl(const float *input, float *res, unsigned int size_input,
+                 unsigned int size_res);
 
 /**
  * @brief sscal value element by element immediately
@@ -138,7 +139,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
  * @param[in] size number of elements in input vector
  * @param[in] context RunLayerContext reference
  */
-void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size);
+void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input,
+                 unsigned int size_res);
 
 /**
  * @brief fp16 sscal value element by element immediately
18 changes: 13 additions & 5 deletions nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
@@ -291,7 +291,8 @@ void sgemm_cl(bool TransA, bool TransB, const __fp16 *A, const __fp16 *B,
   } while (false);
 }
 
-void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
+void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size_input,
+                 unsigned int size_res) {
 
   bool result = false;

@@ -303,11 +304,12 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
     break;
   }
 
-  size_t dim1_size = sizeof(cl_half) * size;
+  size_t dim1_size = sizeof(cl_half) * size_input;
+  size_t dim2_size = sizeof(cl_half) * size_res;
   opencl::Buffer inputA(cl_context_ref.context_inst_, dim1_size, true,
                         nullptr);
 
-  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim1_size, true,
+  opencl::Buffer inOutRes(cl_context_ref.context_inst_, dim2_size, true,
                           nullptr);
 
   result = inputA.WriteData(cl_context_ref.command_queue_inst_, input);
@@ -333,12 +335,18 @@ void addition_cl(const __fp16 *input, __fp16 *res, unsigned int size) {
   }
 
   result =
-    kernel_addition_fp16_ptr->SetKernelArguments(2, &size, sizeof(int));
+    kernel_addition_fp16_ptr->SetKernelArguments(2, &size_input, sizeof(int));
   if (!result) {
     break;
   }
 
-  const int work_groups_count[3] = {(int)size, 1, 1};
+  result =
+    kernel_addition_fp16_ptr->SetKernelArguments(3, &size_res, sizeof(int));
+  if (!result) {
+    break;
+  }
+
+  const int work_groups_count[3] = {(int)size_res, 1, 1};
   const int work_group_size[3] = {32, 32, 1}; // test-value
   result = cl_context_ref.command_queue_inst_.DispatchCommand(
     kernel_addition_fp16_ptr, work_groups_count, work_group_size);
