Skip to content

Commit

Permalink
[GPU/OpenCL] Reusable blas OpenCL kernels
Browse files Browse the repository at this point in the history
Added blas_kernels to enhance reusability of the common blas kernels.
Used FullyConnected interface for both CPU and GPU calls.

Signed-off-by: Debadri Samaddar <[email protected]>
  • Loading branch information
s-debadri committed May 14, 2024
1 parent 5a34d75 commit 9c45213
Show file tree
Hide file tree
Showing 8 changed files with 396 additions and 394 deletions.
12 changes: 1 addition & 11 deletions api/ccapi/include/layer.h
Original file line number Diff line number Diff line change
Expand Up @@ -289,21 +289,11 @@ Input(const std::vector<std::string> &properties = {}) {
/**
* @brief Helper function to create fully connected layer
*/
inline std::unique_ptr<Layer>
FullyConnected(const std::vector<std::string> &properties = {}) {
return createLayer(LayerType::LAYER_FC, properties);
}

#ifdef ENABLE_OPENCL
/**
* @brief Helper function to create fully connected layer for GPU
*/
inline std::unique_ptr<Layer> FullyConnectedCl(
inline std::unique_ptr<Layer> FullyConnected(
const std::vector<std::string> &properties = {},
const LayerComputeEngine &compute_engine = LayerComputeEngine::CPU) {
return createLayer(LayerType::LAYER_FC, properties, compute_engine);
}
#endif

/**
* @brief Helper function to create batch normalization layer
Expand Down
307 changes: 307 additions & 0 deletions nntrainer/layers/cl_layers/blas_kernels.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,307 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Debadri Samaddar <[email protected]>
*
* @file blas_kernels.cpp
* @date 14 May 2024
* @brief Common blas OpenCL kernels
* @see https://github.com/nnstreamer/nntrainer
* @author Debadri Samaddar <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include <blas_kernels.h>

namespace nntrainer {

/**
 * @brief OpenCL source for sgemv: one work item per output element i,
 *        Y[i] = sum_j A[i + j * N] * X[j] with j in [0, M).
 *
 * Fix: removed the dead `unsigned int i, j;` declaration (the loop shadowed
 * `j`) and replaced the obscure `Y[i] * 0.0f` zero-initialization (which
 * would propagate NaN/Inf from stale Y contents) with a plain 0.0f.
 */
std::string sgemv_cl_kernel_ =
  R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
                      __global float* Y, unsigned int M, unsigned int N) {
        unsigned int i = get_global_id(0);
        float y0 = 0.0f;
        for (unsigned int j = 0; j < M; j++)
          y0 += A[i + j * N] * X[j];
        Y[i] = y0;
    })";

/**
 * @brief OpenCL source for dot product: serial reduction by a single work
 *        item, *res = sum_i A[i] * X[i] for i in [0, K).
 *
 * Fix: `res` was previously a by-value float parameter, so the computed sum
 * never left the device and the host always observed 0. It is now a
 * __global output pointer the host reads back after dispatch.
 */
std::string dot_cl_kernel_ =
  R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) {
        float sum = 0.0f;
        for (unsigned int i = 0; i < K; i++){
          sum += A[i] * X[i];
        }
        *res = sum;
    })";

/**
 * @brief OpenCL source for sgemm: one work item per output row m,
 *        C[m][n] = sum_k A[m * lda + k] * B[k * ldb + n] (C overwritten).
 *
 * Fix: removed the dead `c_old` read (loaded from C, never used) and the
 * single-use `a`/`b` temporaries; accumulator now uses a float literal
 * instead of the double literal 0.0.
 */
std::string sgemm_cl_kernel_ =
  R"(__kernel void sgemm_cl(const __global float* A, const __global float* B,
                      __global float* C, unsigned int M, unsigned int N, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
        
        unsigned int m = get_global_id(0);
        for (unsigned int n = 0; n < N; ++n) {
          float c = 0.0f;
          for (unsigned int k = 0; k < K; ++k) {
            c += A[m * lda + k] * B[k * ldb + n];
          }
          C[m * ldc + n] = c;
        }
    })";

/**
 * @brief Global kernel handles shared by the helpers below.
 *
 * Each helper passes its handle to RunLayerContext::clCreateKernel on every
 * call together with the matching kernel source string above.
 *
 * NOTE(review): mutable globals — not thread-safe if two layers dispatch
 * concurrently; confirm single-threaded use of these entry points.
 */
opencl::Kernel kernel_sgemv;
opencl::Kernel kernel_sgemm;
opencl::Kernel kernel_dot;

/**
 * @brief Single-precision GEMV on GPU.
 *
 * Per the kernel indexing (A[i + j * dim2], j in [0, dim1)), this computes
 * Y[i] = sum_j A[j][i] * X[j] for a row-major dim1 x dim2 matrix A.
 *
 * @param matAdata matrix A, dim1 * dim2 floats
 * @param vecXdata input vector X, dim1 floats
 * @param vecYdata output vector Y, dim2 floats; overwritten with the result
 * @param dim1 reduction length (rows of A / length of X)
 * @param dim2 output length (columns of A / length of Y)
 * @param lda leading dimension of A (not passed to the kernel at present)
 * @param context layer context providing OpenCL context/queue and kernels
 *
 * Fixes: the A buffer was sized dim1_size * dim2_size, i.e.
 * sizeof(float)^2 * dim1 * dim2 — 4x too large (and risks reading past the
 * end of matAdata if WriteData copies the full buffer size — confirm
 * against opencl::Buffer::WriteData). The dispatch count used dim1, but the
 * kernel writes Y[i] for each global id and Y holds dim2 elements, so
 * dim1 != dim2 either truncated the output or wrote out of bounds.
 */
void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
              unsigned int dim1, unsigned int dim2, unsigned int lda,
              RunLayerContext &context) {

  bool result = false;

  do {
    result = context.clCreateKernel(sgemv_cl_kernel_,
                                    context.LayerKernel::SGEMV, kernel_sgemv);
    if (!result) {
      break;
    }

    size_t dim1_size = sizeof(float) * dim1;
    size_t dim2_size = sizeof(float) * dim2;
    // A holds dim1 * dim2 floats (byte size, not a product of byte sizes)
    opencl::Buffer inputA(context.context_inst_,
                          sizeof(float) * dim1 * dim2, true, nullptr);

    opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);

    opencl::Buffer inOutY(context.context_inst_, dim2_size, true, nullptr);

    result = inputA.WriteData(context.command_queue_inst_, matAdata);
    if (!result) {
      break;
    }

    result = inputX.WriteData(context.command_queue_inst_, vecXdata);
    if (!result) {
      break;
    }

    result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
    if (!result) {
      break;
    }

    result = kernel_sgemv.SetKernelArguments(0, &inputA, sizeof(cl_mem));
    if (!result) {
      break;
    }

    result = kernel_sgemv.SetKernelArguments(1, &inputX, sizeof(cl_mem));
    if (!result) {
      break;
    }

    result = kernel_sgemv.SetKernelArguments(2, &inOutY, sizeof(cl_mem));
    if (!result) {
      break;
    }

    result = kernel_sgemv.SetKernelArguments(3, &dim1, sizeof(int));
    if (!result) {
      break;
    }

    result = kernel_sgemv.SetKernelArguments(4, &dim2, sizeof(int));
    if (!result) {
      break;
    }

    // one work item per output element of Y (length dim2)
    const int work_groups_count[3] = {(int)dim2, 1, 1};
    const int work_group_size[3] = {32, 32, 1}; // test-value

    result = context.command_queue_inst_.DispatchCommand(
      kernel_sgemv, work_groups_count, work_group_size);
    if (!result) {
      break;
    }

    result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
    if (!result) {
      break;
    }

  } while (false);
}

float dot_cl(const float *matAdata, const float *vecXdata, unsigned int dim1,
RunLayerContext &context) {

bool result = false;

float cl_ret = 0;

do {
result = context.clCreateKernel(dot_cl_kernel_, context.LayerKernel::DOT,
kernel_dot);
if (!result) {
break;
}

size_t dim1_size = sizeof(float) * dim1;

opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr);

opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);

result = inputA.WriteData(context.command_queue_inst_, matAdata);
if (!result) {
break;
}

result = inputX.WriteData(context.command_queue_inst_, vecXdata);
if (!result) {
break;
}

result = kernel_dot.SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_dot.SetKernelArguments(1, &inputX, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_dot.SetKernelArguments(2, &dim1, sizeof(int));
if (!result) {
break;
}

result = kernel_dot.SetKernelArguments(3, &cl_ret, sizeof(float));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)dim1, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value

result = context.command_queue_inst_.DispatchCommand(
kernel_dot, work_groups_count, work_group_size);
if (!result) {
break;
}

} while (false);

return cl_ret;
}

void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
unsigned int N, unsigned int K, unsigned int lda,
unsigned int ldb, unsigned int ldc, RunLayerContext &context) {

bool result = false;

do {
result = context.clCreateKernel(sgemm_cl_kernel_,
context.LayerKernel::SGEMM, kernel_sgemm);
if (!result) {
break;
}

size_t m_size = sizeof(float) * M;
size_t n_size = sizeof(float) * N;
size_t k_size = sizeof(float) * K;
opencl::Buffer inputA(context.context_inst_, m_size * k_size, true,
nullptr);

opencl::Buffer inputB(context.context_inst_, k_size * n_size, true,
nullptr);

opencl::Buffer inOutC(context.context_inst_, m_size * n_size, true,
nullptr);

result = inputA.WriteData(context.command_queue_inst_, A);
if (!result) {
break;
}

result = inputB.WriteData(context.command_queue_inst_, B);
if (!result) {
break;
}

result = inOutC.WriteData(context.command_queue_inst_, C);
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(3, &M, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(4, &N, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(5, &K, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(6, &lda, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(7, &ldb, sizeof(int));
if (!result) {
break;
}

result = kernel_sgemm.SetKernelArguments(8, &ldc, sizeof(int));
if (!result) {
break;
}

const int work_groups_count[3] = {(int)M, 1, 1};
const int work_group_size[3] = {32, 32, 1}; // test-value

result = context.command_queue_inst_.DispatchCommand(
kernel_sgemm, work_groups_count, work_group_size);
if (!result) {
break;
}

result = inOutC.ReadData(context.command_queue_inst_, C);
if (!result) {
break;
}

} while (false);
}
} // namespace nntrainer
Loading

0 comments on commit 9c45213

Please sign in to comment.