[refactor] Moved blas_kernels to tensor directory
Moved common OpenCL BLAS kernels to the tensor directory.
Added pre-processing functions as common helpers that can be re-used.

Signed-off-by: Debadri Samaddar <[email protected]>
s-debadri committed Jun 5, 2024
1 parent ec4c048 commit a1f2f8a
Showing 10 changed files with 310 additions and 182 deletions.
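At the call sites the refactor is essentially a one-line change: the dot-product preprocessing that previously lived in FullyConnectedLayerCl::fcDotProcess is now reached through the shared dotCl() entry point from blas_kernel_interface.h. A minimal sketch of the new call pattern (the wrapper function is hypothetical; Tensor, RunLayerContext and dotCl are the nntrainer types shown in the diff below):

#include <blas_kernel_interface.h> // presumably added alongside blas_kernel_interface.cpp
                                   // under nntrainer/tensor/cl_operations
#include <layer_context.h>
#include <tensor.h>

namespace nntrainer {

// Sketch only: stands in for the layer's forwarding path shown below.
static void forwardDot(Tensor const &input, Tensor const &weight,
                       Tensor &hidden, RunLayerContext &context) {
  // Before: fcDotProcess(input, weight, hidden, context); // layer-local helper
  // After: the shared interface resolves M/N/K, creates the result tensor if
  // it is empty, and dispatches to dot_cl / sgemv_cl / sgemm_cl internally.
  dotCl(input, weight, hidden, context);
}

} // namespace nntrainer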
114 changes: 4 additions & 110 deletions nntrainer/layers/cl_layers/fc_layer_cl.cpp
@@ -12,7 +12,7 @@
*
*/

#include <blas_kernels.h>
#include <blas_kernel_interface.h>
#include <common_properties.h>
#include <fc_layer_cl.h>
#include <layer_context.h>
@@ -126,9 +126,9 @@ void FullyConnectedLayerCl::forwarding(RunLayerContext &context,

weight.dequantize(weight_, axis);

fcDotProcess(input_, weight_, hidden_, context);
dotCl(input_, weight_, hidden_, context);
} else {
fcDotProcess(input_, weight, hidden_, context);
dotCl(input_, weight, hidden_, context);
}

if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
@@ -138,112 +138,6 @@ void FullyConnectedLayerCl::forwarding(RunLayerContext &context,
}
}

void FullyConnectedLayerCl::fcDotProcess(Tensor const &input,
Tensor const &weight, Tensor &result,
RunLayerContext &context) {
// to do:
// NNTR_THROW_IF(!contiguous, std::invalid_argument)
// << getName() << " is not contiguous. Cannot dot product.";

unsigned int dim1, dim2, mdim1, mdim2;
if (input.getFormat() == Tformat::NHWC) {
dim1 = input.batch() * input.height() * input.width();
dim2 = input.channel();
mdim1 = weight.batch() * weight.height() * weight.width();
mdim2 = weight.channel();
} else {
dim1 = input.batch() * input.channel() * input.height();
dim2 = input.width();
mdim1 = weight.batch() * weight.channel() * weight.height();
mdim2 = weight.width();
}

unsigned int M, N, K, lda, ldb, ldc;
if (dim2 != mdim1)
throw std::runtime_error("Error: incompatible dimensions for dot product");
K = mdim1; /** == dim2 */
N = mdim2;
M = dim1;
if (input.getFormat() == Tformat::NHWC) {
CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(),
input.width(),
input.getTensorType()); // NHWC Result Tensor
} else {
CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(),
N, input.getTensorType());
}

lda = dim2;
ldb = mdim2;
ldc =
(input.getFormat() == Tformat::NHWC) ? result.channel() : result.width();

if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
const float *data = input.getData();
const float *mdata = weight.getData();
float *rdata = result.getData();

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
/// transpose.
/// For example, there is no case like (1 * K) X (1 * K) while
/// (1 * K) X (1 * M) can be a case
/// case1: (1 * K) X (K * 1)
if (M == 1 && N == 1) {
*rdata = dot_cl(data, mdata, K, context) + (*rdata);
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use gemm
else {
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
}
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = input.getData<_FP16>();
const _FP16 *mdata = weight.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
const float alpha = 1.0f;

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
/// transpose.
/// For example, there is no case like (1 * K) X (1 * K) while
/// (1 * K) X (1 * M) can be a case
/// case1: (1 * K) X (K * 1)
if (M == 1 && N == 1) {
*rdata = dot_cl(data, mdata, K, context) + (*rdata);
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use sgemm
else {
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context,
unsigned int from,
unsigned int to,
@@ -276,7 +170,7 @@ void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context,
Tensor input_step = input_.getSharedDataTensor(input_step_dim, 0, true);
Tensor hidden_step = hidden_.getSharedDataTensor(hidden_step_dim, 0, true);

fcDotProcess(input_step, weight, hidden_step, context);
dotCl(input_step, weight, hidden_step, context);

if (auto &disable_bias = std::get<props::DisableBias>(*layer_impl_props);
disable_bias.empty() || disable_bias.get() == false) {
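For reference, the dimension mapping that fcDotProcess carried (and that dotCl now performs) reduces the 4-D tensors to a single GEMM/GEMV call. A small self-contained illustration with hypothetical NCHW shapes (the values are not from the diff):

#include <cstdio>

int main() {
  // Hypothetical shapes: input 8x1x1x32, weight 1x1x32x16 (NCHW).
  unsigned int in_b = 8, in_c = 1, in_h = 1, in_w = 32;
  unsigned int w_b = 1, w_c = 1, w_h = 32, w_w = 16;

  // Same reduction as the NCHW branch in the code above.
  unsigned int dim1 = in_b * in_c * in_h, dim2 = in_w; // 8, 32
  unsigned int mdim1 = w_b * w_c * w_h, mdim2 = w_w;   // 32, 16

  if (dim2 != mdim1)
    return 1; // incompatible dimensions for dot product

  unsigned int M = dim1, K = mdim1, N = mdim2;   // 8, 32, 16
  unsigned int lda = dim2, ldb = mdim2, ldc = N; // 32, 16, 16

  // M > 1 && N > 1, so dotCl would dispatch to
  // sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
  // M == 1 or N == 1 would instead hit the dot_cl / sgemv_cl shortcuts.
  std::printf("M=%u N=%u K=%u lda=%u ldb=%u ldc=%u\n", M, N, K, lda, ldb, ldc);
  return 0;
}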
16 changes: 0 additions & 16 deletions nntrainer/layers/cl_layers/fc_layer_cl.h
@@ -19,12 +19,6 @@
#include <common_properties.h>
#include <layer_impl.h>

#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
do { \
if (tensor.empty()) \
tensor = Tensor(__VA_ARGS__); \
} while (0);

namespace nntrainer {

/**
@@ -96,16 +90,6 @@ class FullyConnectedLayerCl : public LayerImpl {
return FullyConnectedLayerCl::type;
};

/**
* @brief Process data and dimensions for dot operation used in fc_layer
* @param[in] input Tensor
* @param[in] weight Tensor
* @param[in] result Tensor
* @param[in] RunLayerContext reference
*/
void fcDotProcess(Tensor const &input, Tensor const &weight, Tensor &result,
RunLayerContext &context);

/**
* @copydoc Layer::supportBackwarding()
*/
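One detail worth noting: CREATE_IF_EMPTY_DIMS is removed from fc_layer_cl.h above but is still used by blas_kernel_interface.cpp below, so the macro presumably moves into one of the relocated headers that is not among the files shown here. For reference, its previous definition:

// As formerly defined in fc_layer_cl.h; its new location (likely
// blas_kernel_interface.h or a shared tensor header) is an assumption,
// since that header is not part of the files shown in this diff.
#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
  do {                                    \
    if (tensor.empty())                   \
      tensor = Tensor(__VA_ARGS__);       \
  } while (0);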
5 changes: 0 additions & 5 deletions nntrainer/layers/cl_layers/meson.build
@@ -1,12 +1,7 @@
cl_layer_sources = [
'fc_layer_cl.cpp',
'blas_kernels.cpp',
]

if get_option('enable-fp16')
cl_layer_sources += 'blas_kernels_fp16.cpp'
endif

foreach s : cl_layer_sources
nntrainer_sources += meson.current_source_dir() / s
endforeach
189 changes: 189 additions & 0 deletions nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -0,0 +1,189 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2024 Debadri Samaddar <[email protected]>
*
* @file blas_kernel_interface.cpp
* @date 5 June 2024
* @brief Interface for blas OpenCL kernels
* @see https://github.com/nnstreamer/nntrainer
* @author Debadri Samaddar <[email protected]>
* @bug No known bugs except for NYI items
*
*/

#include <blas_kernel_interface.h>
#include <blas_kernels.h>

namespace nntrainer {
void dotBatchedCl(Tensor const &input, Tensor const &m, Tensor &result,
RunLayerContext &context, bool trans, bool trans_m) {
if (!result.isAllocated())
throw std::invalid_argument(
"Output tensor must be preallocated for dotBatched operation");
for (unsigned int b = 0; b < input.batch(); b++) {
/** @todo try using transpose to speedup the operation */
const Tensor this_b = input.getBatchSlice(b, 1);
Tensor m_b = m.getBatchSlice(b, 1);
Tensor result_b = result.getBatchSlice(b, 1);

dotCl(this_b, m_b, result_b, context, trans, trans_m);
}
}

void dotCl(Tensor const &input, Tensor const &m, Tensor &result,
RunLayerContext &context, bool trans, bool trans_m) {
unsigned int dim1, dim2, mdim1, mdim2;
if (input.getFormat() == Tformat::NHWC) {
dim1 = input.batch() * input.height() * input.width();
dim2 = input.channel();
mdim1 = m.batch() * m.height() * m.width();
mdim2 = m.channel();
} else {
dim1 = input.batch() * input.channel() * input.height();
dim2 = input.width();
mdim1 = m.batch() * m.channel() * m.height();
mdim2 = m.width();
}

unsigned int M, N, K, lda, ldb, ldc;

if (!trans && !trans_m) {
if (dim2 != mdim1)
throw std::runtime_error(
"Error: incompatible dimensions for dot product");
K = mdim1; /** == dim2 */
N = mdim2;
M = dim1;
if (input.getFormat() == Tformat::NHWC) {
CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(),
input.width(),
input.getTensorType()); // NHWC Result Tensor
} else {
CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(),
input.height(), N, input.getTensorType());
}
} else if (!trans && trans_m) {
if (dim2 != mdim2)
throw std::runtime_error(
"Error: incompatible dimensions for dot product");
K = mdim2; /** == dim2 */
N = mdim1;
M = dim1;
if (input.getFormat() == Tformat::NHWC) {
CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(),
input.width(), input.getTensorType());
} else {
CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(),
input.height(), N, input.getTensorType());
}
} else if (trans && !trans_m) {
if (dim1 != mdim1)
throw std::runtime_error(
"Error: incompatible dimensions for dot product");
K = mdim1; /** == dim1 */
N = mdim2;
M = dim2;
if (input.getFormat() == Tformat::NHWC) {
CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, input.getTensorType());
} else {
CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, input.getTensorType());
}
} else {
if (dim1 != mdim2)
throw std::runtime_error(
"Error: incompatible dimensions for dot product");
K = mdim2; /** == dim1 */
N = mdim1;
M = dim2;
if (input.getFormat() == Tformat::NHWC) {
CREATE_IF_EMPTY_DIMS(result, 1, N, M, 1, input.getTensorType());
} else {
CREATE_IF_EMPTY_DIMS(result, 1, 1, M, N, input.getTensorType());
}
}

lda = dim2;
ldb = mdim2;
ldc =
(input.getFormat() == Tformat::NHWC) ? result.channel() : result.width();

if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
const float *data = input.getData();
const float *mdata = m.getData();
float *rdata = result.getData();
enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
/// transpose.
/// For example, there is no case like (1 * K) X (1 * K) while
/// (1 * K) X (1 * M) can be a case
/// case1: (1 * K) X (K * 1)
if (M == 1 && N == 1) {
*rdata = dot_cl(data, mdata, K, context) + (*rdata);
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
transA == CblasTrans
  ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
  : sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
transB == CblasTrans
  ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
  : sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use gemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
}
} else if (input.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
const _FP16 *data = input.getData<_FP16>();
const _FP16 *mdata = m.getData<_FP16>();
_FP16 *rdata = result.getData<_FP16>();
enum CBLAS_TRANSPOSE transA = trans ? CblasTrans : CblasNoTrans;
enum CBLAS_TRANSPOSE transB = trans_m ? CblasTrans : CblasNoTrans;

/// shortcut handling in case of vector
/// for vector, (1 * K) == (K * 1) in current memory layout...
/// and please note that N, K, M are fixed placeholders after considering
/// transpose.
/// For example, there is no case like (1 * K) X (1 * K) while
/// (1 * K) X (1 * M) can be a case
/// case1: (1 * K) X (K * 1)
if (M == 1 && N == 1) {
*rdata = dot_cl(data, mdata, K, context) + (*rdata);
}
/// case2: (M * K) X (K * 1)
else if (N == 1) {
transA == CblasTrans
  ? sgemv_cl(data, mdata, rdata, dim2, dim1, lda, context)
  : sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
}
/// case3: (1 * K) X (K * N) = 1 * N = R
/// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
/// Effectively a translation of sgemv
else if (M == 1) {
transB = transB == CblasTrans ? CblasNoTrans : CblasTrans;
transB == CblasTrans
  ? sgemv_cl(mdata, data, rdata, mdim2, mdim1, ldb, context)
  : sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
}
/// case others: use sgemm
else {
// transA == false, transB == false
sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
// todo: other condition implementations
}
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
}
}

} // namespace nntrainer
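A minimal usage sketch of the relocated interface (illustrative only: the shapes and the example function are hypothetical, trans/trans_m are assumed to default to false in blas_kernel_interface.h, and `context` is a RunLayerContext supplied by the calling layer):

#include <blas_kernel_interface.h>
#include <layer_context.h>
#include <tensor.h>

void example(nntrainer::RunLayerContext &context) {
  using nntrainer::Tensor;

  // Plain product: (4 x 8) . (8 x 16) -> (4 x 16); M > 1 && N > 1 takes the sgemm_cl path.
  // Tensor contents are left unspecified here; only the call pattern is shown.
  Tensor a(1, 1, 4, 8), b(1, 1, 8, 16), c;
  nntrainer::dotCl(a, b, c, context); // c is created via CREATE_IF_EMPTY_DIMS

  // Batched variant: the output must be preallocated; one dotCl per batch slice.
  Tensor x(2, 1, 4, 8), y(2, 1, 8, 16), out(2, 1, 4, 16);
  nntrainer::dotBatchedCl(x, y, out, context);
}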
