[ Tensor ] Use SIMD accelerated transpose if possible
- For a height-width transpose, we can use SIMD-accelerated code.
- Use the SIMD version when possible; otherwise fall back to the scalar path (see the sketch after this list).
- With this commit, the following are expected to be accelerated, or can be accelerated with ease in the near future:
  - "0:2:1" transpose
  - BiQHGEMM
  - HGEMM
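
For reference, the "0:2:1" direction swaps the height and width axes of every (batch, channel) slice. A minimal scalar sketch of that semantics (hypothetical helper, not the code in this commit):

```cpp
#include <vector>

// Hypothetical scalar reference for the "0:2:1" (height-width) transpose:
// out[b][c][w][h] = in[b][c][h][w] for every batch b and channel c.
// This is the operation the NEON path accelerates.
void transpose_0_2_1_ref(const std::vector<float> &in, std::vector<float> &out,
                         unsigned int B, unsigned int C, unsigned int H,
                         unsigned int W) {
  for (unsigned int b = 0; b < B; ++b)
    for (unsigned int c = 0; c < C; ++c)
      for (unsigned int h = 0; h < H; ++h)
        for (unsigned int w = 0; w < W; ++w)
          out[((b * C + c) * W + w) * H + h] =
            in[((b * C + c) * H + h) * W + w];
}
```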

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed May 23, 2024
1 parent 5d75e5a commit c3b6175
Showing 3 changed files with 13 additions and 2 deletions.
1 change: 1 addition & 0 deletions nntrainer/tensor/blas_interface.cpp
@@ -535,6 +535,7 @@ void transpose_matrix(const unsigned int M, const unsigned int N,
                      const _FP16 *src, unsigned int ld_src, _FP16 *dst,
                      unsigned int ld_dst) {
#ifdef USE_NEON
  /// @note The final form of transpose_neon will not have a fallback. Debugging WIP.
  if ((M & 0x3) == 0) {
    transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst);
  } else {
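
The `(M & 0x3) == 0` guard dispatches to the NEON kernel only when M is a multiple of 4, since such kernels are typically built from 4x4 blocks. The matrix_transpose_neon implementation itself is not part of this diff; below is a minimal sketch of the 4x4 fp16 block transpose that kernels of this kind are usually composed of (assumes AArch64 with fp16 vector support; illustrative only):

```cpp
#include <arm_neon.h>

// Transpose one 4x4 block of fp16 values.
// ld_src / ld_dst are the leading dimensions (row strides) of src / dst.
static void transpose_4x4_f16(const float16_t *src, unsigned int ld_src,
                              float16_t *dst, unsigned int ld_dst) {
  float16x4_t r0 = vld1_f16(src + 0 * ld_src); // a0 a1 a2 a3
  float16x4_t r1 = vld1_f16(src + 1 * ld_src); // b0 b1 b2 b3
  float16x4_t r2 = vld1_f16(src + 2 * ld_src); // c0 c1 c2 c3
  float16x4_t r3 = vld1_f16(src + 3 * ld_src); // d0 d1 d2 d3
  // Interleave adjacent fp16 lanes: {a0 b0 a2 b2}, {a1 b1 a3 b3}, ...
  float16x4x2_t t01 = vtrn_f16(r0, r1);
  float16x4x2_t t23 = vtrn_f16(r2, r3);
  // Interleave 32-bit pairs to finish the transpose.
  float32x2x2_t u0 = vtrn_f32(vreinterpret_f32_f16(t01.val[0]),
                              vreinterpret_f32_f16(t23.val[0]));
  float32x2x2_t u1 = vtrn_f32(vreinterpret_f32_f16(t01.val[1]),
                              vreinterpret_f32_f16(t23.val[1]));
  vst1_f16(dst + 0 * ld_dst, vreinterpret_f16_f32(u0.val[0])); // a0 b0 c0 d0
  vst1_f16(dst + 1 * ld_dst, vreinterpret_f16_f32(u1.val[0])); // a1 b1 c1 d1
  vst1_f16(dst + 2 * ld_dst, vreinterpret_f16_f32(u0.val[1])); // a2 b2 c2 d2
  vst1_f16(dst + 3 * ld_dst, vreinterpret_f16_f32(u1.val[1])); // a3 b3 c3 d3
}
```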
4 changes: 4 additions & 0 deletions nntrainer/tensor/meson.build
@@ -54,6 +54,10 @@ if get_option('enable-fp16')
    subdir('hgemm')
    nntrainer_inc += include_directories('hgemm')
    nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'

    subdir('matrix_transpose_neon')
    nntrainer_inc += include_directories('matrix_transpose_neon')
    nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
  endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
10 changes: 8 additions & 2 deletions nntrainer/tensor/tensor.cpp
@@ -2330,7 +2330,6 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
  unsigned int SL, SI, SJ, SK;

  out.reshape(dim.transpose(direction));

  int indexI = direction[0] - '0';
  int indexJ = direction[2] - '0';

@@ -2402,7 +2401,14 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
    }
  } else {
    if (is_format_nchw) {
      transposeloop(l, i, k, j, SL, SI, SK, SJ);
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int c = 0; c < channel(); ++c) {
          transpose_matrix(
            height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
            width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
            out.width());
        }
      }
    } else {
      transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
    }
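
The new loop treats each (batch, channel) slice as an independent height × width matrix and hands it to transpose_matrix. A hypothetical scalar equivalent, to pin down the ld_src / ld_dst semantics of one such call (not the actual fallback implementation):

```cpp
// What transpose_matrix(M, N, src, ld_src, dst, ld_dst) computes for one
// slice: dst becomes the N x M transpose of the M x N matrix src.
// Hypothetical reference only.
template <typename T>
void transpose_matrix_ref(unsigned int M, unsigned int N, const T *src,
                          unsigned int ld_src, T *dst, unsigned int ld_dst) {
  for (unsigned int m = 0; m < M; ++m)
    for (unsigned int n = 0; n < N; ++n)
      dst[n * ld_dst + m] = src[m * ld_src + n];
}
```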
