[ Tensor ] Use SIMD accelerated transpose if possible
- For a height-width transpose, we can use SIMD-accelerated code.
- Use the SIMD version when possible; otherwise fall back to the scalar path (see the sketch after this list).
- With this commit, the following are expected to be accelerated, or can be accelerated with ease in the near future:
  - "0:2:1" transpose
  - BiQHGEMM
  - HGEMM
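
For reference, the "0:2:1" direction swaps the height and width axes of every (batch, channel) slice. A minimal scalar sketch of that semantics (hypothetical helper, not the code in this commit):

```cpp
#include <vector>

// Hypothetical scalar reference for the "0:2:1" (height-width) transpose:
// out[b][c][w][h] = in[b][c][h][w] for every batch b and channel c.
// This is the operation the NEON path accelerates.
void transpose_0_2_1_ref(const std::vector<float> &in, std::vector<float> &out,
                         unsigned int B, unsigned int C, unsigned int H,
                         unsigned int W) {
  for (unsigned int b = 0; b < B; ++b)
    for (unsigned int c = 0; c < C; ++c)
      for (unsigned int h = 0; h < H; ++h)
        for (unsigned int w = 0; w < W; ++w)
          out[((b * C + c) * W + w) * H + h] =
            in[((b * C + c) * H + h) * W + w];
}
```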

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 committed May 23, 2024
1 parent 5d75e5a commit c3b6175
Showing 3 changed files with 13 additions and 2 deletions.
1 change: 1 addition & 0 deletions nntrainer/tensor/blas_interface.cpp
@@ -535,6 +535,7 @@ void transpose_matrix(const unsigned int M, const unsigned int N,
                      const _FP16 *src, unsigned int ld_src, _FP16 *dst,
                      unsigned int ld_dst) {
#ifdef USE_NEON
  /// @note The final form of transpose_neon will not have a fallback. Debugging WIP.
  if ((M & 0x3) == 0) {
    transpose_neon<_FP16>(M, N, src, ld_src, dst, ld_dst);
  } else {
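
The `(M & 0x3) == 0` guard dispatches to the NEON kernel only when M is a multiple of 4, since such kernels are typically built from 4x4 blocks. The matrix_transpose_neon implementation itself is not part of this diff; below is a minimal sketch of the 4x4 fp16 block transpose that kernels of this kind are usually composed of (assumes AArch64 with fp16 vector support; illustrative only):

```cpp
#include <arm_neon.h>

// Transpose one 4x4 block of fp16 values.
// ld_src / ld_dst are the leading dimensions (row strides) of src / dst.
static void transpose_4x4_f16(const float16_t *src, unsigned int ld_src,
                              float16_t *dst, unsigned int ld_dst) {
  float16x4_t r0 = vld1_f16(src + 0 * ld_src); // a0 a1 a2 a3
  float16x4_t r1 = vld1_f16(src + 1 * ld_src); // b0 b1 b2 b3
  float16x4_t r2 = vld1_f16(src + 2 * ld_src); // c0 c1 c2 c3
  float16x4_t r3 = vld1_f16(src + 3 * ld_src); // d0 d1 d2 d3
  // Interleave adjacent fp16 lanes: {a0 b0 a2 b2}, {a1 b1 a3 b3}, ...
  float16x4x2_t t01 = vtrn_f16(r0, r1);
  float16x4x2_t t23 = vtrn_f16(r2, r3);
  // Interleave 32-bit pairs to finish the transpose.
  float32x2x2_t u0 = vtrn_f32(vreinterpret_f32_f16(t01.val[0]),
                              vreinterpret_f32_f16(t23.val[0]));
  float32x2x2_t u1 = vtrn_f32(vreinterpret_f32_f16(t01.val[1]),
                              vreinterpret_f32_f16(t23.val[1]));
  vst1_f16(dst + 0 * ld_dst, vreinterpret_f16_f32(u0.val[0])); // a0 b0 c0 d0
  vst1_f16(dst + 1 * ld_dst, vreinterpret_f16_f32(u1.val[0])); // a1 b1 c1 d1
  vst1_f16(dst + 2 * ld_dst, vreinterpret_f16_f32(u0.val[1])); // a2 b2 c2 d2
  vst1_f16(dst + 3 * ld_dst, vreinterpret_f16_f32(u1.val[1])); // a3 b3 c3 d3
}
```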
4 changes: 4 additions & 0 deletions nntrainer/tensor/meson.build
@@ -54,6 +54,10 @@ if get_option('enable-fp16')
    subdir('hgemm')
    nntrainer_inc += include_directories('hgemm')
    nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'

    subdir('matrix_transpose_neon')
    nntrainer_inc += include_directories('matrix_transpose_neon')
    nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
  endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
10 changes: 8 additions & 2 deletions nntrainer/tensor/tensor.cpp
@@ -2330,7 +2330,6 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
  unsigned int SL, SI, SJ, SK;

  out.reshape(dim.transpose(direction));

  int indexI = direction[0] - '0';
  int indexJ = direction[2] - '0';

@@ -2402,7 +2401,14 @@ Tensor &Tensor::transpose(const std::string &direction, Tensor &out) const {
    }
  } else {
    if (is_format_nchw) {
      transposeloop(l, i, k, j, SL, SI, SK, SJ);
      for (unsigned int b = 0; b < batch(); ++b) {
        for (unsigned int c = 0; c < channel(); ++c) {
          transpose_matrix(
            height(), width(), getData<_FP16>() + getIndex(b, c, 0, 0),
            width(), out.getData<_FP16>() + out.getIndex(b, c, 0, 0),
            out.width());
        }
      }
    } else {
      transposeloop_nhwc(l, k, j, i, SL, SK, SJ, SI);
    }
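
The new loop treats each (batch, channel) slice as an independent height × width matrix and hands it to transpose_matrix. A hypothetical scalar equivalent, to pin down the ld_src / ld_dst semantics of one such call (not the actual fallback implementation):

```cpp
// What transpose_matrix(M, N, src, ld_src, dst, ld_dst) computes for one
// slice: dst becomes the N x M transpose of the M x N matrix src.
// Hypothetical reference only.
template <typename T>
void transpose_matrix_ref(unsigned int M, unsigned int N, const T *src,
                          unsigned int ld_src, T *dst, unsigned int ld_dst) {
  for (unsigned int m = 0; m < M; ++m)
    for (unsigned int n = 0; n < N; ++n)
      dst[n * ld_dst + m] = src[m * ld_src + n];
}
```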
