From 5dc111c4e0b3ae3c13dd8e5ddab25de37cc684f7 Mon Sep 17 00:00:00 2001 From: skykongkong8 Date: Mon, 11 Nov 2024 13:22:31 +0900 Subject: [PATCH] [ BLAS ] Consider incremental index on scopy - The current internal SIMD implementation does not support incremental indices (i.e. incX or incY other than 1). - Let __scopy_fallback handle this case. **Self evaluation:** 1. Build test: [X]Passed [ ]Failed [ ]Skipped 2. Run test: [X]Passed [ ]Failed [ ]Skipped Signed-off-by: skykongkong8 --- nntrainer/tensor/blas_interface.cpp | 42 ++++++++++++++++------------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 166efcb0f9..0847890b49 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -597,15 +597,6 @@ static float sdot_raw(const unsigned int N, const float *X, return ret; } -static void __scopy_fallback(const unsigned int N, const float *X, - const int incX, float *Y, const int incY) { - unsigned int incy = abs(incY); - unsigned int incx = abs(incX); - - for (unsigned int i = 0; i < N; ++i) - Y[i * incy] = X[i * incx]; -} - static void sscal_raw(const unsigned int N, const float alpha, float *X, const int incX) { unsigned int incx = abs(incX); @@ -668,6 +659,15 @@ static unsigned int isamax_raw(const unsigned int N, const float *X, #endif +static void __scopy_fallback(const unsigned int N, const float *X, + const int incX, float *Y, const int incY) { + unsigned int incy = abs(incY); + unsigned int incx = abs(incX); + + for (unsigned int i = 0; i < N; ++i) + Y[i * incy] = X[i * incx]; +} + void sscal(const unsigned int N, const float alpha, void *X, const int incX, ml::train::TensorDim::DataType d_type) { @@ -870,20 +870,24 @@ void scopy(const unsigned int N, const void *X, const int incX, void *Y, void scopy(const unsigned int N, const float *X, const int incX, float *Y, const int incY) { -/** - * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy - 
* #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS - * openblas_set_num_threads(BLAS_NUM_THREADS); - * #endif - * cblas_scopy(N, X, incX, Y, incY); - */ + /** + * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy + * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS + * openblas_set_num_threads(BLAS_NUM_THREADS); + * #endif + * cblas_scopy(N, X, incX, Y, incY); + */ + if (incX == 1 && incY == 1) { #ifdef USE_NEON - nntrainer::neon::custom_scopy(N, X, incX, Y, incY); + nntrainer::neon::custom_scopy(N, X, incX, Y, incY); #elif USE_AVX - nntrainer::avx::custom_scopy(N, X, incX, Y, incY); + nntrainer::avx::custom_scopy(N, X, incX, Y, incY); #else - __scopy_fallback(N, X, incX, Y, incY); + __scopy_fallback(N, X, incX, Y, incY); #endif + } else { + __scopy_fallback(N, X, incX, Y, incY); + } } void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,