Patch summary (free text before the first "diff --git" header; patch tools skip it).

* nntrainer/tensor/blas_interface.cpp
    * __scopy_fallback is relocated, body unchanged, from just above sscal_raw
      to just after the "#endif" that precedes sscal.
    * scopy (float overload) calls the NEON / AVX custom_scopy only when
      incX == 1 && incY == 1; every other stride combination now goes through
      __scopy_fallback. The explanatory comment block is re-indented only.
* nntrainer/tensor/blas_neon.cpp
    * One "#include" line is removed.
    * In custom_scopy's inline asm the two pointer operands sit in the input
      operand list; their constraints change from "+r" to "r".
    * A bare "#elif" (no condition) becomes "#else".

NOTE(review): this copy of the patch had lost its line structure; it is
restored below so that each hunk's line counts match its "@@" header. Leading
whitespace of the code lines was reconstructed from the file's 2-space
formatting -- if context fails to match, try "git apply --ignore-whitespace".
NOTE(review): the header names on the "#include" lines of the blas_neon.cpp
hunk are missing from this copy and must be restored from the repository
before that hunk can apply.

diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 166efcb0f9..0847890b49 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -597,15 +597,6 @@ static float sdot_raw(const unsigned int N, const float *X,
   return ret;
 }
 
-static void __scopy_fallback(const unsigned int N, const float *X,
-                             const int incX, float *Y, const int incY) {
-  unsigned int incy = abs(incY);
-  unsigned int incx = abs(incX);
-
-  for (unsigned int i = 0; i < N; ++i)
-    Y[i * incy] = X[i * incx];
-}
-
 static void sscal_raw(const unsigned int N, const float alpha, float *X,
                       const int incX) {
   unsigned int incx = abs(incX);
@@ -668,6 +659,15 @@ static unsigned int isamax_raw(const unsigned int N, const float *X,
 
 #endif
 
+static void __scopy_fallback(const unsigned int N, const float *X,
+                             const int incX, float *Y, const int incY) {
+  unsigned int incy = abs(incY);
+  unsigned int incx = abs(incX);
+
+  for (unsigned int i = 0; i < N; ++i)
+    Y[i * incy] = X[i * incx];
+}
+
 void sscal(const unsigned int N, const float alpha, void *X, const int incX,
            ml::train::TensorDim::DataType d_type) {
 
@@ -870,20 +870,24 @@ void scopy(const unsigned int N, const void *X, const int incX, void *Y,
 
 void scopy(const unsigned int N, const float *X, const int incX, float *Y,
            const int incY) {
-/**
- * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy
- * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS
- * openblas_set_num_threads(BLAS_NUM_THREADS);
- * #endif
- * cblas_scopy(N, X, incX, Y, incY);
- */
+  /**
+   * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy
+   * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS
+   * openblas_set_num_threads(BLAS_NUM_THREADS);
+   * #endif
+   * cblas_scopy(N, X, incX, Y, incY);
+   */
+  if (incX == 1 && incY == 1) {
 #ifdef USE_NEON
-  nntrainer::neon::custom_scopy(N, X, incX, Y, incY);
+    nntrainer::neon::custom_scopy(N, X, incX, Y, incY);
 #elif USE_AVX
-  nntrainer::avx::custom_scopy(N, X, incX, Y, incY);
+    nntrainer::avx::custom_scopy(N, X, incX, Y, incY);
 #else
-  __scopy_fallback(N, X, incX, Y, incY);
+    __scopy_fallback(N, X, incX, Y, incY);
 #endif
+  } else {
+    __scopy_fallback(N, X, incX, Y, incY);
+  }
 }
 
 void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index ebfc687b99..0494e96d6b 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -15,7 +15,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 
@@ -600,9 +599,9 @@ void custom_scopy(const unsigned int N, const float *X, const int incX,
     __asm__ __volatile__("ld1 {v0.4s}, [%1]\n\t"
                          "st1 {v0.4s}, [%0]\n\t"
                          :
-                         : "+r"(&Y[i]), "+r"(&X[i])
+                         : "r"(&Y[i]), "r"(&X[i])
                          : "v0", "memory");
-#elif
+#else
     __scopy_kernel(N, X + i, Y + i);
 #endif
   }