[ BLAS ] Consider incremental index on scopy
- Current internal SIMD implementation does not support incremental indices.
- Let __scopy_fallback handle this issue (see the stride sketch below).

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
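
For context on what the commit addresses: "incremental indices" are the BLAS-style strides incX/incY, and the SIMD kernels are only used when both are 1 (contiguous data); any other stride now goes through the plain loop in __scopy_fallback. The following is a minimal, self-contained sketch of that strided-copy behaviour. It mirrors the fallback loop from the diff but is not the library code itself; the helper name scopy_strided is made up for illustration.

#include <cstdlib>
#include <iostream>
#include <vector>

// Strided copy in the spirit of __scopy_fallback: element i of the copy is
// read from X[i * |incX|] and written to Y[i * |incY|].
static void scopy_strided(unsigned int N, const float *X, int incX, float *Y,
                          int incY) {
  unsigned int incx = std::abs(incX);
  unsigned int incy = std::abs(incY);
  for (unsigned int i = 0; i < N; ++i)
    Y[i * incy] = X[i * incx];
}

int main() {
  // Copy every second element of X (incX = 2) into a packed Y (incY = 1).
  std::vector<float> X = {0.f, 10.f, 1.f, 11.f, 2.f, 12.f, 3.f, 13.f};
  std::vector<float> Y(4, 0.f);

  scopy_strided(4, X.data(), /*incX=*/2, Y.data(), /*incY=*/1);

  for (float v : Y)
    std::cout << v << ' '; // prints: 0 1 2 3
  std::cout << '\n';
  return 0;
}

A SIMD path such as the NEON ld1/st1 pair in blas_neon.cpp assumes the four floats it loads are adjacent in memory, which is why the contiguous check incX == 1 && incY == 1 gates it.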
skykongkong8 authored and jijoongmoon committed Nov 12, 2024
1 parent 3e37c11 commit d7838fd
Showing 2 changed files with 25 additions and 22 deletions.
42 changes: 23 additions & 19 deletions nntrainer/tensor/blas_interface.cpp
@@ -597,15 +597,6 @@ static float sdot_raw(const unsigned int N, const float *X,
   return ret;
 }
 
-static void __scopy_fallback(const unsigned int N, const float *X,
-                             const int incX, float *Y, const int incY) {
-  unsigned int incy = abs(incY);
-  unsigned int incx = abs(incX);
-
-  for (unsigned int i = 0; i < N; ++i)
-    Y[i * incy] = X[i * incx];
-}
-
 static void sscal_raw(const unsigned int N, const float alpha, float *X,
                       const int incX) {
   unsigned int incx = abs(incX);
@@ -668,6 +659,15 @@ static unsigned int isamax_raw(const unsigned int N, const float *X,
 
 #endif
 
+static void __scopy_fallback(const unsigned int N, const float *X,
+                             const int incX, float *Y, const int incY) {
+  unsigned int incy = abs(incY);
+  unsigned int incx = abs(incX);
+
+  for (unsigned int i = 0; i < N; ++i)
+    Y[i * incy] = X[i * incx];
+}
+
 void sscal(const unsigned int N, const float alpha, void *X, const int incX,
            ml::train::TensorDim::DataType d_type) {
 
@@ -870,20 +870,24 @@ void scopy(const unsigned int N, const void *X, const int incX, void *Y,
 
 void scopy(const unsigned int N, const float *X, const int incX, float *Y,
            const int incY) {
-  /**
-   * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy
-   * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS
-   * openblas_set_num_threads(BLAS_NUM_THREADS);
-   * #endif
-   * cblas_scopy(N, X, incX, Y, incY);
-   */
+  /**
+   * @note Using 'cblas_scopy' shown some SIGSEGV, temporally use custom-scopy
+   * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS
+   * openblas_set_num_threads(BLAS_NUM_THREADS);
+   * #endif
+   * cblas_scopy(N, X, incX, Y, incY);
+   */
+  if (incX == 1 && incY == 1) {
 #ifdef USE_NEON
-  nntrainer::neon::custom_scopy(N, X, incX, Y, incY);
+    nntrainer::neon::custom_scopy(N, X, incX, Y, incY);
 #elif USE_AVX
-  nntrainer::avx::custom_scopy(N, X, incX, Y, incY);
+    nntrainer::avx::custom_scopy(N, X, incX, Y, incY);
 #else
-  __scopy_fallback(N, X, incX, Y, incY);
+    __scopy_fallback(N, X, incX, Y, incY);
 #endif
+  } else {
+    __scopy_fallback(N, X, incX, Y, incY);
+  }
 }
 
 void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
5 changes: 2 additions & 3 deletions nntrainer/tensor/blas_neon.cpp
@@ -15,7 +15,6 @@
 #include <blas_neon.h>
 #include <blas_neon_setting.h>
 #include <hgemm.h>
-#include <iostream>
 #include <memory>
 #include <nntrainer_error.h>

@@ -600,9 +599,9 @@ void custom_scopy(const unsigned int N, const float *X, const int incX,
     __asm__ __volatile__("ld1 {v0.4s}, [%1]\n\t"
                          "st1 {v0.4s}, [%0]\n\t"
                          :
-                         : "+r"(&Y[i]), "+r"(&X[i])
+                         : "r"(&Y[i]), "r"(&X[i])
                          : "v0", "memory");
-#elif
+#else
     __scopy_kernel(N, X + i, Y + i);
 #endif
   }
