Skip to content

Commit

Permalink
[ BLAS ] Consider incremental index on scopy
Browse files Browse the repository at this point in the history
- The current internal SIMD implementation does not support incremental indices.
- Let __scopy_fallback handle this case.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed Nov 11, 2024
1 parent a62bcfc commit 5dc111c
Showing 1 changed file with 23 additions and 19 deletions.
42 changes: 23 additions & 19 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -597,15 +597,6 @@ static float sdot_raw(const unsigned int N, const float *X,
return ret;
}

/**
 * @brief Scalar strided copy: Y[i * |incY|] = X[i * |incX|] for i in [0, N).
 *
 * @param[in] N number of elements to copy
 * @param[in] X source buffer
 * @param[in] incX element stride of X; only its magnitude is used
 * @param[out] Y destination buffer
 * @param[in] incY element stride of Y; only its magnitude is used
 *
 * @note The sign of the increments is discarded, so a negative increment
 * walks the buffer forward from index 0 rather than in reverse.
 */
static void __scopy_fallback(const unsigned int N, const float *X,
                             const int incX, float *Y, const int incY) {
  // Widen the strides before multiplying: with 32-bit unsigned operands the
  // product i * stride wraps around once it exceeds UINT_MAX.
  const size_t stride_y = static_cast<size_t>(abs(incY));
  const size_t stride_x = static_cast<size_t>(abs(incX));

  for (size_t i = 0; i < N; ++i)
    Y[i * stride_y] = X[i * stride_x];
}

static void sscal_raw(const unsigned int N, const float alpha, float *X,
const int incX) {
unsigned int incx = abs(incX);
Expand Down Expand Up @@ -668,6 +659,15 @@ static unsigned int isamax_raw(const unsigned int N, const float *X,

#endif

/**
 * @brief Plain-loop strided copy of N floats from X into Y.
 *
 * Element i of the copy is read from X[i * |incX|] and written to
 * Y[i * |incY|].
 *
 * @param[in] N number of elements to copy
 * @param[in] X source buffer
 * @param[in] incX element stride of X; only its magnitude is used
 * @param[out] Y destination buffer
 * @param[in] incY element stride of Y; only its magnitude is used
 *
 * @note The sign of the increments is dropped via abs(), so negative
 * increments are traversed forward from index 0, not in reverse.
 */
static void __scopy_fallback(const unsigned int N, const float *X,
                             const int incX, float *Y, const int incY) {
  // Index arithmetic is done in size_t so that i * stride cannot wrap
  // around in 32-bit unsigned arithmetic for large N or large strides.
  const size_t step_y = static_cast<size_t>(abs(incY));
  const size_t step_x = static_cast<size_t>(abs(incX));

  for (size_t i = 0; i < N; ++i)
    Y[i * step_y] = X[i * step_x];
}

void sscal(const unsigned int N, const float alpha, void *X, const int incX,
ml::train::TensorDim::DataType d_type) {

Expand Down Expand Up @@ -870,20 +870,24 @@ void scopy(const unsigned int N, const void *X, const int incX, void *Y,

/**
 * @brief Copy N floats from X to Y using the given element strides.
 *
 * Unit-stride requests (incX == 1 and incY == 1) are dispatched to the NEON
 * or AVX kernel when the build enables one; otherwise, and for every
 * non-unit stride, the copy is done by __scopy_fallback.
 *
 * @param[in] N number of elements to copy
 * @param[in] X source buffer
 * @param[in] incX element stride of X
 * @param[out] Y destination buffer
 * @param[in] incY element stride of Y
 */
void scopy(const unsigned int N, const float *X, const int incX, float *Y,
           const int incY) {
  /**
   * @note Using 'cblas_scopy' showed some SIGSEGV, so a custom scopy is used
   * temporarily instead of:
   * #ifdef USE_BLAS #ifdef BLAS_NUM_THREADS
   * openblas_set_num_threads(BLAS_NUM_THREADS);
   * #endif
   * cblas_scopy(N, X, incX, Y, incY);
   */
  if (incX == 1 && incY == 1) {
    // Contiguous case: exactly one copy call per build configuration.
#ifdef USE_NEON
    nntrainer::neon::custom_scopy(N, X, incX, Y, incY);
#elif USE_AVX
    nntrainer::avx::custom_scopy(N, X, incX, Y, incY);
#else
    __scopy_fallback(N, X, incX, Y, incY);
#endif
  } else {
    // Strided case: routed to the scalar loop rather than the SIMD kernels.
    __scopy_fallback(N, X, incX, Y, incY);
  }
}

void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
Expand Down

0 comments on commit 5dc111c

Please sign in to comment.