[ Tensor ] add is_NaN check in Tensor
This PR adds an is_NaN function to check whether a tensor contains NaN
values. It is needed to detect NaN during mixed-precision training.
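
For context, a minimal sketch of how such a check might be used in a
mixed-precision training step (the loss-scaling logic and the names
`grads`, `loss_scale`, and `apply_or_skip` are illustrative assumptions,
not part of this PR):

```cpp
#include <vector>

#include <tensor.h> // nntrainer::Tensor (include path may differ)

// Hypothetical guard: skip the optimizer update and shrink the loss
// scale whenever an fp16 backward pass produced a NaN gradient.
void apply_or_skip(std::vector<nntrainer::Tensor> &grads, float &loss_scale) {
  for (nntrainer::Tensor &g : grads) {
    if (g.hasNaN()) {     // the check added by this commit
      loss_scale /= 2.0f; // overflow at this scale: shrink and retry
      return;             // skip this update step
    }
  }
  // ... apply the optimizer update at the current loss scale ...
}
```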

**Self evaluation:**
1. Build test:	 [X]Passed [ ]Failed [ ]Skipped
2. Run test:	 [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon <[email protected]>
jijoongmoon committed May 10, 2024
1 parent 0593c27 commit 59b7c2e
Showing 12 changed files with 288 additions and 11 deletions.
17 changes: 11 additions & 6 deletions meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
'-Wno-error=varargs'
]

arch = host_machine.cpu_family()

if get_option('enable-avx')
  extra_defines += '-DUSE_AVX=1'
  if get_option('platform') == 'tizen'
    add_project_arguments(['-mavx2'], language: ['c','cpp'])
  else
    add_project_arguments(['-march=native'], language: ['c','cpp'])
  endif
  message('AVX compile flags added for hardware acceleration.')
endif

if get_option('enable-fp16')
arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
if get_option('enable-avx')
extra_defines += '-DUSE_AVX=1'
add_project_arguments(['-march=native'], language: ['c','cpp'])
message('-march=native added for AVX hardware acceleration.')
endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
96 changes: 96 additions & 0 deletions nntrainer/tensor/blas_avx.cpp
@@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
void vcvt_f16_f32(size_t N, const void *input, float *output) {
assert(N != 0);
assert(input != NULL);
@@ -114,4 +115,99 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
}
}

bool hasNaN(const size_t N, const _Float16 *input) {
  assert(N != 0);
  assert(input != NULL);

  int temp = 0;
  size_t idx = 0;

  // 16 half-precision values per iteration, converted to fp32 : ( X != X )
  for (; N - idx >= 16; idx += 16) {
    const __m256 vec0 =
      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
    const __m256 vec1 =
      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8)));

    input += 16;

    // NEQ_UQ is true for unequal or unordered operands, so every NaN
    // lane (which never equals itself) sets a bit in the movemask.
    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res);

    if (temp)
      return true;

    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res1);

    if (temp)
      return true;
  }

  // 8 half-precision check : ( X != X )
  for (; N - idx >= 8; idx += 8) {
    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
    input += 8;
    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res);

    if (temp)
      return true;
  }

  // scalar check for the remaining elements : ( X != X )
  while (idx < N) {
    if (*input != *input) {
      return true;
    }
    ++input;
    ++idx;
  }

  return false;
}
#endif

bool hasNaN(const size_t N, const float *input) {
  assert(N != 0);
  assert(input != NULL);

  int temp = 0;
  size_t idx = 0;

  // 16 single-precision check : ( X != X )
  for (; N - idx >= 16; idx += 16) {
    const __m256 vec0 = _mm256_loadu_ps(input);
    const __m256 vec1 = _mm256_loadu_ps(input + 8);
    input += 16;
    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res);
    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res1);

    if (temp)
      return true;
  }

  // 8 single-precision check : ( X != X )
  for (; N - idx >= 8; idx += 8) {
    const __m256 vec = _mm256_loadu_ps(input);
    input += 8;
    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
    temp = temp | _mm256_movemask_ps(res);

    if (temp)
      return true;
  }

  // scalar check for the remaining elements : ( X != X )
  while (idx < N) {
    if (*input != *input) {
      return true;
    }
    ++input;
    ++idx;
  }

  return false;
}

} // namespace nntrainer::avx
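
The kernels above all rely on one IEEE-754 property: NaN is the only value
that compares unequal to itself, and any comparison involving NaN is
"unordered" (which is exactly what the `_CMP_NEQ_UQ` predicate reports as
true). A standalone scalar demonstration of the same test (not part of the
diff):

```cpp
#include <cassert>
#include <cmath>
#include <limits>

int main() {
  float ok = 1.0f;
  float bad = std::numeric_limits<float>::quiet_NaN();
  assert(!(ok != ok)); // a normal value equals itself
  assert(bad != bad);  // NaN fails self-equality: this is the hasNaN test
  assert(std::isnan(bad));
  // Caveat: -ffast-math lets the compiler assume no NaNs exist and may
  // optimize the (x != x) test away, so such kernels must not be built
  // with it.
  return 0;
}
```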
20 changes: 20 additions & 0 deletions nntrainer/tensor/blas_avx.h
@@ -20,6 +20,7 @@

namespace nntrainer::avx {

#ifdef ENABLE_FP16
/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output);
*/
void vcvt_f32_f16(size_t N, const float *input, void *output);

/**
 * @brief check if the vector X contains a NaN value
 * @note NaN is detected via the self-comparison (x != x)
 * @param[in] N length of the vector
 * @param[in] X half-precision * for Vector X
 * @return true if X contains NaN
 */
bool hasNaN(const size_t N, const _Float16 *X);
#endif

/**
 * @brief check if the vector X contains a NaN value
 * @note NaN is detected via the self-comparison (x != x)
 * @param[in] N length of the vector
 * @param[in] X float * for Vector X
 * @return true if X contains NaN
 */
bool hasNaN(const size_t N, const float *X);

} // namespace nntrainer::avx

#endif /* __cplusplus */
36 changes: 36 additions & 0 deletions nntrainer/tensor/blas_interface.cpp
@@ -1038,6 +1038,16 @@ static void ele_div_fallback(const unsigned int N, const float *X,
}
}

static bool has_nan_fallback(const size_t N, const float *X) {
for (size_t i = 0; i < N; ++i) {
if (*X != *X)
return true;
++X;
}

return false;
}

void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta, unsigned int i_stride,
unsigned int o_stride) {
@@ -1090,4 +1100,30 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}

bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
             const void *X) {
  if (d_type == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
    const _FP16 *vec = (const _FP16 *)X;
#ifdef USE_NEON
    return nntrainer::neon::hasNaN(N, vec);
#elif defined(USE_AVX)
    return nntrainer::avx::hasNaN(N, vec);
#else
    throw std::invalid_argument(
      "Error: no SIMD backend available for the FP16 NaN check");
#endif
#else
    throw std::invalid_argument("Error: enable-fp16 is not enabled");
#endif
  } else if (d_type == ml::train::TensorDim::DataType::FP32) {
    const float *vec = (const float *)X;
#ifdef USE_NEON
    return nntrainer::neon::hasNaN(N, vec);
#elif defined(USE_AVX)
    return nntrainer::avx::hasNaN(N, vec);
#endif

    return has_nan_fallback(N, vec);
  }
  return false;
}

} // namespace nntrainer
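
A small sketch of calling the new dispatch entry point directly (the
buffer contents are illustrative; the include path is assumed):

```cpp
#include <cmath>
#include <vector>

#include <blas_interface.h> // nntrainer::has_nan (include path may differ)

bool example() {
  std::vector<float> buf = {1.0f, 2.0f, std::nanf("")};
  // has_nan picks the NEON/AVX kernel or the scalar fallback at
  // compile time, based on USE_NEON / USE_AVX.
  return nntrainer::has_nan(buf.size(),
                            ml::train::TensorDim::DataType::FP32, buf.data());
}
```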
10 changes: 10 additions & 0 deletions nntrainer/tensor/blas_interface.h
@@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
unsigned int o_stride = 1);

/**
 * @brief check if vector X contains a NaN value
 * @param[in] N length of the vector
 * @param[in] d_type data type of the elements in X (FP16 or FP32)
 * @param[in] X float/fp16 * for Vector X
 * @return true if X contains NaN, false otherwise
 */
bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
             const void *X);

} /* namespace nntrainer */
#endif /* __cplusplus */
#endif /* __BLAS_INTERFACE_H__ */
39 changes: 39 additions & 0 deletions nntrainer/tensor/blas_neon.cpp
@@ -546,6 +546,25 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
}
}

bool hasNaN(const size_t N, const float *X) {
  size_t i = 0;
  for (; N - i >= 4; i += 4) {
    float32x4_t vec = vld1q_f32(&X[i]);
    // vceqq_f32 sets a lane to all-ones when the value equals itself;
    // a NaN lane fails self-equality and stays zero. AArch64 NEON has no
    // movemask, so reduce with vminvq: any zero lane means a NaN.
    uint32x4_t vcmp = vceqq_f32(vec, vec);
    if (vminvq_u32(vcmp) == 0)
      return true;
  }

  while (i < N) {
    if (X[i] != X[i])
      return true;
    ++i;
  }

  return false;
}

#ifdef ENABLE_FP16

void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N,
@@ -1994,5 +2013,25 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) {
}
}

bool hasNaN(const size_t N, const __fp16 *input) {
  size_t i = 0;
  for (; N - i >= 8; i += 8) {
    float16x8_t vec = vld1q_f16(&input[i]);
    // As in the fp32 version: a NaN lane fails the self-equality check
    // (vceqq_f16 leaves it zero), and vminvq_u16 detects any zero lane.
    uint16x8_t vcmp = vceqq_f16(vec, vec);
    if (vminvq_u16(vcmp) == 0)
      return true;
  }

  while (i < N) {
    if (input[i] != input[i])
      return true;
    ++i;
  }

  return false;
}

#endif
} // namespace nntrainer::neon
18 changes: 18 additions & 0 deletions nntrainer/tensor/blas_neon.h
@@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);

/**
 * @brief check if the vector X contains a NaN value
 * @note NaN is detected via the self-comparison (x != x)
 * @param[in] N length of the vector
 * @param[in] X float * for Vector X
 * @return true if X contains NaN
 */
bool hasNaN(const size_t N, const float *X);

#ifdef ENABLE_FP16
/**
* @brief hgemv computation with neon : Y = alpha*A*X + beta*Y
@@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M,
* @param X __fp16 * for Vector X
*/
void inv_sqrt_inplace(const unsigned int N, __fp16 *X);

/**
 * @brief check if the vector X contains a NaN value
 * @note NaN is detected via the self-comparison (x != x)
 * @param[in] N length of the vector
 * @param[in] X __fp16 * for Vector X
 * @return true if X contains NaN
 */
bool hasNaN(const size_t N, const __fp16 *X);
#endif

} // namespace nntrainer::neon
9 changes: 6 additions & 3 deletions nntrainer/tensor/meson.build
@@ -44,6 +44,12 @@ cl_headers = [


arch = host_machine.cpu_family()

if get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif

if get_option('enable-fp16')
if arch == 'arm'
error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
@@ -55,9 +61,6 @@ if get_option('enable-fp16')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
endif
elif get_option('enable-avx')
tensor_sources += 'blas_avx.cpp'
tensor_headers += 'blas_avx.h'
endif
endif

12 changes: 12 additions & 0 deletions nntrainer/tensor/tensor.cpp
@@ -3820,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const {
return;
}

bool Tensor::hasNaN() const {
if (getDataType() == Tdatatype::FP16) {
#ifdef ENABLE_FP16
return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>());
#else
throw std::invalid_argument("enble-fp16 is not set");
#endif
} else {
return has_nan(dim.getDataLen(), Tdatatype::FP32, getData<float>());
}
}

// namespace nntrainer

} /* namespace nntrainer */
6 changes: 6 additions & 0 deletions nntrainer/tensor/tensor.h
@@ -2038,6 +2038,12 @@

static constexpr float epsilon = 1e-5;

/**
 * @brief check if the tensor contains any NaN element
 * @return true if there is a NaN, false otherwise
 */
bool hasNaN() const;

private:
/**< handle the data as a std::shared_ptr<float> type */
TensorDim dim;