From 59b7c2e3a7b55fca4a544846859ce31ee4a57f6d Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Wed, 8 May 2024 19:04:18 +0900
Subject: [PATCH] [ Tensor ] add is_NaN check in Tensor

This PR adds a hasNaN function that checks whether a tensor contains
NaN values. The check is needed to detect NaN during mixed precision
training.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
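A minimal usage sketch of the new check in a mixed-precision update step
(the surrounding loop, `grads`, and the loss-scale back-off policy are
illustrative only, not part of this patch):

    // Skip the weight update and shrink the loss scale when any gradient
    // tensor picked up a NaN during the scaled backward pass.
    bool skip = false;
    for (nntrainer::Tensor &g : grads) {
      if (g.hasNaN()) { // new API added by this patch
        skip = true;
        break;
      }
    }
    if (skip)
      loss_scale *= 0.5f; // assumed back-off policy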
 meson.build                         | 17 +++--
 nntrainer/tensor/blas_avx.cpp       | 98 +++++++++++++++++++++++++++++
 nntrainer/tensor/blas_avx.h         | 20 ++++++
 nntrainer/tensor/blas_interface.cpp | 38 +++++++++++
 nntrainer/tensor/blas_interface.h   | 11 +++
 nntrainer/tensor/blas_neon.cpp      | 40 ++++++++++++
 nntrainer/tensor/blas_neon.h        | 18 ++++++
 nntrainer/tensor/meson.build        |  9 ++-
 nntrainer/tensor/tensor.cpp         | 12 ++++
 nntrainer/tensor/tensor.h           |  6 ++
 packaging/nntrainer.spec            | 13 +++-
 test/unittest/models/meson.build    | 23 ++++++-
 12 files changed, 294 insertions(+), 11 deletions(-)

diff --git a/meson.build b/meson.build
index d4aea330a4..7ae692e6d9 100644
--- a/meson.build
+++ b/meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
   '-Wno-error=varargs'
 ]
 
+arch = host_machine.cpu_family()
+
+if get_option('enable-avx')
+  extra_defines += '-DUSE_AVX=1'
+  if get_option('platform') == 'tizen'
+    add_project_arguments(['-mavx2'], language: ['c','cpp'])
+  else
+    add_project_arguments(['-march=native'], language: ['c','cpp'])
+  endif
+  message('AVX flags added for hardware acceleration.')
+endif
 
 if get_option('enable-fp16')
-  arch = host_machine.cpu_family()
   if get_option('platform') == 'android'
     add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
     extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
     if cc.version().version_compare('>=12.1.0')
       message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
-      if get_option('enable-avx')
-        extra_defines += '-DUSE_AVX=1'
-        add_project_arguments(['-march=native'], language: ['c','cpp'])
-        message('-march=native added for AVX hardware acceleration.')
-      endif
     else
       warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp
index ce59583d6f..2fd4908463 100644
--- a/nntrainer/tensor/blas_avx.cpp
+++ b/nntrainer/tensor/blas_avx.cpp
@@ -20,6 +20,7 @@
 
 namespace nntrainer::avx {
 
+#ifdef ENABLE_FP16
 void vcvt_f16_f32(size_t N, const void *input, float *output) {
   assert(N != 0);
   assert(input != NULL);
@@ -114,4 +115,101 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
   }
 }
 
+bool hasNaN(const size_t N, const _Float16 *input) {
+  assert(N != 0);
+  assert(input != NULL);
+
+  int temp = 0;
+  size_t idx = 0;
+
+  // check 16 fp16 values at a time, converted to fp32 : ( X != X )
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+    const __m256 vec1 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8)));
+
+    input += 16;
+
+    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+
+    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res1);
+
+    if (temp)
+      return true;
+  }
+
+  // check 8 fp16 values at a time, converted to fp32 : ( X != X )
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+    input += 8;
+    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+  }
+
+  // remainder check : ( X != X )
+  while (idx < N) {
+    if (*input != *input) {
+      return true;
+    }
+    ++input;
+    ++idx;
+  }
+
+  return false;
+}
+#endif
+
+bool hasNaN(const size_t N, const float *input) {
+  assert(N != 0);
+  assert(input != NULL);
+
+  int temp = 0;
+  size_t idx = 0;
+
+  // check 16 single-precision values at a time : ( X != X )
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_loadu_ps(input);
+    const __m256 vec1 = _mm256_loadu_ps(input + 8);
+    input += 16;
+    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res1);
+
+    if (temp)
+      return true;
+  }
+
+  // check 8 single-precision values at a time : ( X != X )
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_loadu_ps(input);
+    input += 8;
+    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+  }
+
+  // remainder check : ( X != X )
+  while (idx < N) {
+    if (*input != *input) {
+      return true;
+    }
+    ++input;
+    ++idx;
+  }
+
+  return false;
+}
+
 } // namespace nntrainer::avx
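Both AVX paths rely on the same IEEE-754 property: NaN is the only value
that compares unequal to itself, and the unordered predicate _CMP_NEQ_UQ
is true exactly for NaN lanes. A standalone sketch of the idiom (compile
with -mavx; not part of the patch):

    #include <immintrin.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      float v[8] = {0.f, 1.f, 2.f, std::nanf(""), 4.f, 5.f, 6.f, 7.f};
      __m256 x = _mm256_loadu_ps(v);
      __m256 neq = _mm256_cmp_ps(x, x, _CMP_NEQ_UQ); // all-ones in NaN lanes
      int mask = _mm256_movemask_ps(neq);            // bit i set => lane i is NaN
      printf("NaN lane mask: 0x%x\n", mask);         // prints 0x8 here
      return 0;
    }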
diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h
index ab1270a208..d25ded103f 100644
--- a/nntrainer/tensor/blas_avx.h
+++ b/nntrainer/tensor/blas_avx.h
@@ -20,6 +20,7 @@
 
 namespace nntrainer::avx {
 
+#ifdef ENABLE_FP16
 /**
  * @brief Converts half-precision floating point values to single-precision
  * floating point values.
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output);
  */
 void vcvt_f32_f16(size_t N, const float *input, void *output);
 
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X half-precision * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const _Float16 *X);
+#endif
+
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const float *X);
+
 } // namespace nntrainer::avx
 
 #endif /* __cplusplus */
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 9be6fb9911..78cb708e53 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -1038,6 +1038,16 @@ static void ele_div_fallback(const unsigned int N, const float *X,
   }
 }
 
+static bool has_nan_fallback(const size_t N, const float *X) {
+  for (size_t i = 0; i < N; ++i) {
+    if (*X != *X)
+      return true;
+    ++X;
+  }
+
+  return false;
+}
+
 void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
              float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
@@ -1090,4 +1100,32 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
     ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride);
 }
 
+bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
+             const void *X) {
+  if (d_type == ml::train::TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+    const _FP16 *vec = (const _FP16 *)X;
+#ifdef USE_NEON
+    return nntrainer::neon::hasNaN(N, vec);
+#elif defined(USE_AVX)
+    return nntrainer::avx::hasNaN(N, vec);
+#else
+    throw std::invalid_argument("Error: no SIMD backend for FP16 has_nan");
+#endif
+#else
+    throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
+  } else if (d_type == ml::train::TensorDim::DataType::FP32) {
+    const float *vec = (const float *)X;
+#ifdef USE_NEON
+    return nntrainer::neon::hasNaN(N, vec);
+#elif defined(USE_AVX)
+    return nntrainer::avx::hasNaN(N, vec);
+#endif
+
+    return has_nan_fallback(N, vec);
+  }
+  return false;
+}
+
 } // namespace nntrainer
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 04a8a23018..bcd557111e 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -478,6 +478,17 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
 void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
              float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
              unsigned int o_stride = 1);
+
+/**
+ * @brief  check if array X contains a NaN value
+ * @param[in] N length of the vector
+ * @param[in] d_type data type of the vector (FP16 or FP32)
+ * @param[in] X float/fp16 * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
+             const void *X);
+
 } /* namespace nntrainer */
 #endif /* __cplusplus */
 #endif /* __BLAS_INTERFACE_H__ */
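A sketch of how the dtype-tagged entry point is expected to be called from
tensor code (the buffer and values here are illustrative):

    #include <cmath>

    float buf[3] = {1.f, std::nanf(""), 2.f};
    bool found = nntrainer::has_nan(3, ml::train::TensorDim::DataType::FP32,
                                    static_cast<const void *>(buf));
    // found == true; an _FP16 buffer would be passed with DataType::FP16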
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3609b6b8b5..ce6deba3ad 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -546,6 +546,26 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
   }
 }
 
+bool hasNaN(const size_t N, const float *X) {
+  size_t i = 0;
+  for (; N - i >= 4; i += 4) {
+    float32x4_t vec = vld1q_f32(&X[i]);
+    // vceqq_f32 sets a lane to all-ones when it equals itself; a NaN lane
+    // fails the self-compare and stays zero, so a zero minimum flags NaN
+    uint32x4_t vcmp = vceqq_f32(vec, vec);
+    if (vminvq_u32(vcmp) == 0)
+      return true;
+  }
+
+  while (i < N) {
+    if (X[i] != X[i])
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
 #ifdef ENABLE_FP16
 
 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N,
@@ -1994,5 +2014,25 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) {
   }
 }
 
+bool hasNaN(const size_t N, const __fp16 *input) {
+  size_t i = 0;
+  for (; N - i >= 8; i += 8) {
+    float16x8_t vec = vld1q_f16(&input[i]);
+    // as in the fp32 path, a NaN lane fails the self-equality compare and
+    // stays zero, so a zero minimum across lanes flags a NaN
+    uint16x8_t vcmp = vceqq_f16(vec, vec);
+    if (vminvq_u16(vcmp) == 0)
+      return true;
+  }
+
+  while (i < N) {
+    if (input[i] != input[i])
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
 #endif
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index db1b6a5ccc..6da5e952e1 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
 void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
              float alpha = 1.f, float beta = 0.f);
 
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const float *X);
+
 #ifdef ENABLE_FP16
 /**
  * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y
@@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M,
  * @param X __fp16 * for Vector X
  */
 void inv_sqrt_inplace(const unsigned int N, __fp16 *X);
+
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X __fp16 * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const __fp16 *X);
 #endif
 
 } // namespace nntrainer::neon
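The NEON kernels express the same self-comparison with lane-wise equality;
a standalone AArch64 sketch of the idiom (not part of the patch):

    #include <arm_neon.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      float v[4] = {0.f, std::nanf(""), 2.f, 3.f};
      float32x4_t x = vld1q_f32(v);
      uint32x4_t eq = vceqq_f32(x, x);     // all-ones where lane == lane
      int has_nan = (vminvq_u32(eq) == 0); // any zero lane => NaN present
      printf("has NaN: %d\n", has_nan);    // prints 1
      return 0;
    }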
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 0884dbd3b4..b14fa0ee85 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -44,6 +44,12 @@ cl_headers = [
 
 arch = host_machine.cpu_family()
 
+if get_option('enable-avx')
+  tensor_sources += 'blas_avx.cpp'
+  tensor_headers += 'blas_avx.h'
+endif
+
 if get_option('enable-fp16')
   if arch == 'arm'
     error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
@@ -55,9 +61,6 @@ if get_option('enable-fp16')
       nntrainer_inc += include_directories('hgemm')
       nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
     endif
-  elif get_option('enable-avx')
-    tensor_sources += 'blas_avx.cpp'
-    tensor_headers += 'blas_avx.h'
   endif
 endif
 
diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index b14bbd7ae4..f9db2e2ab0 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -3820,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const {
   return;
 }
 
+bool Tensor::hasNaN() const {
+  if (getDataType() == Tdatatype::FP16) {
+#ifdef ENABLE_FP16
+    return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>());
+#else
+    throw std::invalid_argument("enable-fp16 is not set");
+#endif
+  } else {
+    return has_nan(dim.getDataLen(), Tdatatype::FP32, getData());
+  }
+}
+
 // namespace nntrainer
 
 } /* namespace nntrainer */
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 2ea0393e66..968ec4d502 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -2038,6 +2038,12 @@ class Tensor {
 
   static constexpr float epsilon = 1e-5;
 
+  /**
+   * @brief  check if the tensor contains a NaN element
+   * @return true if there is a NaN element, false otherwise
+   */
+  bool hasNaN() const;
+
 private:
   /**< handle the data as a std::shared_ptr type */
   TensorDim dim;
diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec
index 36ba371d22..2f1dc57f68 100644
--- a/packaging/nntrainer.spec
+++ b/packaging/nntrainer.spec
@@ -65,6 +65,13 @@
 %define neon_support -Denable-neon=false
 %endif # arch aarch64
 
+%ifarch x86_64
+%define enable_avx 1
+%define avx_support -Denable-avx=true
+%else
+%define avx_support -Denable-avx=false
+%endif # arch x86_64
+
 Name:		nntrainer
 Summary:	Software framework for training neural networks
 
@@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \
       %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \
       -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \
       -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \
-      %{fp16_support} %{neon_support} build
+      %{fp16_support} %{neon_support} %{avx_support} build
 
 ninja -C build %{?_smp_mflags}
 
@@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
 %{_includedir}/nntrainer/util_simd_neon.h
 %endif
 
+%if 0%{?enable_avx}
+%{_includedir}/nntrainer/blas_avx.h
+%endif
+
 %files devel-static
 %{_libdir}/libnntrainer*.a
 %exclude %{_libdir}/libcapi*.a
diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build
index 4a6e81e65d..3f17369f94 100644
--- a/test/unittest/models/meson.build
+++ b/test/unittest/models/meson.build
@@ -1,4 +1,5 @@
 test_name = 'unittest_models'
+mixed_test_name = 'unittest_mixed_models'
 
 test_target = []
 
@@ -11,8 +12,28 @@ models_targets = [
   # disable temperally
 ]
 
+mixed_test_targets = [
+  'models_test_utils.cpp',
+  'models_golden_test.cpp',
+  'unittest_models_mixed_precision.cpp',
+]
+
 if get_option('enable-fp16')
-  models_targets += 'unittest_models_mixed_precision.cpp'
+  mixed_exe = executable(
+    mixed_test_name,
+    mixed_test_targets,
+    include_directories: include_directories('.'),
+    dependencies: [
+      nntrainer_test_main_deps, nntrainer_ccapi_dep
+    ],
+    install: get_option('enable-test'),
+    install_dir: application_install_dir
+  )
+
+  test(mixed_test_name, mixed_exe,
+       args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name),
+       timeout: test_timeout
+  )
 endif
 
 test_target += models_targets