From 59b7c2e3a7b55fca4a544846859ce31ee4a57f6d Mon Sep 17 00:00:00 2001
From: "jijoong.moon"
Date: Wed, 8 May 2024 19:04:18 +0900
Subject: [PATCH] [ Tensor ] add is_NaN check in Tensor

This PR adds a hasNaN function that checks whether a tensor contains
NaN values. The check is needed to detect NaN during mixed precision
training.

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: jijoong.moon
---
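A minimal usage sketch of the new check in a mixed-precision update step
(the surrounding loop, `grads`, and the loss-scale back-off policy are
illustrative only, not part of this patch):

    // Skip the weight update and shrink the loss scale when any gradient
    // tensor picked up a NaN during the scaled backward pass.
    bool skip = false;
    for (nntrainer::Tensor &g : grads) {
      if (g.hasNaN()) { // new API added by this patch
        skip = true;
        break;
      }
    }
    if (skip)
      loss_scale *= 0.5f; // assumed back-off policy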
 meson.build                         | 17 +++--
 nntrainer/tensor/blas_avx.cpp       | 98 +++++++++++++++++++++++++++++
 nntrainer/tensor/blas_avx.h         | 20 ++++++
 nntrainer/tensor/blas_interface.cpp | 38 +++++++++++
 nntrainer/tensor/blas_interface.h   | 11 +++
 nntrainer/tensor/blas_neon.cpp      | 40 ++++++++++++
 nntrainer/tensor/blas_neon.h        | 18 ++++++
 nntrainer/tensor/meson.build        |  9 ++-
 nntrainer/tensor/tensor.cpp         | 12 ++++
 nntrainer/tensor/tensor.h           |  6 ++
 packaging/nntrainer.spec            | 13 +++-
 test/unittest/models/meson.build    | 23 ++++++-
 12 files changed, 294 insertions(+), 11 deletions(-)

diff --git a/meson.build b/meson.build
index d4aea330a4..7ae692e6d9 100644
--- a/meson.build
+++ b/meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
   '-Wno-error=varargs'
 ]
 
+arch = host_machine.cpu_family()
+
+if get_option('enable-avx')
+  extra_defines += '-DUSE_AVX=1'
+  if get_option('platform') == 'tizen'
+    add_project_arguments(['-mavx2'], language: ['c','cpp'])
+  else
+    add_project_arguments(['-march=native'], language: ['c','cpp'])
+  endif
+  message('AVX flags added for hardware acceleration.')
+endif
 
 if get_option('enable-fp16')
-  arch = host_machine.cpu_family()
   if get_option('platform') == 'android'
     add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
     extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
     if cc.version().version_compare('>=12.1.0')
       message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
       extra_defines += '-DENABLE_FP16=1'
-      if get_option('enable-avx')
-        extra_defines += '-DUSE_AVX=1'
-        add_project_arguments(['-march=native'], language: ['c','cpp'])
-        message('-march=native added for AVX hardware acceleration.')
-      endif
     else
       warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
     endif
diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp
index ce59583d6f..2fd4908463 100644
--- a/nntrainer/tensor/blas_avx.cpp
+++ b/nntrainer/tensor/blas_avx.cpp
@@ -20,6 +20,7 @@
 
 namespace nntrainer::avx {
 
+#ifdef ENABLE_FP16
 void vcvt_f16_f32(size_t N, const void *input, float *output) {
   assert(N != 0);
   assert(input != NULL);
@@ -114,4 +115,101 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
   }
 }
 
+bool hasNaN(const size_t N, const _Float16 *input) {
+  assert(N != 0);
+  assert(input != NULL);
+
+  int temp = 0;
+  size_t idx = 0;
+
+  // check 16 fp16 values at a time, converted to fp32 : ( X != X )
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+    const __m256 vec1 =
+      _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8)));
+
+    input += 16;
+
+    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+
+    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res1);
+
+    if (temp)
+      return true;
+  }
+
+  // check 8 fp16 values at a time, converted to fp32 : ( X != X )
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+    input += 8;
+    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+  }
+
+  // remainder check : ( X != X )
+  while (idx < N) {
+    if (*input != *input) {
+      return true;
+    }
+    ++input;
+    ++idx;
+  }
+
+  return false;
+}
+#endif
+
+bool hasNaN(const size_t N, const float *input) {
+  assert(N != 0);
+  assert(input != NULL);
+
+  int temp = 0;
+  size_t idx = 0;
+
+  // check 16 single-precision values at a time : ( X != X )
+  for (; N - idx >= 16; idx += 16) {
+    const __m256 vec0 = _mm256_loadu_ps(input);
+    const __m256 vec1 = _mm256_loadu_ps(input + 8);
+    input += 16;
+    __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+    __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res1);
+
+    if (temp)
+      return true;
+  }
+
+  // check 8 single-precision values at a time : ( X != X )
+  for (; N - idx >= 8; idx += 8) {
+    const __m256 vec = _mm256_loadu_ps(input);
+    input += 8;
+    __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+    temp = temp | _mm256_movemask_ps(res);
+
+    if (temp)
+      return true;
+  }
+
+  // remainder check : ( X != X )
+  while (idx < N) {
+    if (*input != *input) {
+      return true;
+    }
+    ++input;
+    ++idx;
+  }
+
+  return false;
+}
+
 } // namespace nntrainer::avx
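Both AVX paths rely on the same IEEE-754 property: NaN is the only value
that compares unequal to itself, and the unordered predicate _CMP_NEQ_UQ
is true exactly for NaN lanes. A standalone sketch of the idiom (compile
with -mavx; not part of the patch):

    #include <immintrin.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      float v[8] = {0.f, 1.f, 2.f, std::nanf(""), 4.f, 5.f, 6.f, 7.f};
      __m256 x = _mm256_loadu_ps(v);
      __m256 neq = _mm256_cmp_ps(x, x, _CMP_NEQ_UQ); // all-ones in NaN lanes
      int mask = _mm256_movemask_ps(neq);            // bit i set => lane i is NaN
      printf("NaN lane mask: 0x%x\n", mask);         // prints 0x8 here
      return 0;
    }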
diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h
index ab1270a208..d25ded103f 100644
--- a/nntrainer/tensor/blas_avx.h
+++ b/nntrainer/tensor/blas_avx.h
@@ -20,6 +20,7 @@
 
 namespace nntrainer::avx {
 
+#ifdef ENABLE_FP16
 /**
  * @brief Converts half-precision floating point values to single-precision
  * floating point values.
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output);
  */
 void vcvt_f32_f16(size_t N, const float *input, void *output);
 
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X half-precision * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const _Float16 *X);
+#endif
+
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const float *X);
+
 } // namespace nntrainer::avx
 
 #endif /* __cplusplus */
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 9be6fb9911..78cb708e53 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -1038,6 +1038,16 @@ static void ele_div_fallback(const unsigned int N, const float *X,
   }
 }
 
+static bool has_nan_fallback(const size_t N, const float *X) {
+  for (size_t i = 0; i < N; ++i) {
+    if (*X != *X)
+      return true;
+    ++X;
+  }
+
+  return false;
+}
+
 void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
              float alpha, float beta, unsigned int i_stride,
             unsigned int o_stride) {
@@ -1090,4 +1100,32 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
     ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride);
 }
 
+bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
+             const void *X) {
+  if (d_type == ml::train::TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+    const _FP16 *vec = (const _FP16 *)X;
+#ifdef USE_NEON
+    return nntrainer::neon::hasNaN(N, vec);
+#elif defined(USE_AVX)
+    return nntrainer::avx::hasNaN(N, vec);
+#else
+    throw std::invalid_argument("Error: no SIMD backend for FP16 has_nan");
+#endif
+#else
+    throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
+  } else if (d_type == ml::train::TensorDim::DataType::FP32) {
+    const float *vec = (const float *)X;
+#ifdef USE_NEON
+    return nntrainer::neon::hasNaN(N, vec);
+#elif defined(USE_AVX)
+    return nntrainer::avx::hasNaN(N, vec);
+#endif
+
+    return has_nan_fallback(N, vec);
+  }
+  return false;
+}
+
 } // namespace nntrainer
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 04a8a23018..bcd557111e 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -478,6 +478,17 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
 void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
              float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
              unsigned int o_stride = 1);
+
+/**
+ * @brief  check if array X contains a NaN value
+ * @param[in] N length of the vector
+ * @param[in] d_type data type of the vector (FP16 or FP32)
+ * @param[in] X float/fp16 * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool has_nan(const size_t N, ml::train::TensorDim::DataType d_type,
+             const void *X);
+
 } /* namespace nntrainer */
 #endif /* __cplusplus */
 #endif /* __BLAS_INTERFACE_H__ */
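A sketch of how the dtype-tagged entry point is expected to be called from
tensor code (the buffer and values here are illustrative):

    #include <cmath>

    float buf[3] = {1.f, std::nanf(""), 2.f};
    bool found = nntrainer::has_nan(3, ml::train::TensorDim::DataType::FP32,
                                    static_cast<const void *>(buf));
    // found == true; an _FP16 buffer would be passed with DataType::FP16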
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3609b6b8b5..ce6deba3ad 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -546,6 +546,26 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
   }
 }
 
+bool hasNaN(const size_t N, const float *X) {
+  size_t i = 0;
+  for (; N - i >= 4; i += 4) {
+    float32x4_t vec = vld1q_f32(&X[i]);
+    // vceqq_f32 sets a lane to all-ones when it equals itself; a NaN lane
+    // fails the self-compare and stays zero, so a zero minimum flags NaN
+    uint32x4_t vcmp = vceqq_f32(vec, vec);
+    if (vminvq_u32(vcmp) == 0)
+      return true;
+  }
+
+  while (i < N) {
+    if (X[i] != X[i])
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
 #ifdef ENABLE_FP16
 
 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N,
@@ -1994,5 +2014,25 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) {
   }
 }
 
+bool hasNaN(const size_t N, const __fp16 *input) {
+  size_t i = 0;
+  for (; N - i >= 8; i += 8) {
+    float16x8_t vec = vld1q_f16(&input[i]);
+    // as in the fp32 path, a NaN lane fails the self-equality compare and
+    // stays zero, so a zero minimum across lanes flags a NaN
+    uint16x8_t vcmp = vceqq_f16(vec, vec);
+    if (vminvq_u16(vcmp) == 0)
+      return true;
+  }
+
+  while (i < N) {
+    if (input[i] != input[i])
+      return true;
+    ++i;
+  }
+
+  return false;
+}
+
 #endif
 } // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index db1b6a5ccc..6da5e952e1 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
 void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
              float alpha = 1.f, float beta = 0.f);
 
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const float *X);
+
 #ifdef ENABLE_FP16
 /**
  * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y
@@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M,
  * @param X __fp16 * for Vector X
  */
 void inv_sqrt_inplace(const unsigned int N, __fp16 *X);
+
+/**
+ * @brief  check if X contains a NaN value
+ * @note   compares (x != x), which is true only for NaN
+ * @param[in] N length of the vector
+ * @param[in] X __fp16 * for Vector X
+ * @return true if X has a NaN value, false otherwise
+ */
+bool hasNaN(const size_t N, const __fp16 *X);
 #endif
 
 } // namespace nntrainer::neon
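The NEON kernels express the same self-comparison with lane-wise equality;
a standalone AArch64 sketch of the idiom (not part of the patch):

    #include <arm_neon.h>
    #include <cmath>
    #include <cstdio>

    int main() {
      float v[4] = {0.f, std::nanf(""), 2.f, 3.f};
      float32x4_t x = vld1q_f32(v);
      uint32x4_t eq = vceqq_f32(x, x);     // all-ones where lane == lane
      int has_nan = (vminvq_u32(eq) == 0); // any zero lane => NaN present
      printf("has NaN: %d\n", has_nan);    // prints 1
      return 0;
    }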
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 0884dbd3b4..b14fa0ee85 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -44,6 +44,12 @@ cl_headers = [
 
 arch = host_machine.cpu_family()
 
+if get_option('enable-avx')
+  tensor_sources += 'blas_avx.cpp'
+  tensor_headers += 'blas_avx.h'
+endif
+
 if get_option('enable-fp16')
   if arch == 'arm'
     error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
@@ -55,9 +61,6 @@ if get_option('enable-fp16')
       nntrainer_inc += include_directories('hgemm')
       nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
     endif
-  elif get_option('enable-avx')
-    tensor_sources += 'blas_avx.cpp'
-    tensor_headers += 'blas_avx.h'
   endif
 endif
 
diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index b14bbd7ae4..f9db2e2ab0 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -3820,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const {
   return;
 }
 
+bool Tensor::hasNaN() const {
+  if (getDataType() == Tdatatype::FP16) {
+#ifdef ENABLE_FP16
+    return has_nan(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>());
+#else
+    throw std::invalid_argument("enable-fp16 is not set");
+#endif
+  } else {
+    return has_nan(dim.getDataLen(), Tdatatype::FP32, getData());
+  }
+}
+
 // namespace nntrainer
 
 } /* namespace nntrainer */
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 2ea0393e66..968ec4d502 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -2038,6 +2038,12 @@ class Tensor {
 
   static constexpr float epsilon = 1e-5;
 
+  /**
+   * @brief  check if the tensor contains a NaN element
+   * @return true if there is a NaN element, false otherwise
+   */
+  bool hasNaN() const;
+
 private:
   /**< handle the data as a std::shared_ptr type */
   TensorDim dim;
diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec
index 36ba371d22..2f1dc57f68 100644
--- a/packaging/nntrainer.spec
+++ b/packaging/nntrainer.spec
@@ -65,6 +65,13 @@
 %define neon_support -Denable-neon=false
 %endif # arch aarch64
 
+%ifarch x86_64
+%define enable_avx 1
+%define avx_support -Denable-avx=true
+%else
+%define avx_support -Denable-avx=false
+%endif # arch x86_64
+
 Name:		nntrainer
 Summary:	Software framework for training neural networks
 
@@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \
       %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \
       -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \
       -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \
-      %{fp16_support} %{neon_support} build
+      %{fp16_support} %{neon_support} %{avx_support} build
 
 ninja -C build %{?_smp_mflags}
 
@@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
 %{_includedir}/nntrainer/util_simd_neon.h
 %endif
 
+%if 0%{?enable_avx}
+%{_includedir}/nntrainer/blas_avx.h
+%endif
+
 %files devel-static
 %{_libdir}/libnntrainer*.a
 %exclude %{_libdir}/libcapi*.a
diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build
index 4a6e81e65d..3f17369f94 100644
--- a/test/unittest/models/meson.build
+++ b/test/unittest/models/meson.build
@@ -1,4 +1,5 @@
 test_name = 'unittest_models'
+mixed_test_name = 'unittest_mixed_models'
 
 test_target = []
 
@@ -11,8 +12,28 @@ models_targets = [
   # disable temperally
 ]
 
+mixed_test_targets = [
+  'models_test_utils.cpp',
+  'models_golden_test.cpp',
+  'unittest_models_mixed_precision.cpp',
+]
+
 if get_option('enable-fp16')
-  models_targets += 'unittest_models_mixed_precision.cpp'
+  mixed_exe = executable(
+    mixed_test_name,
+    mixed_test_targets,
+    include_directories: include_directories('.'),
+    dependencies: [
+      nntrainer_test_main_deps, nntrainer_ccapi_dep
+    ],
+    install: get_option('enable-test'),
+    install_dir: application_install_dir
+  )
+
+  test(mixed_test_name, mixed_exe,
+       args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name),
+       timeout: test_timeout
+  )
 endif
 
 test_target += models_targets