diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index fbcf899ad5..2a190150fb 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -387,7 +387,7 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -402,7 +402,7 @@ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -417,7 +417,7 @@ void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] + static_cast<_FP16>(beta) * Z[i]; else @@ -432,7 +432,7 @@ void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z, nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) + static_cast<_FP16>(beta) * Z[i]; else @@ -943,7 +943,7 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -957,7 +957,7 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -971,7 +971,7 @@ void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -986,7 +986,7 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta); #else for (unsigned int i = 0; i < N; ++i) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 7682ae4182..3e9ff0fcfc 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -385,7 +385,7 @@ unsigned int isamax(const unsigned int N, const float *X, const int incX); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief cosine with neon: Y = cos(alpha * X) @@ -394,7 +394,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief inversed squared root transformation inplace : X = 1 / sqrt(X) diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index bfe49d8422..4dc9708d56 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -447,14 +447,14 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -474,14 +474,14 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -501,14 +501,14 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -528,14 +528,14 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, y0_3 = vmulq_f32(y0_3, alpha_vec); } float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec); vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3)); } else vst1q_f32(&Z[i], xy0_3); } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); @@ -2134,7 +2134,7 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vmulq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2142,7 +2142,7 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = alpha * X[i] * Y[i] + beta * Z[i]; else Z[i] = alpha * X[i] * Y[i]; @@ -2162,7 +2162,7 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vaddq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2170,7 +2170,7 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] + alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] + alpha * Y[i]; @@ -2190,7 +2190,7 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vsubq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2198,7 +2198,7 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] - alpha * Y[i] + beta * Z[i]; else Z[i] = X[i] - alpha * Y[i]; @@ -2218,7 +2218,7 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, y0_7 = vmulq_f16(y0_7, alpha_vec); } float16x8_t xy0_7 = vdivq_f16(x0_7, y0_7); - if (beta != 0.f) { + if (std::abs(beta) > __FLT_MIN__) { float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec); vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7)); } else { @@ -2226,7 +2226,7 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z, } } while (i < N) { - if (beta != 0.f) + if (std::abs(beta) > __FLT_MIN__) Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i]; else Z[i] = X[i] / (alpha * Y[i]); diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index 4a7af67e4e..92a48124f1 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -79,7 +79,7 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief cosine with neon: Y = cos(alpha * X) @@ -88,7 +88,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0); * @param[in] Y float * for Vector Y * @param[in] alpha float * for scaling angle (radian) */ -void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0); +void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f); /** * @brief inversed squared root transformation with neon : X = 1 / sqrt(X)