Skip to content

Commit

Permalink
[ BLAS ] Fix beta comparison logic
Browse files Browse the repository at this point in the history
- Following the discussion in #2473, we found a better way of comparing the float scalar multiplier `beta` against zero: use `std::abs(beta) > __FLT_MIN__` (i.e., treat `beta` as zero when its magnitude does not exceed the smallest normalized float) instead of the exact comparison `beta != 0.f`.

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
  • Loading branch information
skykongkong8 committed Feb 19, 2024
1 parent ae4e50e commit 3cd78b1
Show file tree
Hide file tree
Showing 4 changed files with 28 additions and 28 deletions.
16 changes: 8 additions & 8 deletions nntrainer/tensor/blas_interface.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -387,7 +387,7 @@ void ele_mul(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = static_cast<_FP16>(alpha) * X[i] * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Expand All @@ -402,7 +402,7 @@ void ele_add(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + static_cast<_FP16>(alpha) * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Expand All @@ -417,7 +417,7 @@ void ele_sub(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - static_cast<_FP16>(alpha) * Y[i] +
static_cast<_FP16>(beta) * Z[i];
else
Expand All @@ -432,7 +432,7 @@ void ele_div(const unsigned int N, const _FP16 *X, const _FP16 *Y, _FP16 *Z,
nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (static_cast<_FP16>(alpha) * Y[i]) +
static_cast<_FP16>(beta) * Z[i];
else
Expand Down Expand Up @@ -943,7 +943,7 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
nntrainer::neon::ele_mul(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = alpha * X[i] * Y[i] + beta * Z[i];
else
Z[i] = alpha * X[i] * Y[i];
Expand All @@ -957,7 +957,7 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
nntrainer::neon::ele_add(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] + alpha * Y[i];
Expand All @@ -971,7 +971,7 @@ void ele_sub(const unsigned int N, const float *X, const float *Y, float *Z,
nntrainer::neon::ele_sub(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] - alpha * Y[i];
Expand All @@ -986,7 +986,7 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
nntrainer::neon::ele_div(N, X, Y, Z, alpha, beta);
#else
for (unsigned int i = 0; i < N; ++i) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i];
else
Z[i] = X[i] / (alpha * Y[i]);
Expand Down
4 changes: 2 additions & 2 deletions nntrainer/tensor/blas_interface.h
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ unsigned int isamax(const unsigned int N, const float *X, const int incX);
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief cosine with neon: Y = cos(alpha * X)
Expand All @@ -394,7 +394,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief inversed squared root transformation inplace : X = 1 / sqrt(X)
Expand Down
32 changes: 16 additions & 16 deletions nntrainer/tensor/blas_neon.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -447,14 +447,14 @@ void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
y0_3 = vmulq_f32(y0_3, alpha_vec);
}
float32x4_t xy0_3 = vmulq_f32(x0_3, y0_3);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec);
vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3));
} else
vst1q_f32(&Z[i], xy0_3);
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = alpha * X[i] * Y[i] + beta * Z[i];
else
Z[i] = alpha * X[i] * Y[i];
Expand All @@ -474,14 +474,14 @@ void ele_add(const unsigned int N, const float *X, const float *Y, float *Z,
y0_3 = vmulq_f32(y0_3, alpha_vec);
}
float32x4_t xy0_3 = vaddq_f32(x0_3, y0_3);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec);
vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3));
} else
vst1q_f32(&Z[i], xy0_3);
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] + alpha * Y[i];
Expand All @@ -501,14 +501,14 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
y0_3 = vmulq_f32(y0_3, alpha_vec);
}
float32x4_t xy0_3 = vsubq_f32(x0_3, y0_3);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec);
vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3));
} else
vst1q_f32(&Z[i], xy0_3);
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] - alpha * Y[i];
Expand All @@ -528,14 +528,14 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
y0_3 = vmulq_f32(y0_3, alpha_vec);
}
float32x4_t xy0_3 = vdivq_f32(x0_3, y0_3);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float32x4_t z0_3 = vmulq_f32(vld1q_f32(&Z[i]), beta_vec);
vst1q_f32(&Z[i], vaddq_f32(z0_3, xy0_3));
} else
vst1q_f32(&Z[i], xy0_3);
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i];
else
Z[i] = X[i] / (alpha * Y[i]);
Expand Down Expand Up @@ -2134,15 +2134,15 @@ void ele_mul(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z,
y0_7 = vmulq_f16(y0_7, alpha_vec);
}
float16x8_t xy0_7 = vmulq_f16(x0_7, y0_7);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec);
vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7));
} else {
vst1q_f16(&Z[i], xy0_7);
}
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = alpha * X[i] * Y[i] + beta * Z[i];
else
Z[i] = alpha * X[i] * Y[i];
Expand All @@ -2162,15 +2162,15 @@ void ele_add(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z,
y0_7 = vmulq_f16(y0_7, alpha_vec);
}
float16x8_t xy0_7 = vaddq_f16(x0_7, y0_7);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec);
vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7));
} else {
vst1q_f16(&Z[i], xy0_7);
}
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] + alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] + alpha * Y[i];
Expand All @@ -2190,15 +2190,15 @@ void ele_sub(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z,
y0_7 = vmulq_f16(y0_7, alpha_vec);
}
float16x8_t xy0_7 = vsubq_f16(x0_7, y0_7);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec);
vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7));
} else {
vst1q_f16(&Z[i], xy0_7);
}
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] - alpha * Y[i] + beta * Z[i];
else
Z[i] = X[i] - alpha * Y[i];
Expand All @@ -2218,15 +2218,15 @@ void ele_div(const unsigned int N, const __fp16 *X, const __fp16 *Y, __fp16 *Z,
y0_7 = vmulq_f16(y0_7, alpha_vec);
}
float16x8_t xy0_7 = vdivq_f16(x0_7, y0_7);
if (beta != 0.f) {
if (std::abs(beta) > __FLT_MIN__) {
float16x8_t z0_7 = vmulq_f16(vld1q_f16(&Z[i]), beta_vec);
vst1q_f16(&Z[i], vaddq_f16(z0_7, xy0_7));
} else {
vst1q_f16(&Z[i], xy0_7);
}
}
while (i < N) {
if (beta != 0.f)
if (std::abs(beta) > __FLT_MIN__)
Z[i] = X[i] / (alpha * Y[i]) + beta * Z[i];
else
Z[i] = X[i] / (alpha * Y[i]);
Expand Down
4 changes: 2 additions & 2 deletions nntrainer/tensor/blas_neon.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
void sine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief cosine with neon: Y = cos(alpha * X)
Expand All @@ -88,7 +88,7 @@ void sine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
* @param[in] Y float * for Vector Y
* @param[in] alpha float * for scaling angle (radian)
*/
void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.0);
void cosine(const unsigned int N, float *X, float *Y, float alpha = 1.f);

/**
* @brief inversed squared root transformation with neon : X = 1 / sqrt(X)
Expand Down

0 comments on commit 3cd78b1

Please sign in to comment.