[ hdot ] Use precision-enhanced hdot
- The previous hdot implementation accumulated entirely in fp16.
- Since this is also a dimension-shrinking (reduction) computation, it should use intermediate fp32 values to enhance precision.
- This was not detected because the unit test only used small-dimension tensors. Add a higher-dimension test case accordingly.
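For context, here is a minimal scalar sketch of why an intermediate fp32 accumulator matters once the reduced dimension grows. It is illustrative only and not part of this commit; the function names are hypothetical, and it assumes a toolchain where `__fp16` is available (e.g. GCC/Clang on AArch64).

```cpp
// Illustrative only: contrasts an fp16 accumulator with an intermediate fp32
// accumulator for a dot product over fp16 data.
__fp16 dot_fp16_accumulator(unsigned int N, const __fp16 *X, const __fp16 *Y) {
  __fp16 acc = 0; // rounded back to fp16 every iteration; error grows with N
  for (unsigned int i = 0; i < N; ++i)
    acc += X[i] * Y[i];
  return acc;
}

__fp16 dot_fp32_accumulator(unsigned int N, const __fp16 *X, const __fp16 *Y) {
  float acc = 0.F; // intermediate fp32 sum, converted to fp16 only once
  for (unsigned int i = 0; i < N; ++i)
    acc += static_cast<float>(X[i]) * static_cast<float>(Y[i]);
  return static_cast<__fp16>(acc);
}
```

With N in the hundreds (e.g. the 768-wide case added below), the first variant can drift visibly because fp16 carries only ~11 significand bits, while the second stays within the tolerance the new test checks.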

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <[email protected]>
skykongkong8 authored and jijoongmoon committed May 10, 2024
1 parent 35c4491 commit 72ee686
Showing 2 changed files with 76 additions and 34 deletions.
46 changes: 12 additions & 34 deletions nntrainer/tensor/blas_neon.cpp
@@ -1192,51 +1192,29 @@ void haxpy(const unsigned int N, const float alpha, const __fp16 *X,
 }
 
 __fp16 hdot(const unsigned int N, const __fp16 *X, const __fp16 *Y) {
-
-  float16x8_t accX8 = vmovq_n_f16(0);
-  float16x4_t accX4 = vmov_n_f16(0);
+  float32x4_t accX0_3 = vmovq_n_f32(0.F);
+  float32x4_t accX4_7 = vmovq_n_f32(0.F);
 
   unsigned int idx = 0;
-  __fp16 ret = 0;
+  unsigned int N8 = (N >> 3) << 3;
+  float ret = 0;
 
-  // processing batch of 8
-  for (; (N - idx) >= 8; idx += 8) {
+  // Adaptive loop for batch size of 8
+  for (; idx < N8; idx += 8) {
     float16x8_t x = vld1q_f16(&X[idx]);
     float16x8_t y = vld1q_f16(&Y[idx]);
 
-    // x*y + accX8 -> accX8
-    accX8 = vfmaq_f16(accX8, x, y);
-  }
-
-  // check at least one batch of 8 is processed
-  if (N - 8 >= 0) {
-    __fp16 result[8];
-    vst1q_f16(result, accX8);
-    for (unsigned int i = 0; i < 8; i++)
-      ret += result[i];
-  }
-
-  // processing remaining batch of 4
-  for (; (N - idx) >= 4; idx += 4) {
-    float16x4_t x = vld1_f16(&X[idx]);
-    float16x4_t y = vld1_f16(&Y[idx]);
-
-    // x*y + accX4 -> accX4
-    accX4 = vfma_f16(accX4, x, y);
-  }
-
-  // check at least one batch of 4 is processed
-  if (N % 8 >= 4) {
-    __fp16 result[4];
-    vst1_f16(result, accX4);
-    ret += result[0] + result[1] + result[2] + result[3];
+    x = vmulq_f16(x, y);
+    accX0_3 = vaddq_f32(accX0_3, vcvt_f32_f16(vget_low_f16(x)));
+    accX4_7 = vaddq_f32(accX4_7, vcvt_f32_f16(vget_high_f16(x)));
   }
+  ret += vaddvq_f32(accX0_3) + vaddvq_f32(accX4_7);
 
-  // pocessing remaining values
+  // Loop for remaining indices
   for (; idx < N; idx++)
     ret += X[idx] * Y[idx];
 
-  return ret;
+  return static_cast<__fp16>(ret);
 }
 
 __fp16 hnrm2(const unsigned int N, const __fp16 *X) {
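The new kernel multiplies in fp16, widens each half of the product vector to fp32 (`vcvt_f32_f16` on the low and high `float16x4_t` halves), accumulates in two independent fp32 vectors, and only folds them with `vaddvq_f32` at the end. A minimal standalone harness along those lines, checked against a double-precision reference, might look like the sketch below. This is a hypothetical file, not part of this commit, and assumes an AArch64 compiler with `-march=armv8.2-a+fp16` so that `vmulq_f16` is available.

```cpp
#include <arm_neon.h>
#include <cmath>
#include <cstdio>
#include <vector>

// Same widen-then-accumulate idiom as the new hdot, kept deliberately simple.
static float dot_widen_to_f32(unsigned int N, const __fp16 *X, const __fp16 *Y) {
  float32x4_t acc_lo = vmovq_n_f32(0.F); // fp32 partial sums, lanes 0-3
  float32x4_t acc_hi = vmovq_n_f32(0.F); // fp32 partial sums, lanes 4-7
  unsigned int idx = 0;
  unsigned int N8 = (N >> 3) << 3;
  for (; idx < N8; idx += 8) {
    float16x8_t p = vmulq_f16(vld1q_f16(&X[idx]), vld1q_f16(&Y[idx]));
    acc_lo = vaddq_f32(acc_lo, vcvt_f32_f16(vget_low_f16(p)));
    acc_hi = vaddq_f32(acc_hi, vcvt_f32_f16(vget_high_f16(p)));
  }
  float ret = vaddvq_f32(acc_lo) + vaddvq_f32(acc_hi);
  for (; idx < N; ++idx) // scalar tail when N is not a multiple of 8
    ret += static_cast<float>(X[idx]) * static_cast<float>(Y[idx]);
  return ret;
}

int main() {
  const unsigned int N = 768;
  std::vector<__fp16> x(N), y(N);
  double ref = 0.0;
  for (unsigned int i = 0; i < N; ++i) {
    x[i] = static_cast<__fp16>(((i + 1) % 10) * 0.1F);
    y[i] = static_cast<__fp16>(((3 * i + 1) % 10) * 0.1F);
    ref += static_cast<double>(x[i]) * static_cast<double>(y[i]);
  }
  float got = dot_widen_to_f32(N, x.data(), y.data());
  std::printf("reference=%f neon=%f |err|=%f\n", ref, got, std::fabs(ref - got));
  return 0;
}
```

Keeping two accumulator registers also lets the two `vaddq_f32` dependency chains run independently, a common throughput trick for vector reductions.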
64 changes: 64 additions & 0 deletions test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp
@@ -120,6 +120,70 @@ TEST(nntrainer_Tensor, dot) {
EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
}

TEST(nntrainer_Tensor, hdot_768) {

nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};

nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};

// conditions for fp16 hdot call:
// this->(batch * channel * height) = arg->(width) = 1;
size_t batch = 1;
size_t channel = 1;
size_t height = 1;
size_t width = 768;

nntrainer::Tensor input(
nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16));

nntrainer::Tensor input_2(
nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp16));

nntrainer::Tensor input_fp32(
nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32));

nntrainer::Tensor input_fp32_2(
nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp32));

const float alpha = 1e-1;
const int MOD = 10;

GEN_TEST_INPUT(input, ((i * j * (batch * height * channel) +
j * (batch * height) + k * (width) + l + 1) %
MOD) *
alpha);
GEN_TEST_INPUT(input_fp32, ((i * j * (batch * height * channel) +
j * (batch * height) + k * (width) + l + 1) %
MOD) *
alpha);
GEN_TEST_INPUT(input_2, ((i * k * (batch * height * channel) +
j * (batch * height) + k * (width) + l + 1) %
MOD) *
alpha);
GEN_TEST_INPUT(input_fp32_2, ((i * k * (batch * height * channel) +
j * (batch * height) + k * (width) + l + 1) %
MOD) *
alpha);

nntrainer::Tensor result_neon = input.dot(input_2, false, false);
nntrainer::Tensor result_fp32 = input_fp32.dot(input_fp32_2, false, false);

float mseErrorNeon =
mse<__fp16>(result_neon.getData<__fp16>(), result_fp32.getData<float>(),
result_neon.size());

double cosSimNeon =
cosine_similarity<__fp16>(result_neon.getData<__fp16>(),
result_fp32.getData<float>(), result_neon.size());

const float epsilon = 1e-3;

EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
}

TEST(nntrainer_Tensor, l2norm) {

nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
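The new hdot_768 test judges the fp16 dot product against the fp32 result with an MSE bound and a cosine-similarity bound. The `mse<T>` and `cosine_similarity<T>` helpers come from the nntrainer unit-test utilities; as a rough mental model only (signatures inferred from the calls above, details may differ), they compute something like:

```cpp
#include <cmath>

// Hedged sketch; the actual helpers in the nntrainer test utilities may differ.
template <typename T>
float mse(const T *a, const float *b, unsigned int len) {
  float sum = 0.F;
  for (unsigned int i = 0; i < len; ++i) {
    float d = static_cast<float>(a[i]) - b[i];
    sum += d * d; // squared error per element
  }
  return len ? sum / len : 0.F;
}

template <typename T>
double cosine_similarity(const T *a, const float *b, unsigned int len) {
  double dot = 0.0, na = 0.0, nb = 0.0;
  for (unsigned int i = 0; i < len; ++i) {
    dot += static_cast<double>(a[i]) * b[i];
    na += static_cast<double>(a[i]) * a[i];
    nb += static_cast<double>(b[i]) * b[i];
  }
  return (na > 0.0 && nb > 0.0) ? dot / (std::sqrt(na) * std::sqrt(nb)) : 0.0;
}
```

With width = 768, the old fully-fp16 accumulation would fall outside these bounds, whereas the intermediate-fp32 version keeps the MSE below 1e-3 and the cosine similarity near 1.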
