diff --git a/test/unittest/unittest_blas_kernels_cl.cpp b/test/unittest/unittest_blas_kernels_cl.cpp
index ab1c8a03f..6a7ccc93a 100644
--- a/test/unittest/unittest_blas_kernels_cl.cpp
+++ b/test/unittest/unittest_blas_kernels_cl.cpp
@@ -75,6 +75,54 @@ TEST(blas_kernels, dotCL_sgemv_M_1_1) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (1 x 768) and B (2048 x 768) with transB = true
+TEST(blas_kernels, dotCL_sgemv_M_1_1_fp16) {
+  setUpGpuContext();
+  int batch = 1;
+  int channel = 1;
+  int height = 1;
+  int width = 768;
+
+  int height_b = 2048;
+  int width_b = 768;
+
+  bool transA = false;
+  bool transB = true;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_B(B_fp16, ((i * (batch * height_b * channel) +
+                             j * (batch * height_b) + k * (width_b) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  nntrainer::Tensor C = dotCl(A_fp16, B_fp16, transA, transB);
+  nntrainer::Tensor C_fp16 = A_fp16.dot(B_fp16, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(blas_kernels, dotCL_sgemv_M_1_2) {
   int batch = 1;
   int channel = 1;
@@ -120,6 +168,53 @@ TEST(blas_kernels, dotCL_sgemv_M_1_2) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (1 x 768) and B (768 x 2048), neither operand transposed
+TEST(blas_kernels, dotCL_sgemv_M_1_2_fp16) {
+  int batch = 1;
+  int channel = 1;
+  int height = 1;
+  int width = 768;
+
+  int height_b = 768;
+  int width_b = 2048;
+
+  bool transA = false;
+  bool transB = false;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_B(B_fp16, ((i * (batch * height_b * channel) +
+                             j * (batch * height_b) + k * (width_b) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  nntrainer::Tensor C = dotCl(A_fp16, B_fp16, transA, transB);
+  nntrainer::Tensor C_fp16 = A_fp16.dot(B_fp16, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(blas_kernels, dotCL_sgemv_N_1_1) {
   int batch = 1;
   int channel = 1;
@@ -165,6 +260,53 @@ TEST(blas_kernels, dotCL_sgemv_N_1_1) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (768 x 2048) with transA = true and B (768 x 1)
+TEST(blas_kernels, dotCL_sgemv_N_1_1_fp16) {
+  int batch = 1;
+  int channel = 1;
+  int height = 768;
+  int width = 2048;
+
+  int height_b = 768;
+  int width_b = 1;
+
+  bool transA = true;
+  bool transB = false;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_B(B_fp16, ((i * (batch * height_b * channel) +
+                             j * (batch * height_b) + k * (width_b) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  nntrainer::Tensor C = dotCl(A_fp16, B_fp16, transA, transB);
+  nntrainer::Tensor C_fp16 = A_fp16.dot(B_fp16, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(blas_kernels, dotCL_sgemv_N_1_2) {
   int batch = 1;
   int channel = 1;
@@ -210,6 +352,53 @@ TEST(blas_kernels, dotCL_sgemv_N_1_2) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (768 x 2048) and B (2048 x 1), neither operand transposed
+TEST(blas_kernels, dotCL_sgemv_N_1_2_fp16) {
+  int batch = 1;
+  int channel = 1;
+  int height = 768;
+  int width = 2048;
+
+  int height_b = 2048;
+  int width_b = 1;
+
+  bool transA = false;
+  bool transB = false;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_B(B_fp16, ((i * (batch * height_b * channel) +
+                             j * (batch * height_b) + k * (width_b) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  nntrainer::Tensor C = dotCl(A_fp16, B_fp16, transA, transB);
+  nntrainer::Tensor C_fp16 = A_fp16.dot(B_fp16, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(blas_kernels, dotCL_sgemv_n) {
 
   int batch = 1;
@@ -244,6 +433,42 @@ TEST(blas_kernels, dotCL_sgemv_n) {
   EXPECT_THROW(dotCl(A_fp32, B_fp32, transA, transB), std::runtime_error);
 }
 
+/// @brief FP16 negative case: dotCl() is expected to throw std::runtime_error
+///        for A (1 x 768) with transA = true and B (768 x 2048)
+TEST(blas_kernels, dotCL_sgemv_n_fp16) {
+
+  int batch = 1;
+  int channel = 1;
+  int height = 1;
+  int width = 768;
+
+  int height_b = 768;
+  int width_b = 2048;
+
+  bool transA = true;
+  bool transB = false;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_B(B_fp16, ((i * (batch * height_b * channel) +
+                             j * (batch * height_b) + k * (width_b) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  EXPECT_THROW(dotCl(A_fp16, B_fp16, transA, transB), std::runtime_error);
+}
+
 TEST(nntrainer_Tensor, multiply_i) {
 
   int batch = 1;
@@ -347,6 +572,55 @@ TEST(nntrainer_Tensor, dot_gemm_50_768_1024_noTrans) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (50 x 768) and B (768 x 1024), neither operand transposed
+TEST(nntrainer_Tensor, dot_gemm_50_768_1024_noTrans_fp16) {
+  /// @note GEMM : A X B = C
+
+  int batch = 1;
+  int channel = 1;
+  int height = 50;
+  int width = 768;
+
+  int height_b = 768;
+  int width_b = 1024;
+
+  bool transA = false;
+  bool transB = false;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) +
+                      k * (width) + l + 1) %
+                     MOD) *
+                      alpha);
+  GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) +
+                        j * (batch * height_b) + k * (width_b) + l + 1) %
+                       MOD) *
+                        alpha);
+
+  nntrainer::Tensor C = dotCl(A, B, transA, transB);
+  nntrainer::Tensor C_fp16 = A.dot(B, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transB) {
   /// @note GEMM : A X B = C
 
@@ -408,6 +682,55 @@ TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transB) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: dotCl() result is checked against Tensor::dot() for
+///        A (50 x 768) and B (2048 x 768) with transB = true
+TEST(nntrainer_Tensor, dot_gemm_50_768_2048_transB_fp16) {
+  /// @note GEMM : A X B = C
+
+  int batch = 1;
+  int channel = 1;
+  int height = 50;
+  int width = 768;
+
+  int height_b = 2048;
+  int width_b = 768;
+
+  bool transA = false;
+  bool transB = true;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) +
+                      k * (width) + l + 1) %
+                     MOD) *
+                      alpha);
+  GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) +
+                        j * (batch * height_b) + k * (width_b) + l + 1) %
+                       MOD) *
+                        alpha);
+
+  nntrainer::Tensor C = dotCl(A, B, transA, transB);
+  nntrainer::Tensor C_fp16 = A.dot(B, transA, transB);
+
+  float mseErrorNeon =
+    mse<__fp16>(C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    C.getData<__fp16>(), C_fp16.getData<__fp16>(), C.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 TEST(nntrainer_Tensor, dot_gemm_50_768_1024_transA) {
   /// @note GEMM : A X B = C
 
@@ -582,6 +905,60 @@ TEST(blas_kernels, addition_i) {
   EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
 }
 
+/// @brief FP16: add_i_cl(C, D) result is checked against A.add_i(B), where
+///        A/C are 12 x 1 x 26 x 26 and B/D are 1 x 1 x 26 x 26
+TEST(blas_kernels, addition_i_fp16) {
+
+  int batch = 12;
+  int channel = 1;
+  int height = 26;
+  int width = 26;
+
+  int batch_b = 1;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp16 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16};
+
+  nntrainer::Tensor A_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor B_fp16(batch_b, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor C_fp16(batch, channel, height, width, t_type_nchw_fp16);
+  nntrainer::Tensor D_fp16(batch_b, channel, height, width, t_type_nchw_fp16);
+
+  GEN_TEST_INPUT(A_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_C(B_fp16, ((i * (batch_b * height * channel) +
+                             j * (batch_b * height) + k * (width) + l + 1) %
+                            MOD) *
+                             alpha);
+  GEN_TEST_INPUT(C_fp16, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_C(D_fp16, ((i * (batch_b * height * channel) +
+                             j * (batch_b * height) + k * (width) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  A_fp16.add_i(B_fp16);
+  add_i_cl(C_fp16, D_fp16);
+
+  float mseErrorNeon = mse<__fp16>(A_fp16.getData<__fp16>(),
+                                   C_fp16.getData<__fp16>(), A_fp16.size());
+
+  double cosSimNeon = cosine_similarity<__fp16>(
+    A_fp16.getData<__fp16>(), C_fp16.getData<__fp16>(), A_fp16.size());
+
+  const float epsilon = 1e-3 * width;
+
+  EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1);
+}
+
 GTEST_API_ int main(int argc, char **argv) {
   int result = -1;
 