diff --git a/reference_designs/ipu-xrt/matrix_multiplication/aie2.py b/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
index 5e9dbeab9c..aa82582442 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
+++ b/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
@@ -16,7 +16,7 @@ def my_matmul():
     K = 256
     N = 256
     m = 64
-    k = 32
+    k = 64
     n = 64
     r = 4
     s = 8
@@ -26,7 +26,7 @@ def my_matmul():
     vectorized = True
     enable_tracing = False
-    trace_size = 8192
+    trace_size = 16384
 
     A_sz_in_i32s = M * K * word_size_in // 4
     B_sz_in_i32s = K * N * word_size_in // 4
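The aie2.py change above doubles the per-core K tile (k = 32 -> 64) while keeping m = n = 64 for the M = K = N = 256 problem, and enlarges the trace buffer. The following is a minimal standalone sketch, not part of the patch, of the per-tile memory footprints this implies; it assumes bf16 inputs (2 bytes) and float accumulators (4 bytes), matching the 4x8x4 bf16 kernel the design calls below.

#include <cstdio>

int main() {
  constexpr int M = 256, K = 256, N = 256; // whole-problem sizes from aie2.py
  constexpr int m = 64, k = 64, n = 64;    // per-core tile sizes (k was 32)
  constexpr int in_bytes = 2;              // assumed: bf16 input elements
  constexpr int out_bytes = 4;             // assumed: float accumulator elements

  const int a_tile = m * k * in_bytes;     // one A tile in core memory
  const int b_tile = k * n * in_bytes;     // one B tile in core memory
  const int c_tile = m * n * out_bytes;    // one C accumulator tile
  std::printf("A tile %d B, B tile %d B, C tile %d B, total %d B\n", a_tile,
              b_tile, c_tile, a_tile + b_tile + c_tile);
  std::printf("K steps per C tile: %d, C tile grid: %d x %d\n", K / k, M / m,
              N / n);
  return 0;
}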
diff --git a/reference_designs/ipu-xrt/matrix_multiplication/mm.cc b/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
index ecb2d0f9b8..8b7732fdaf 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
+++ b/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
@@ -64,7 +64,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
       for (unsigned j = 0; j < colB; j += 2)
         // chess_loop_range(2, ) {
-        chess_prepare_for_pipelining chess_loop_range(16, ) {
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
           const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
           const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
           const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
@@ -103,6 +103,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
           for (unsigned i = 1; i < colA; ++i)
             chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
              A0 = aie::load_v<MMUL::size_A>(pA1);
              pA1 += MMUL::size_A;
              A1 = aie::load_v<MMUL::size_A>(pA2);
@@ -133,8 +134,8 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
 template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
           unsigned colB, unsigned r, unsigned s, unsigned t>
-void matmul_vectorized_unroll(const T_in *__restrict pA,
-                              const T_in *__restrict pB, T_out *__restrict pC) {
+void matmul_vectorized_2x2(const T_in *__restrict pA, const T_in *__restrict pB,
+                           T_out *__restrict pC) {
   using MMUL = aie::mmul<r, s, t, T_in, T_in>;
 
   event0();
@@ -149,32 +150,42 @@ void matmul_vectorized_unroll(const T_in *__restrict pA,
   // is laid out contiguously in row-major). An element in row 0, column 4
   // would be stored at offset 16 in the same example.
 
-  for (unsigned z = 0; z < rowA; z += 2)
+  for (unsigned z = 0; z < rowA; z += 4)
     chess_loop_range(2, ) {
       T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
-      T_out *__restrict pC1b =
-          pC + (z * colB + 0) * MMUL::size_C + MMUL::size_C;
       T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
-      T_out *__restrict pC2b =
-          pC + ((z + 1) * colB + 0) * MMUL::size_C + MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C;
 
-      for (unsigned j = 0; j < colB; j += 2)
-        // chess_modulo_scheduling_budget_ratio(19000)
-        // chess_peel_pipelined_loop(1)
+      for (unsigned j = 0; j < colB; j += 4)
+        // chess_loop_range(2, ) {
         chess_prepare_for_pipelining chess_loop_range(8, ) {
           const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
           const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A;
+
           const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
           const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + (0 * colB + (j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + (0 * colB + (j + 3)) * MMUL::size_B;
 
           aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
           pA1 += MMUL::size_A;
           aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
           pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += MMUL::size_A;
           aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
           pB1 += MMUL::size_B * colB;
           aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
           pB2 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B * colB;
 
           // We modify the library documentation implementation to accumulate
           // in the C dimension, since this vectorized kernel will be called
@@ -182,72 +193,156 @@ void matmul_vectorized_unroll(const T_in *__restrict pA,
           aie::vector<T_out, MMUL::size_C> acc_C00 =
               aie::load_v<MMUL::size_C>(pC1);
           aie::vector<T_out, MMUL::size_C> acc_C01 =
-              aie::load_v<MMUL::size_C>(pC1b);
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C);
+
           aie::vector<T_out, MMUL::size_C> acc_C10 =
               aie::load_v<MMUL::size_C>(pC2);
           aie::vector<T_out, MMUL::size_C> acc_C11 =
-              aie::load_v<MMUL::size_C>(pC2b);
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C);
 
           MMUL C00(acc_C00);
           MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
           MMUL C10(acc_C10);
           MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
 
           C00.mac(A0, B0);
           C01.mac(A0, B1);
           C10.mac(A1, B0);
           C11.mac(A1, B1);
 
-          aie::vector<T_in, MMUL::size_A> A0b = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          aie::vector<T_in, MMUL::size_A> A1b = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          aie::vector<T_in, MMUL::size_B> B0b = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          aie::vector<T_in, MMUL::size_B> B1b = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
 
-          C00.mac(A0b, B0b);
-          C01.mac(A0b, B1b);
-          C10.mac(A1b, B0b);
-          C11.mac(A1b, B1b);
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
 
-          A0 = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          A1 = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          B0 = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          B1 = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
 
-          C00.mac(A0, B0);
-          C01.mac(A0, B1);
-          C10.mac(A1, B0);
-          C11.mac(A1, B1);
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += MMUL::size_A;
 
-          A0b = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          A1b = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          B0b = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          B1b = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B * colB;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B * colB;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B * colB;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B * colB;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
 
-          C00.mac(A0b, B0b);
-          C01.mac(A0b, B1b);
-          C10.mac(A1b, B0b);
-          C11.mac(A1b, B1b);
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
 
           aie::store_v(pC1, C00.template to_vector<T_out>());
-          pC1 += 2 * MMUL::size_C;
-          aie::store_v(pC1b, C01.template to_vector<T_out>());
-          pC1b += 2 * MMUL::size_C;
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+
           aie::store_v(pC2, C10.template to_vector<T_out>());
-          pC2 += 2 * MMUL::size_C;
-          aie::store_v(pC2b, C11.template to_vector<T_out>());
-          pC2b += 2 * MMUL::size_C;
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
         }
     }
@@ -281,8 +376,9 @@ void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
   static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
   static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
   static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
-  return matmul_vectorized_unroll<bfloat16, float, m / r, k / s, n / t, r, s,
-                                  t>(pA, pB, pC);
+  // return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
+  return matmul_vectorized_2x2<bfloat16, float, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
 }
 
 template <unsigned m, unsigned k, unsigned n>
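The mm.cc change above replaces the 2x-unrolled 2x2 micro-kernel with matmul_vectorized_2x2, which now keeps a 4x4 block of r*t output tiles in MMUL accumulators and accumulates along the colA (K) tile dimension before storing. Below is a scalar model of that same blocking and tile-major data layout, with plain loops standing in for aie::mmul and aie::load_v; the function name and index helpers are illustrative assumptions, not the AIE kernel itself.

// Scalar illustration only: same 4x4 register blocking over r*s / s*t / r*t
// tiles as matmul_vectorized_2x2, with tiles stored contiguously in
// tile-row-major order as described in the kernel comment.
template <typename Tin, typename Tout, unsigned rowA, unsigned colA,
          unsigned colB, unsigned r, unsigned s, unsigned t>
void matmul_blocked_model(const Tin *A, const Tin *B, Tout *C) {
  // Element (i, j) of tile (tr, tc), tiles laid out row-major, elements
  // row-major within each tile.
  auto a = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) {
    return A[(tr * colA + tc) * (r * s) + i * s + j];
  };
  auto b = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) {
    return B[(tr * colB + tc) * (s * t) + i * t + j];
  };
  auto c = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) -> Tout & {
    return C[(tr * colB + tc) * (r * t) + i * t + j];
  };
  for (unsigned z = 0; z < rowA; z += 4)    // 4 tile-rows of C per iteration
    for (unsigned j = 0; j < colB; j += 4)  // 4 tile-cols of C per iteration
      for (unsigned zz = 0; zz < 4; ++zz)
        for (unsigned jj = 0; jj < 4; ++jj)
          for (unsigned i = 0; i < colA; ++i)  // accumulate along the K tiles
            for (unsigned ri = 0; ri < r; ++ri)
              for (unsigned ci = 0; ci < t; ++ci)
                for (unsigned si = 0; si < s; ++si)
                  c(z + zz, j + jj, ri, ci) +=
                      static_cast<Tout>(a(z + zz, i, ri, si)) *
                      static_cast<Tout>(b(i, j + jj, si, ci));
}

As in the kernel, C is read, accumulated into, and written back rather than zeroed, so the routine can be invoked repeatedly as the input is tiled further at a higher level.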
diff --git a/reference_designs/ipu-xrt/matrix_multiplication/test.cpp b/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
index 98768b7795..e2fbcfdef9 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
+++ b/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
@@ -42,7 +42,7 @@ constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE));
 
 constexpr bool VERIFY = true;
 constexpr bool ENABLE_TRACING = false;
-constexpr int TRACE_SIZE = 8192;
+constexpr int TRACE_SIZE = 16384;
 
 constexpr int OUT_SIZE = C_SIZE + (ENABLE_TRACING ? TRACE_SIZE : 0);
 
@@ -234,53 +234,87 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
 
-  auto start = std::chrono::system_clock::now();
-  auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_out);
-  run.wait();
-  auto stop = std::chrono::system_clock::now();
-  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  // Reinterpret first C_VOLUME bytes of bufOut as our output C_DATATYPE C
-  // matrix
-  C_DATATYPE *COut = (C_DATATYPE *)bufOut;
+  unsigned num_iter = 10;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
   int errors = 0;
-  int max_errors = 100;
-
-  if (VERIFY) {
-    std::vector<C_DATATYPE> output_ref0;
-    for (uint32_t i = 0; i < C_VOLUME; i++)
-      output_ref0.push_back(0);
-    matmul(AVec, BVec, output_ref0);
-
-    const C_DATATYPE absTol = std::abs(0.1);
-    for (uint32_t i = 0; i < C_VOLUME; i++) {
-      if (std::abs((float)COut[i] - (float)output_ref0[i]) > absTol) {
-        errors++;
-        if (errors < max_errors) {
-          std::cout << "\nerror, id " << i << " expected "
-                    << std::to_string((float)output_ref0[i]) << ", got "
-                    << std::to_string((float)COut[i]) << "\n";
+  float macs = 2.0 * float(M) * float(K) * float(N);
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    auto start = std::chrono::system_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_out);
+    run.wait();
+    auto stop = std::chrono::system_clock::now();
+
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    // Reinterpret first C_VOLUME bytes of bufOut as our output C_DATATYPE C
+    // matrix
+    C_DATATYPE *COut = (C_DATATYPE *)bufOut;
+
+    int max_errors = 100;
+
+    if (VERIFY) {
+      std::cout << "Verifying against reference matmul ..." << std::endl;
+      auto vstart = std::chrono::system_clock::now();
+      std::vector<C_DATATYPE> output_ref0;
+      for (uint32_t i = 0; i < C_VOLUME; i++)
+        output_ref0.push_back(0);
+      matmul(AVec, BVec, output_ref0);
+
+      const C_DATATYPE absTol = std::abs(0.1);
+      for (uint32_t i = 0; i < C_VOLUME; i++) {
+        if (std::abs((float)COut[i] - (float)output_ref0[i]) > absTol) {
+          errors++;
+          if (errors < max_errors) {
+            std::cout << "\nerror, id " << i << " expected "
+                      << std::to_string((float)output_ref0[i]) << ", got "
+                      << std::to_string((float)COut[i]) << "\n";
+          }
         }
       }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      std::cout << "Verify time: " << vtime << "secs." << std::endl;
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
     }
-  } else {
-    std::cout << "WARNING: matmul results not verified." << std::endl;
-  }
 
-  if (ENABLE_TRACING) {
-    write_out_trace(bufOut, vm["trace"].as<std::string>());
+    if (ENABLE_TRACING) {
+      write_out_trace(bufOut, vm["trace"].as<std::string>());
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
   std::cout << std::endl
-            << "NPU matmul time: "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(stop -
-                                                                     start)
-                   .count()
-            << "ms." << std::endl;
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
 
-  if (!errors) {
+  if (VERIFY && !errors) {
     std::cout << "\nPASS!\n\n";
     return 0;
   } else {
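Both test.cpp files now wrap the kernel launch in a 10-iteration loop and report average, minimum, and maximum time plus a GFLOP/s figure computed as 2*M*K*N / (1000 * time_us). Below is a self-contained sketch of just that bookkeeping; run_once() is an assumed placeholder for the XRT kernel(...) launch and run.wait(), which are not reproduced here.

#include <algorithm>
#include <chrono>
#include <iostream>
#include <limits>
#include <thread>

// Placeholder for the real kernel invocation (kernel(...) + run.wait()).
static void run_once() {
  std::this_thread::sleep_for(std::chrono::microseconds(500));
}

int main() {
  constexpr int M = 256, K = 256, N = 256;
  // One multiply-accumulate counted as 2 floating-point ops.
  const float macs = 2.0f * M * K * N;
  const unsigned num_iter = 10;
  float total = 0, tmin = std::numeric_limits<float>::max(), tmax = 0;

  for (unsigned i = 0; i < num_iter; i++) {
    auto start = std::chrono::high_resolution_clock::now();
    run_once();
    auto stop = std::chrono::high_resolution_clock::now();
    float us =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
            .count();
    total += us;
    tmin = std::min(tmin, us);
    tmax = std::max(tmax, us);
  }
  // With time in microseconds, GFLOP/s = macs / (1000 * time_us).
  std::cout << "Avg time: " << total / num_iter
            << " us, avg GFLOP/s: " << macs / (1000 * (total / num_iter))
            << std::endl;
  std::cout << "Min time: " << tmin << " us, max time: " << tmax << " us\n";
  return 0;
}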
diff --git a/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc b/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
index 76b5eb0cd5..8b7732fdaf 100755
--- a/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
+++ b/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
@@ -132,6 +132,223 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
   event1();
 }
 
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized_2x2(const T_in *__restrict pA, const T_in *__restrict pB,
+                           T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in>;
+
+  event0();
+
+  // For int16 (4x4x4), this implementation iterates over the output space in
+  // steps of 4x4 tiles; each iteration makes an r*s, s*t and r*t step in the
+  // input and output space, respectively. The data layout expected is such
+  // that each r*s/s*t/r*t tile's elements are laid out contiguously in
+  // row-major order, and tiles themselves are organized in row-major
+  // order. For example, for 4x4x4 tiles, this means that an element in
+  // row 1, column 0 would be stored at offset 4 (since the first 4x4 tile
+  // is laid out contiguously in row-major). An element in row 0, column 4
+  // would be stored at offset 16 in the same example.
+
+  for (unsigned z = 0; z < rowA; z += 4)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 4)
+        // chess_loop_range(2, ) {
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A;
+
+          const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + (0 * colB + (j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + (0 * colB + (j + 3)) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B * colB;
+
+          // We modify the library documentation implementation to accumulate
+          // in the C dimension, since this vectorized kernel will be called
+          // multiple times as we further tile the input at a higher level.
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
+
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
+
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += MMUL::size_A;
+
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B * colB;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B * colB;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B * colB;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B * colB;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
+
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+        }
+    }
+
+  event1();
+}
+
 template <unsigned m, unsigned k, unsigned n>
 void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA,
                                      const int16 *__restrict pB,
@@ -159,8 +376,9 @@ void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
   static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
   static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
   static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
-  return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
-      pA, pB, pC);
+  // return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
+  return matmul_vectorized_2x2<bfloat16, float, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
 }
 
 template <unsigned m, unsigned k, unsigned n>
diff --git a/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp b/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
index 6743e27613..1c3c58a794 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
+++ b/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
@@ -231,52 +231,86 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
 
-  auto start = std::chrono::high_resolution_clock::now();
-  auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
-  run.wait();
-  auto stop = std::chrono::high_resolution_clock::now();
-  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+  int num_iter = 10;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
   int errors = 0;
-  int max_errors = 100;
-
-  if (VERIFY) {
-    std::vector<C_DATATYPE> output_ref0;
-    for (uint32_t i = 0; i < C_VOLUME; i++)
-      output_ref0.push_back(0);
-    // output_ref0.push_back(K);
-    matmul(AVec, BVec, output_ref0);
-
-    const float absTol = std::abs(0.1);
-    // const float absTol = std::abs(5);
-    for (int row = 0; row < M; row++) {
-      for (int col = 0; col < N; col++) {
-        if (std::abs((float)bufOut[row * N + col] -
-                     (float)output_ref0[row * N + col]) > absTol) {
-          errors++;
-          if (errors < max_errors) {
-            std::cout << "\nerror, row: " << row << " col: " << col
-                      << " expected "
-                      << std::to_string((float)output_ref0[row * N + col])
-                      << ", got "
-                      << std::to_string((float)bufOut[row * N + col]) << "\n";
+  float macs = 2.0 * float(M) * float(K) * float(N);
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+
+    int max_errors = 100;
+
+    if (VERIFY) {
+      std::cout << "Verifying against reference matmul ..." << std::endl;
+      auto vstart = std::chrono::system_clock::now();
+      std::vector<C_DATATYPE> output_ref0;
+      for (uint32_t i = 0; i < C_VOLUME; i++)
+        output_ref0.push_back(0);
+      // output_ref0.push_back(K);
+      matmul(AVec, BVec, output_ref0);
+
+      const float absTol = std::abs(0.1);
+      // const float absTol = std::abs(5);
+      for (int row = 0; row < M; row++) {
+        for (int col = 0; col < N; col++) {
+          if (std::abs((float)bufOut[row * N + col] -
+                       (float)output_ref0[row * N + col]) > absTol) {
+            errors++;
+            if (errors < max_errors) {
+              std::cout << "\nerror, row: " << row << " col: " << col
+                        << " expected "
+                        << std::to_string((float)output_ref0[row * N + col])
+                        << ", got "
+                        << std::to_string((float)bufOut[row * N + col]) << "\n";
+            }
           }
         }
       }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      std::cout << "Verify time: " << vtime << "secs." << std::endl;
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
     }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
-  float npu_time =
-      std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-          .count();
-  float macs = 2.0 * float(M) * float(K) * float(N);
+  std::cout << std::endl
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
 
   std::cout << std::endl
-            << "NPU matmul time: " << npu_time << "us." << std::endl;
-  std::cout << "NPU gflops: " << macs / (1000 * npu_time) << std::endl;
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
 
   if (VERIFY && !errors) {
     std::cout << "\nPASS!\n\n";
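For reference, the VERIFY path in both tests compares the NPU result against a host matmul with an absolute tolerance of 0.1. The helper below is a hedged stand-in for that check (the real matmul() helper, AVec/BVec, and C_DATATYPE are defined outside this diff): a plain row-major reference multiply plus the same absolute-tolerance error count.

#include <cmath>
#include <cstddef>
#include <vector>

// Assumed stand-in for the matmul(AVec, BVec, output_ref0) helper used above.
static void reference_matmul(const std::vector<float> &A,
                             const std::vector<float> &B, std::vector<float> &C,
                             int M, int K, int N) {
  for (int i = 0; i < M; i++)
    for (int j = 0; j < N; j++) {
      float acc = 0.0f;
      for (int k = 0; k < K; k++)
        acc += A[i * K + k] * B[k * N + j];
      C[i * N + j] = acc;
    }
}

// Same pass/fail criterion as the tests: count elements whose absolute
// difference from the reference exceeds absTol (0.1 in the patch).
static int count_errors(const std::vector<float> &out,
                        const std::vector<float> &ref, float absTol = 0.1f) {
  int errors = 0;
  for (std::size_t i = 0; i < out.size(); i++)
    if (std::abs(out[i] - ref[i]) > absTol)
      errors++;
  return errors;
}

The loose absolute tolerance accommodates bf16 rounding of the inputs relative to the float host reference.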