diff --git a/reference_designs/ipu-xrt/matrix_multiplication/aie2.py b/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
index 5e9dbeab9c..aa82582442 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
+++ b/reference_designs/ipu-xrt/matrix_multiplication/aie2.py
@@ -16,7 +16,7 @@ def my_matmul():
     K = 256
     N = 256
     m = 64
-    k = 32
+    k = 64
     n = 64
     r = 4
     s = 8
@@ -26,7 +26,7 @@ def my_matmul():
     vectorized = True
     enable_tracing = False
-    trace_size = 8192
+    trace_size = 16384
 
     A_sz_in_i32s = M * K * word_size_in // 4
     B_sz_in_i32s = K * N * word_size_in // 4
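The aie2.py change above doubles the per-core K tile (k = 32 -> 64) while keeping m = n = 64 for the M = K = N = 256 problem, and enlarges the trace buffer. The following is a minimal standalone sketch, not part of the patch, of the per-tile memory footprints this implies; it assumes bf16 inputs (2 bytes) and float accumulators (4 bytes), matching the 4x8x4 bf16 kernel the design calls below.

#include <cstdio>

int main() {
  constexpr int M = 256, K = 256, N = 256; // whole-problem sizes from aie2.py
  constexpr int m = 64, k = 64, n = 64;    // per-core tile sizes (k was 32)
  constexpr int in_bytes = 2;              // assumed: bf16 input elements
  constexpr int out_bytes = 4;             // assumed: float accumulator elements

  const int a_tile = m * k * in_bytes;     // one A tile in core memory
  const int b_tile = k * n * in_bytes;     // one B tile in core memory
  const int c_tile = m * n * out_bytes;    // one C accumulator tile
  std::printf("A tile %d B, B tile %d B, C tile %d B, total %d B\n", a_tile,
              b_tile, c_tile, a_tile + b_tile + c_tile);
  std::printf("K steps per C tile: %d, C tile grid: %d x %d\n", K / k, M / m,
              N / n);
  return 0;
}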
diff --git a/reference_designs/ipu-xrt/matrix_multiplication/mm.cc b/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
index ecb2d0f9b8..8b7732fdaf 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
+++ b/reference_designs/ipu-xrt/matrix_multiplication/mm.cc
@@ -64,7 +64,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
       for (unsigned j = 0; j < colB; j += 2)
         // chess_loop_range(2, ) {
-        chess_prepare_for_pipelining chess_loop_range(16, ) {
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
           const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
           const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
           const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
@@ -103,6 +103,7 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
           for (unsigned i = 1; i < colA; ++i)
             chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
              A0 = aie::load_v<MMUL::size_A>(pA1);
              pA1 += MMUL::size_A;
              A1 = aie::load_v<MMUL::size_A>(pA2);
@@ -133,8 +134,8 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
 
 template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
           unsigned colB, unsigned r, unsigned s, unsigned t>
-void matmul_vectorized_unroll(const T_in *__restrict pA,
-                              const T_in *__restrict pB, T_out *__restrict pC) {
+void matmul_vectorized_2x2(const T_in *__restrict pA, const T_in *__restrict pB,
+                           T_out *__restrict pC) {
   using MMUL = aie::mmul<r, s, t, T_in, T_in>;
 
   event0();
@@ -149,32 +150,42 @@ void matmul_vectorized_unroll(const T_in *__restrict pA,
   // is laid out contiguously in row-major). An element in row 0, column 4
   // would be stored at offset 16 in the same example.
 
-  for (unsigned z = 0; z < rowA; z += 2)
+  for (unsigned z = 0; z < rowA; z += 4)
     chess_loop_range(2, ) {
       T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
-      T_out *__restrict pC1b =
-          pC + (z * colB + 0) * MMUL::size_C + MMUL::size_C;
       T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
-      T_out *__restrict pC2b =
-          pC + ((z + 1) * colB + 0) * MMUL::size_C + MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C;
 
-      for (unsigned j = 0; j < colB; j += 2)
-        // chess_modulo_scheduling_budget_ratio(19000)
-        // chess_peel_pipelined_loop(1)
+      for (unsigned j = 0; j < colB; j += 4)
+        // chess_loop_range(2, ) {
         chess_prepare_for_pipelining chess_loop_range(8, ) {
           const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
           const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A;
+
           const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
           const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + (0 * colB + (j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + (0 * colB + (j + 3)) * MMUL::size_B;
 
           aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
           pA1 += MMUL::size_A;
           aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
           pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += MMUL::size_A;
           aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
           pB1 += MMUL::size_B * colB;
           aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
           pB2 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B * colB;
 
           // We modify the library documentation implementation to accumulate
           // in the C dimension, since this vectorized kernel will be called
@@ -182,72 +193,156 @@ void matmul_vectorized_unroll(const T_in *__restrict pA,
           aie::vector<T_out, MMUL::size_C> acc_C00 =
               aie::load_v<MMUL::size_C>(pC1);
           aie::vector<T_out, MMUL::size_C> acc_C01 =
-              aie::load_v<MMUL::size_C>(pC1b);
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C);
+
           aie::vector<T_out, MMUL::size_C> acc_C10 =
               aie::load_v<MMUL::size_C>(pC2);
           aie::vector<T_out, MMUL::size_C> acc_C11 =
-              aie::load_v<MMUL::size_C>(pC2b);
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C);
 
           MMUL C00(acc_C00);
           MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
           MMUL C10(acc_C10);
           MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
 
           C00.mac(A0, B0);
           C01.mac(A0, B1);
           C10.mac(A1, B0);
           C11.mac(A1, B1);
 
-          aie::vector<T_in, MMUL::size_A> A0b = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          aie::vector<T_in, MMUL::size_A> A1b = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          aie::vector<T_in, MMUL::size_B> B0b = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          aie::vector<T_in, MMUL::size_B> B1b = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
 
-          C00.mac(A0b, B0b);
-          C01.mac(A0b, B1b);
-          C10.mac(A1b, B0b);
-          C11.mac(A1b, B1b);
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
 
-          A0 = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          A1 = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          B0 = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          B1 = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
 
-          C00.mac(A0, B0);
-          C01.mac(A0, B1);
-          C10.mac(A1, B0);
-          C11.mac(A1, B1);
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += MMUL::size_A;
 
-          A0b = aie::load_v<MMUL::size_A>(pA1);
-          pA1 += MMUL::size_A;
-          A1b = aie::load_v<MMUL::size_A>(pA2);
-          pA2 += MMUL::size_A;
-          B0b = aie::load_v<MMUL::size_B>(pB1);
-          pB1 += MMUL::size_B * colB;
-          B1b = aie::load_v<MMUL::size_B>(pB2);
-          pB2 += MMUL::size_B * colB;
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B * colB;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B * colB;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B * colB;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B * colB;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
 
-          C00.mac(A0b, B0b);
-          C01.mac(A0b, B1b);
-          C10.mac(A1b, B0b);
-          C11.mac(A1b, B1b);
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
 
           aie::store_v(pC1, C00.template to_vector<T_out>());
-          pC1 += 2 * MMUL::size_C;
-          aie::store_v(pC1b, C01.template to_vector<T_out>());
-          pC1b += 2 * MMUL::size_C;
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+
           aie::store_v(pC2, C10.template to_vector<T_out>());
-          pC2 += 2 * MMUL::size_C;
-          aie::store_v(pC2b, C11.template to_vector<T_out>());
-          pC2b += 2 * MMUL::size_C;
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
         }
     }
@@ -281,8 +376,9 @@ void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
   static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
   static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
   static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
-  return matmul_vectorized_unroll<bfloat16, float, m / r, k / s, n / t, r, s,
-                                  t>(pA, pB, pC);
+  // return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
+  return matmul_vectorized_2x2<bfloat16, float, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
 }
 
 template <unsigned m, unsigned k, unsigned n>
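The mm.cc change above replaces the 2x-unrolled 2x2 micro-kernel with matmul_vectorized_2x2, which now keeps a 4x4 block of r*t output tiles in MMUL accumulators and accumulates along the colA (K) tile dimension before storing. Below is a scalar model of that same blocking and tile-major data layout, with plain loops standing in for aie::mmul and aie::load_v; the function name and index helpers are illustrative assumptions, not the AIE kernel itself.

// Scalar illustration only: same 4x4 register blocking over r*s / s*t / r*t
// tiles as matmul_vectorized_2x2, with tiles stored contiguously in
// tile-row-major order as described in the kernel comment.
template <typename Tin, typename Tout, unsigned rowA, unsigned colA,
          unsigned colB, unsigned r, unsigned s, unsigned t>
void matmul_blocked_model(const Tin *A, const Tin *B, Tout *C) {
  // Element (i, j) of tile (tr, tc), tiles laid out row-major, elements
  // row-major within each tile.
  auto a = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) {
    return A[(tr * colA + tc) * (r * s) + i * s + j];
  };
  auto b = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) {
    return B[(tr * colB + tc) * (s * t) + i * t + j];
  };
  auto c = [&](unsigned tr, unsigned tc, unsigned i, unsigned j) -> Tout & {
    return C[(tr * colB + tc) * (r * t) + i * t + j];
  };
  for (unsigned z = 0; z < rowA; z += 4)    // 4 tile-rows of C per iteration
    for (unsigned j = 0; j < colB; j += 4)  // 4 tile-cols of C per iteration
      for (unsigned zz = 0; zz < 4; ++zz)
        for (unsigned jj = 0; jj < 4; ++jj)
          for (unsigned i = 0; i < colA; ++i)  // accumulate along the K tiles
            for (unsigned ri = 0; ri < r; ++ri)
              for (unsigned ci = 0; ci < t; ++ci)
                for (unsigned si = 0; si < s; ++si)
                  c(z + zz, j + jj, ri, ci) +=
                      static_cast<Tout>(a(z + zz, i, ri, si)) *
                      static_cast<Tout>(b(i, j + jj, si, ci));
}

As in the kernel, C is read, accumulated into, and written back rather than zeroed, so the routine can be invoked repeatedly as the input is tiled further at a higher level.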
diff --git a/reference_designs/ipu-xrt/matrix_multiplication/test.cpp b/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
index 98768b7795..e2fbcfdef9 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
+++ b/reference_designs/ipu-xrt/matrix_multiplication/test.cpp
@@ -42,7 +42,7 @@ constexpr int C_SIZE = (C_VOLUME * sizeof(C_DATATYPE));
 
 constexpr bool VERIFY = true;
 constexpr bool ENABLE_TRACING = false;
-constexpr int TRACE_SIZE = 8192;
+constexpr int TRACE_SIZE = 16384;
 
 constexpr int OUT_SIZE = C_SIZE + (ENABLE_TRACING ? TRACE_SIZE : 0);
 
@@ -234,53 +234,87 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
 
-  auto start = std::chrono::system_clock::now();
-  auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_out);
-  run.wait();
-  auto stop = std::chrono::system_clock::now();
-  bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  // Reinterpret first C_VOLUME bytes of bufOut as our output C_DATATYPE C
-  // matrix
-  C_DATATYPE *COut = (C_DATATYPE *)bufOut;
+  unsigned num_iter = 10;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
   int errors = 0;
-  int max_errors = 100;
-
-  if (VERIFY) {
-    std::vector<C_DATATYPE> output_ref0;
-    for (uint32_t i = 0; i < C_VOLUME; i++)
-      output_ref0.push_back(0);
-    matmul(AVec, BVec, output_ref0);
-
-    const C_DATATYPE absTol = std::abs(0.1);
-    for (uint32_t i = 0; i < C_VOLUME; i++) {
-      if (std::abs((float)COut[i] - (float)output_ref0[i]) > absTol) {
-        errors++;
-        if (errors < max_errors) {
-          std::cout << "\nerror, id " << i << " expected "
-                    << std::to_string((float)output_ref0[i]) << ", got "
-                    << std::to_string((float)COut[i]) << "\n";
+  float macs = 2.0 * float(M) * float(K) * float(N);
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    auto start = std::chrono::system_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_out);
+    run.wait();
+    auto stop = std::chrono::system_clock::now();
+
+    bo_out.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    // Reinterpret first C_VOLUME bytes of bufOut as our output C_DATATYPE C
+    // matrix
+    C_DATATYPE *COut = (C_DATATYPE *)bufOut;
+
+    int max_errors = 100;
+
+    if (VERIFY) {
+      std::cout << "Verifying against reference matmul ..." << std::endl;
+      auto vstart = std::chrono::system_clock::now();
+      std::vector<C_DATATYPE> output_ref0;
+      for (uint32_t i = 0; i < C_VOLUME; i++)
+        output_ref0.push_back(0);
+      matmul(AVec, BVec, output_ref0);
+
+      const C_DATATYPE absTol = std::abs(0.1);
+      for (uint32_t i = 0; i < C_VOLUME; i++) {
+        if (std::abs((float)COut[i] - (float)output_ref0[i]) > absTol) {
+          errors++;
+          if (errors < max_errors) {
+            std::cout << "\nerror, id " << i << " expected "
+                      << std::to_string((float)output_ref0[i]) << ", got "
+                      << std::to_string((float)COut[i]) << "\n";
+          }
         }
       }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      std::cout << "Verify time: " << vtime << "secs." << std::endl;
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
     }
-  } else {
-    std::cout << "WARNING: matmul results not verified." << std::endl;
-  }
 
-  if (ENABLE_TRACING) {
-    write_out_trace(bufOut, vm["trace"].as<std::string>());
+    if (ENABLE_TRACING) {
+      write_out_trace(bufOut, vm["trace"].as<std::string>());
+    }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
   std::cout << std::endl
-            << "NPU matmul time: "
-            << std::chrono::duration_cast<std::chrono::milliseconds>(stop -
-                                                                     start)
-                   .count()
-            << "ms." << std::endl;
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
+
+  std::cout << std::endl
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
 
-  if (!errors) {
+  if (VERIFY && !errors) {
     std::cout << "\nPASS!\n\n";
     return 0;
   } else {
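Both test.cpp files now wrap the kernel launch in a 10-iteration loop and report average, minimum, and maximum time plus a GFLOP/s figure computed as 2*M*K*N / (1000 * time_us). Below is a self-contained sketch of just that bookkeeping; run_once() is an assumed placeholder for the XRT kernel(...) launch and run.wait(), which are not reproduced here.

#include <algorithm>
#include <chrono>
#include <iostream>
#include <limits>
#include <thread>

// Placeholder for the real kernel invocation (kernel(...) + run.wait()).
static void run_once() {
  std::this_thread::sleep_for(std::chrono::microseconds(500));
}

int main() {
  constexpr int M = 256, K = 256, N = 256;
  // One multiply-accumulate counted as 2 floating-point ops.
  const float macs = 2.0f * M * K * N;
  const unsigned num_iter = 10;
  float total = 0, tmin = std::numeric_limits<float>::max(), tmax = 0;

  for (unsigned i = 0; i < num_iter; i++) {
    auto start = std::chrono::high_resolution_clock::now();
    run_once();
    auto stop = std::chrono::high_resolution_clock::now();
    float us =
        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
            .count();
    total += us;
    tmin = std::min(tmin, us);
    tmax = std::max(tmax, us);
  }
  // With time in microseconds, GFLOP/s = macs / (1000 * time_us).
  std::cout << "Avg time: " << total / num_iter
            << " us, avg GFLOP/s: " << macs / (1000 * (total / num_iter))
            << std::endl;
  std::cout << "Min time: " << tmin << " us, max time: " << tmax << " us\n";
  return 0;
}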
diff --git a/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc b/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
index 76b5eb0cd5..8b7732fdaf 100755
--- a/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
+++ b/reference_designs/ipu-xrt/matrix_multiplication_array/mm.cc
@@ -132,6 +132,223 @@ void matmul_vectorized(const T_in *__restrict pA, const T_in *__restrict pB,
   event1();
 }
 
+template <typename T_in, typename T_out, unsigned rowA, unsigned colA,
+          unsigned colB, unsigned r, unsigned s, unsigned t>
+void matmul_vectorized_2x2(const T_in *__restrict pA, const T_in *__restrict pB,
+                           T_out *__restrict pC) {
+  using MMUL = aie::mmul<r, s, t, T_in, T_in>;
+
+  event0();
+
+  // For int16 (4x4x4), this implementation iterates over the output space in
+  // steps of 4x4 tiles; each iteration makes an r*s, s*t and r*t step in the
+  // input and output space, respectively. The data layout expected is such
+  // that each r*s/s*t/r*t tile's elements are laid out contiguously in
+  // row-major order, and tiles themselves are organized in row-major
+  // order. For example, for 4x4x4 tiles, this means that an element in
+  // row 1, column 0 would be stored at offset 4 (since the first 4x4 tile
+  // is laid out contiguously in row-major). An element in row 0, column 4
+  // would be stored at offset 16 in the same example.
+
+  for (unsigned z = 0; z < rowA; z += 4)
+    chess_loop_range(2, ) {
+      T_out *__restrict pC1 = pC + (z * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC2 = pC + ((z + 1) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC3 = pC + ((z + 2) * colB + 0) * MMUL::size_C;
+      T_out *__restrict pC4 = pC + ((z + 3) * colB + 0) * MMUL::size_C;
+
+      for (unsigned j = 0; j < colB; j += 4)
+        // chess_loop_range(2, ) {
+        chess_prepare_for_pipelining chess_loop_range(8, ) {
+          const T_in *__restrict pA1 = pA + (z * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA2 = pA + ((z + 1) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA3 = pA + ((z + 2) * colA + 0) * MMUL::size_A;
+          const T_in *__restrict pA4 = pA + ((z + 3) * colA + 0) * MMUL::size_A;
+
+          const T_in *__restrict pB1 = pB + (0 * colB + j) * MMUL::size_B;
+          const T_in *__restrict pB2 = pB + (0 * colB + (j + 1)) * MMUL::size_B;
+          const T_in *__restrict pB3 = pB + (0 * colB + (j + 2)) * MMUL::size_B;
+          const T_in *__restrict pB4 = pB + (0 * colB + (j + 3)) * MMUL::size_B;
+
+          aie::vector<T_in, MMUL::size_A> A0 = aie::load_v<MMUL::size_A>(pA1);
+          pA1 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A1 = aie::load_v<MMUL::size_A>(pA2);
+          pA2 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A2 = aie::load_v<MMUL::size_A>(pA3);
+          pA3 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_A> A3 = aie::load_v<MMUL::size_A>(pA4);
+          pA4 += MMUL::size_A;
+          aie::vector<T_in, MMUL::size_B> B0 = aie::load_v<MMUL::size_B>(pB1);
+          pB1 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B1 = aie::load_v<MMUL::size_B>(pB2);
+          pB2 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B2 = aie::load_v<MMUL::size_B>(pB3);
+          pB3 += MMUL::size_B * colB;
+          aie::vector<T_in, MMUL::size_B> B3 = aie::load_v<MMUL::size_B>(pB4);
+          pB4 += MMUL::size_B * colB;
+
+          // We modify the library documentation implementation to accumulate
+          // in the C dimension, since this vectorized kernel will be called
+          // multiple times as we further tile the input at a higher level.
+          aie::vector<T_out, MMUL::size_C> acc_C00 =
+              aie::load_v<MMUL::size_C>(pC1);
+          aie::vector<T_out, MMUL::size_C> acc_C01 =
+              aie::load_v<MMUL::size_C>(pC1 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C02 =
+              aie::load_v<MMUL::size_C>(pC1 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C03 =
+              aie::load_v<MMUL::size_C>(pC1 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C10 =
+              aie::load_v<MMUL::size_C>(pC2);
+          aie::vector<T_out, MMUL::size_C> acc_C11 =
+              aie::load_v<MMUL::size_C>(pC2 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C12 =
+              aie::load_v<MMUL::size_C>(pC2 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C13 =
+              aie::load_v<MMUL::size_C>(pC2 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C20 =
+              aie::load_v<MMUL::size_C>(pC3);
+          aie::vector<T_out, MMUL::size_C> acc_C21 =
+              aie::load_v<MMUL::size_C>(pC3 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C22 =
+              aie::load_v<MMUL::size_C>(pC3 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C23 =
+              aie::load_v<MMUL::size_C>(pC3 + 3 * MMUL::size_C);
+
+          aie::vector<T_out, MMUL::size_C> acc_C30 =
+              aie::load_v<MMUL::size_C>(pC4);
+          aie::vector<T_out, MMUL::size_C> acc_C31 =
+              aie::load_v<MMUL::size_C>(pC4 + MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C32 =
+              aie::load_v<MMUL::size_C>(pC4 + 2 * MMUL::size_C);
+          aie::vector<T_out, MMUL::size_C> acc_C33 =
+              aie::load_v<MMUL::size_C>(pC4 + 3 * MMUL::size_C);
+
+          MMUL C00(acc_C00);
+          MMUL C01(acc_C01);
+          MMUL C02(acc_C02);
+          MMUL C03(acc_C03);
+
+          MMUL C10(acc_C10);
+          MMUL C11(acc_C11);
+          MMUL C12(acc_C12);
+          MMUL C13(acc_C13);
+
+          MMUL C20(acc_C20);
+          MMUL C21(acc_C21);
+          MMUL C22(acc_C22);
+          MMUL C23(acc_C23);
+
+          MMUL C30(acc_C30);
+          MMUL C31(acc_C31);
+          MMUL C32(acc_C32);
+          MMUL C33(acc_C33);
+
+          C00.mac(A0, B0);
+          C01.mac(A0, B1);
+          C10.mac(A1, B0);
+          C11.mac(A1, B1);
+
+          C02.mac(A0, B2);
+          C03.mac(A0, B3);
+          C12.mac(A1, B2);
+          C13.mac(A1, B3);
+
+          C20.mac(A2, B0);
+          C21.mac(A2, B1);
+          C30.mac(A3, B0);
+          C31.mac(A3, B1);
+
+          C22.mac(A2, B2);
+          C23.mac(A2, B3);
+          C32.mac(A3, B2);
+          C33.mac(A3, B3);
+
+          for (unsigned i = 1; i < colA; ++i)
+            chess_prepare_for_pipelining chess_loop_range(7, ) {
+              // chess_unroll_loop() {
+              A0 = aie::load_v<MMUL::size_A>(pA1);
+              pA1 += MMUL::size_A;
+              A1 = aie::load_v<MMUL::size_A>(pA2);
+              pA2 += MMUL::size_A;
+              A2 = aie::load_v<MMUL::size_A>(pA3);
+              pA3 += MMUL::size_A;
+              A3 = aie::load_v<MMUL::size_A>(pA4);
+              pA4 += MMUL::size_A;
+
+              B0 = aie::load_v<MMUL::size_B>(pB1);
+              pB1 += MMUL::size_B * colB;
+              B1 = aie::load_v<MMUL::size_B>(pB2);
+              pB2 += MMUL::size_B * colB;
+              B2 = aie::load_v<MMUL::size_B>(pB3);
+              pB3 += MMUL::size_B * colB;
+              B3 = aie::load_v<MMUL::size_B>(pB4);
+              pB4 += MMUL::size_B * colB;
+
+              C00.mac(A0, B0);
+              C01.mac(A0, B1);
+              C10.mac(A1, B0);
+              C11.mac(A1, B1);
+
+              C02.mac(A0, B2);
+              C03.mac(A0, B3);
+              C12.mac(A1, B2);
+              C13.mac(A1, B3);
+
+              C20.mac(A2, B0);
+              C21.mac(A2, B1);
+              C30.mac(A3, B0);
+              C31.mac(A3, B1);
+
+              C22.mac(A2, B2);
+              C23.mac(A2, B3);
+              C32.mac(A3, B2);
+              C33.mac(A3, B3);
+            }
+
+          aie::store_v(pC1, C00.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C01.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C02.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+          aie::store_v(pC1, C03.template to_vector<T_out>());
+          pC1 += MMUL::size_C;
+
+          aie::store_v(pC2, C10.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C11.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C12.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+          aie::store_v(pC2, C13.template to_vector<T_out>());
+          pC2 += MMUL::size_C;
+
+          aie::store_v(pC3, C20.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C21.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C22.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+          aie::store_v(pC3, C23.template to_vector<T_out>());
+          pC3 += MMUL::size_C;
+
+          aie::store_v(pC4, C30.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C31.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C32.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+          aie::store_v(pC4, C33.template to_vector<T_out>());
+          pC4 += MMUL::size_C;
+        }
+    }
+
+  event1();
+}
+
 template <unsigned m, unsigned k, unsigned n>
 void matmul_vectorized_4x4x4_i16_i16(const int16 *__restrict pA,
                                      const int16 *__restrict pB,
@@ -159,8 +376,9 @@ void matmul_vectorized_4x8x4_bf16_bf16(const bfloat16 *__restrict pA,
   static_assert(m % (2 * r) == 0 && m / (2 * r) > 0);
   static_assert(k % (2 * s) == 0 && k / (2 * s) > 0);
   static_assert(n % (2 * t) == 0 && n / (2 * t) > 0);
-  return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
-      pA, pB, pC);
+  // return matmul_vectorized<bfloat16, float, m / r, k / s, n / t, r, s, t>(
+  return matmul_vectorized_2x2<bfloat16, float, m / r, k / s, n / t, r, s,
+                               t>(pA, pB, pC);
 }
 
 template <unsigned m, unsigned k, unsigned n>
diff --git a/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp b/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
index 6743e27613..1c3c58a794 100644
--- a/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
+++ b/reference_designs/ipu-xrt/matrix_multiplication_array/test.cpp
@@ -231,52 +231,86 @@ int main(int argc, const char *argv[]) {
   if (verbosity >= 1)
     std::cout << "Running Kernel.\n";
 
-  auto start = std::chrono::high_resolution_clock::now();
-  auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
-  run.wait();
-  auto stop = std::chrono::high_resolution_clock::now();
-  bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
-
-  C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+  int num_iter = 10;
+  float npu_time_total = 0;
+  float npu_time_min = 9999999;
+  float npu_time_max = 0;
 
   int errors = 0;
-  int max_errors = 100;
-
-  if (VERIFY) {
-    std::vector<C_DATATYPE> output_ref0;
-    for (uint32_t i = 0; i < C_VOLUME; i++)
-      output_ref0.push_back(0);
-    // output_ref0.push_back(K);
-    matmul(AVec, BVec, output_ref0);
-
-    const float absTol = std::abs(0.1);
-    // const float absTol = std::abs(5);
-    for (int row = 0; row < M; row++) {
-      for (int col = 0; col < N; col++) {
-        if (std::abs((float)bufOut[row * N + col] -
-                     (float)output_ref0[row * N + col]) > absTol) {
-          errors++;
-          if (errors < max_errors) {
-            std::cout << "\nerror, row: " << row << " col: " << col
-                      << " expected "
-                      << std::to_string((float)output_ref0[row * N + col])
-                      << ", got "
-                      << std::to_string((float)bufOut[row * N + col]) << "\n";
+  float macs = 2.0 * float(M) * float(K) * float(N);
+
+  for (unsigned iter = 0; iter < num_iter; iter++) {
+
+    auto start = std::chrono::high_resolution_clock::now();
+    auto run = kernel(bo_instr, instr_v.size(), bo_a, bo_b, bo_c);
+    run.wait();
+    auto stop = std::chrono::high_resolution_clock::now();
+
+    bo_c.sync(XCL_BO_SYNC_BO_FROM_DEVICE);
+
+    C_DATATYPE *bufOut = bo_c.map<C_DATATYPE *>();
+
+    int max_errors = 100;
+
+    if (VERIFY) {
+      std::cout << "Verifying against reference matmul ..." << std::endl;
+      auto vstart = std::chrono::system_clock::now();
+      std::vector<C_DATATYPE> output_ref0;
+      for (uint32_t i = 0; i < C_VOLUME; i++)
+        output_ref0.push_back(0);
+      // output_ref0.push_back(K);
+      matmul(AVec, BVec, output_ref0);
+
+      const float absTol = std::abs(0.1);
+      // const float absTol = std::abs(5);
+      for (int row = 0; row < M; row++) {
+        for (int col = 0; col < N; col++) {
+          if (std::abs((float)bufOut[row * N + col] -
+                       (float)output_ref0[row * N + col]) > absTol) {
+            errors++;
+            if (errors < max_errors) {
+              std::cout << "\nerror, row: " << row << " col: " << col
+                        << " expected "
+                        << std::to_string((float)output_ref0[row * N + col])
+                        << ", got "
+                        << std::to_string((float)bufOut[row * N + col]) << "\n";
+            }
           }
         }
       }
+      auto vstop = std::chrono::system_clock::now();
+      float vtime =
+          std::chrono::duration_cast<std::chrono::seconds>(vstop - vstart)
+              .count();
+      std::cout << "Verify time: " << vtime << "secs." << std::endl;
+    } else {
+      if (verbosity >= 1)
+        std::cout << "WARNING: matmul results not verified." << std::endl;
     }
+
+    float npu_time =
+        std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
+            .count();
+
+    npu_time_total += npu_time;
+    npu_time_min = (npu_time < npu_time_min) ? npu_time : npu_time_min;
+    npu_time_max = (npu_time > npu_time_max) ? npu_time : npu_time_max;
   }
 
-  float npu_time =
-      std::chrono::duration_cast<std::chrono::microseconds>(stop - start)
-          .count();
-  float macs = 2.0 * float(M) * float(K) * float(N);
+  std::cout << std::endl
+            << "Avg NPU matmul time: " << npu_time_total / num_iter << "us."
+            << std::endl;
+  std::cout << "Avg NPU gflops: " << macs / (1000 * npu_time_total / num_iter)
+            << std::endl;
+
+  std::cout << std::endl
+            << "Min NPU matmul time: " << npu_time_min << "us." << std::endl;
+  std::cout << "Min NPU gflops: " << macs / (1000 * npu_time_min) << std::endl;
 
   std::cout << std::endl
-            << "NPU matmul time: " << npu_time << "us." << std::endl;
-  std::cout << "NPU gflops: " << macs / (1000 * npu_time) << std::endl;
+            << "Max NPU matmul time: " << npu_time_max << "us." << std::endl;
+  std::cout << "Max NPU gflops: " << macs / (1000 * npu_time_max) << std::endl;
 
   if (VERIFY && !errors) {
     std::cout << "\nPASS!\n\n";
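For reference, the VERIFY path in both tests compares the NPU result against a host matmul with an absolute tolerance of 0.1. The helper below is a hedged stand-in for that check (the real matmul() helper, AVec/BVec, and C_DATATYPE are defined outside this diff): a plain row-major reference multiply plus the same absolute-tolerance error count.

#include <cmath>
#include <cstddef>
#include <vector>

// Assumed stand-in for the matmul(AVec, BVec, output_ref0) helper used above.
static void reference_matmul(const std::vector<float> &A,
                             const std::vector<float> &B, std::vector<float> &C,
                             int M, int K, int N) {
  for (int i = 0; i < M; i++)
    for (int j = 0; j < N; j++) {
      float acc = 0.0f;
      for (int k = 0; k < K; k++)
        acc += A[i * K + k] * B[k * N + j];
      C[i * N + j] = acc;
    }
}

// Same pass/fail criterion as the tests: count elements whose absolute
// difference from the reference exceeds absTol (0.1 in the patch).
static int count_errors(const std::vector<float> &out,
                        const std::vector<float> &ref, float absTol = 0.1f) {
  int errors = 0;
  for (std::size_t i = 0; i < out.size(); i++)
    if (std::abs(out[i] - ref[i]) > absTol)
      errors++;
  return errors;
}

The loose absolute tolerance accommodates bf16 rounding of the inputs relative to the float host reference.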