diff --git a/include/linalg/mtx.hpp b/include/linalg/mtx.hpp
index 46046a2f7..b30976de2 100644
--- a/include/linalg/mtx.hpp
+++ b/include/linalg/mtx.hpp
@@ -187,6 +187,84 @@ class Mtx {
                  const std::vector<std::vector<float>> &B,
                  std::vector<std::vector<float>> &C);
 
+    /**
+     * @brief Perform matrix subtraction using Intel intrinsics, accepts
+     * flat arrays of 8 bit ints
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @note Matrices must be of at least size 8x8
+     * @overload
+     */
+    void
+    mtx_sub(const int8_t *A, const int8_t *B, int8_t *C, int rows, int cols);
+
+    /**
+     * @brief Perform matrix subtraction using Intel intrinsics, accepts
+     * flat arrays of 16 bit ints
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @note Matrices must be of at least size 8x8
+     * @overload
+     */
+    void
+    mtx_sub(const int16_t *A, const int16_t *B, int16_t *C, int rows, int cols);
+    /**
+     * @brief Perform matrix subtraction using Intel intrinsics, accepts
+     * flat arrays of type int
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @note Matrices must be of at least size 8x8
+     * @overload
+     */
+    void mtx_sub(const int *A, const int *B, int *C, int rows, int cols);
+
+    /**
+     * @brief Perform matrix subtraction using Intel intrinsics, accepts
+     * flat arrays of type double
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @note Matrices must be of at least size 8x8
+     * @overload
+     */
+    void
+    mtx_sub(const double *A, const double *B, double *C, int rows, int cols);
+
+    /**
+     * @brief Perform matrix subtraction using Intel intrinsics, accepts
+     * flat arrays of type float
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @note Matrices must be of at least size 8x8
+     * @overload
+     */
+    void mtx_sub(const float *A, const float *B, float *C, int rows, int cols);
+
+    void mtx_mult(const int8_t *A,
+                  const int8_t *B,
+                  int8_t *C,
+                  int rows_a,
+                  int cols_a,
+                  int cols_b);
+
+    void mtx_mult(const int16_t *A,
+                  const int16_t *B,
+                  int16_t *C,
+                  int rows_a,
+                  int cols_a,
+                  int cols_b);
+
+    void mtx_mult(const int *A,
+                  const int *B,
+                  int *C,
+                  int rows_a,
+                  int cols_a,
+                  int cols_b);
+
     /**
      * @brief Perform matrix subtraction using Intel intrinsics, accepts
      * vectors of type int
@@ -373,7 +451,52 @@ class Mtx {
      * @overload
      */
     template <typename T>
-    void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols);
+    void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols) {
+        // MTX A AND B MUST BE SAME SIZE
+        for (int i = 0; i < rows; ++i) {
+            for (int j = 0; j < cols; ++j) {
+                // perform matrix addition
+                C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
+            }
+        }
+    }
+    /**
+     * @brief Perform matrix subtraction on two matrices as flat arrays
+     * @param A Input matrix A
+     * @param B Input matrix B
+     * @param C Output matrix C
+     * @param rows Number of rows
+     * @param cols Number of columns
+     * @overload
+     */
+    template <typename T>
+    void std_mtx_sub(const T *A, const T *B, T *C, int rows, int cols) {
+        // MTX A AND B MUST BE SAME SIZE
+        for (int i = 0; i < rows; ++i) {
+            for (int j = 0; j < cols; ++j) {
+                // perform matrix addition
+                C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
+            }
+        }
+    }
+
+    template <typename T>
+    void std_mtx_mult(const T *A,
+                      const T *B,
+                      T *C,
+                      int rowsA,
+                      int colsA,
+                      int colsB) {
+        for (int i = 0; i < rowsA; ++i) {
+            for (int j = 0; j < colsB; ++j) {
+                T sum = 0; // Use T type for sum
+                for (int k = 0; k < colsA; ++k) {
+                    sum += A[i * colsA + k] * B[k * colsB + j];
+                }
+                C[i * colsB + j] = sum;
+            }
+        }
+    }
 
     /**
      * @brief Perform matrix addition on two matrices as flat vectors
diff --git a/modules/linalg/mtx_avx2_arr_i16.cpp b/modules/linalg/mtx_avx2_arr_i16.cpp
index 7237e53e1..62d1399f2 100644
--- a/modules/linalg/mtx_avx2_arr_i16.cpp
+++ b/modules/linalg/mtx_avx2_arr_i16.cpp
@@ -85,6 +85,70 @@ void gpmp::linalg::Mtx::mtx_add(const int16_t *A,
     }
 }
 
+void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
+                                const int16_t *B,
+                                int16_t *C,
+                                int rows,
+                                int cols) {
+    for (int i = 0; i < rows; ++i) {
+        int j = 0;
+        for (; j < cols - 15; j += 16) {
+            __m256i a = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
+            __m256i b = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&B[i * cols + j]));
+            __m256i c = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&C[i * cols + j]));
+
+            // Perform vectorized subtraction and accumulate the result
+            c = _mm256_sub_epi16(a, b);
+
+            // Store the result back to the C matrix
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
+                                c);
+        }
+
+        for (; j < cols; ++j) {
+            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
+        }
+    }
+}
+
+void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
+              const int16_t *B,
+              int16_t *C,
+              int rows_a,
+              int cols_a,
+              int cols_b) {
+    for (int i = 0; i < rows_a; ++i) {
+        for (int j = 0; j < cols_b; j += 16) {
+            __m256i c = _mm256_setzero_si256();
+            
+            for (int k = 0; k < cols_a; ++k) {
+                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
+                __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&B[k * cols_b + j]));
+                
+                __m256i prod = _mm256_mullo_epi16(a, b);
+                c = _mm256_add_epi16(c, prod);
+            }
+            
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&C[i * cols_b + j]), c); 
+        }
+        
+        // Handle remaining elements
+        for (int j = cols_b - cols_b % 16; j < cols_b; ++j) {
+            int sum = 0;
+            
+            for (int k = 0; k < cols_a; ++k) {
+                sum += A[i * cols_a + k] * B[k * cols_b + j];
+            }
+            
+            C[i * cols_b + j] = sum;
+        }
+    }
+}
+
+
 #endif
 
 // x86
diff --git a/modules/linalg/mtx_avx2_arr_i32.cpp b/modules/linalg/mtx_avx2_arr_i32.cpp
index 857542748..82697a503 100644
--- a/modules/linalg/mtx_avx2_arr_i32.cpp
+++ b/modules/linalg/mtx_avx2_arr_i32.cpp
@@ -89,12 +89,50 @@ void gpmp::linalg::Mtx::mtx_add(const int *A,
                 C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
             }
         }
-    } else {
+    }
+
+    else {
         // use standard matrix addition
         std_mtx_add(A, B, C, rows, cols);
     }
 }
 
+void gpmp::linalg::Mtx::mtx_mult(const int *A,
+                                 const int *B,
+                                 int *C,
+                                 int rows_a,
+                                 int cols_a,
+                                 int cols_b) {
+    for (int i = 0; i < rows_a; ++i) {
+        for (int j = 0; j < cols_b; j += 8) {
+            __m256i c = _mm256_setzero_si256();
+
+            for (int k = 0; k < cols_a; ++k) {
+                __m256i a = _mm256_set1_epi32(A[i * cols_a + k]);
+                __m256i b = _mm256_loadu_si256(
+                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));
+
+                __m256i prod = _mm256_mullo_epi32(a, b);
+                c = _mm256_add_epi32(c, prod);
+            }
+
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
+                                c);
+        }
+
+        // Handle remaining elements
+        for (int j = cols_b - cols_b % 8; j < cols_b; ++j) {
+            int sum = 0;
+
+            for (int k = 0; k < cols_a; ++k) {
+                sum += A[i * cols_a + k] * B[k * cols_b + j];
+            }
+
+            C[i * cols_b + j] = sum;
+        }
+    }
+}
+
 #endif
 
 // x86
diff --git a/modules/linalg/mtx_avx2_arr_i8.cpp b/modules/linalg/mtx_avx2_arr_i8.cpp
index c6ab8d6a2..54b013ced 100644
--- a/modules/linalg/mtx_avx2_arr_i8.cpp
+++ b/modules/linalg/mtx_avx2_arr_i8.cpp
@@ -34,6 +34,7 @@
 #include <cassert>
 #include <cstddef>
 #include <cstdint>
+#include <cstring>
 #include <iostream>
 #include <vector>
 
@@ -85,6 +86,75 @@ void gpmp::linalg::Mtx::mtx_add(const int8_t *A,
     }
 }
 
+void gpmp::linalg::Mtx::mtx_sub(const int8_t *A,
+                                const int8_t *B,
+                                int8_t *C,
+                                int rows,
+                                int cols) {
+    for (int i = 0; i < rows; ++i) {
+        int j = 0;
+        for (; j < cols - 31; j += 32) {
+            __m256i a = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
+            __m256i b = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&B[i * cols + j]));
+            __m256i c = _mm256_loadu_si256(
+                reinterpret_cast<const __m256i *>(&C[i * cols + j]));
+
+            // Perform vectorized subtraction and accumulate the result
+            c = _mm256_sub_epi8(a, b);
+
+            // Store the result back to the C matrix
+            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
+                                c);
+        }
+
+        for (; j < cols; ++j) {
+            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
+        }
+    }
+}
+
+void gpmp::linalg::Mtx::mtx_mult(const int8_t *A,
+              const int8_t *B,
+              int8_t *C,
+              int rows_a,
+              int cols_a,
+              int cols_b) {
+
+    for (int i = 0; i < rows_a; ++i) {
+        for (int j = 0; j < cols_b; j += 32) {
+            __m256i c = _mm256_setzero_si256(); 
+            
+            for (int k = 0; k < cols_a; ++k) {
+                __m256i a = _mm256_set1_epi8(A[i * cols_a + k]); 
+                __m256i b = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(&B[k * cols_b + j])); 
+                
+                __m256i prod = _mm256_maddubs_epi16(a, b); 
+                c = _mm256_add_epi16(c, prod); 
+            }
+            
+            c = _mm256_srai_epi16(c, 8);
+            c = _mm256_packs_epi16(c, _mm256_setzero_si256()); 
+            
+            _mm256_storeu_si256(reinterpret_cast<__m256i*>(&C[i * cols_b + j]), c); 
+        }
+        
+        // Handle remaining elements
+        for (int j = cols_b - cols_b % 32; j < cols_b; ++j) {
+            int sum = 0;
+            
+            for (int k = 0; k < cols_a; ++k) {
+                sum += A[i * cols_a + k] * B[k * cols_b + j];
+            }
+            
+            C[i * cols_b + j] = sum;
+        }
+    }
+
+}
+
+
 #endif
 
 // x86
diff --git a/modules/linalg/mtx_naive.cpp b/modules/linalg/mtx_naive.cpp
index e1a12ca3b..7ac11afad 100644
--- a/modules/linalg/mtx_naive.cpp
+++ b/modules/linalg/mtx_naive.cpp
@@ -42,53 +42,6 @@
  * Standard/Naive Matrix Operations on Arrays
  *
  ************************************************************************/
-// naive matrix addition algorithm on arrays
-template <typename T>
-void gpmp::linalg::Mtx::std_mtx_add(const T *A,
-                                    const T *B,
-                                    T *C,
-                                    int rows,
-                                    int cols) {
-    // MTX A AND B MUST BE SAME SIZE
-    for (int i = 0; i < rows; ++i) {
-        for (int j = 0; j < cols; ++j) {
-            // perform matrix addition
-            C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
-        }
-    }
-}
-
-// instantiations for types accepted by templated std_mtx_add function for
-// flat arrays
-template void gpmp::linalg::Mtx::std_mtx_add(const int8_t *A,
-                                             const int8_t *B,
-                                             int8_t *C,
-                                             int rows,
-                                             int cols);
-
-template void gpmp::linalg::Mtx::std_mtx_add(const int16_t *A,
-                                             const int16_t *B,
-                                             int16_t *C,
-                                             int rows,
-                                             int cols);
-
-template void gpmp::linalg::Mtx::std_mtx_add(const int *A,
-                                             const int *B,
-                                             int *C,
-                                             int rows,
-                                             int cols);
-
-template void gpmp::linalg::Mtx::std_mtx_add(const double *A,
-                                             const double *B,
-                                             double *C,
-                                             int rows,
-                                             int cols);
-
-template void gpmp::linalg::Mtx::std_mtx_add(const float *A,
-                                             const float *B,
-                                             float *C,
-                                             int rows,
-                                             int cols);
 
 /************************************************************************
  *
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index a157ee905..a7e05395e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -55,8 +55,10 @@ set(CPP_TEST_FILES
     linalg/t_vector_vector_f64.cpp
     linalg/t_vector_vector_i8.cpp
     linalg/t_vector_vector_i32.cpp
+
     linalg/t_vector_vector_naive.cpp
-    
+    linalg/t_matrix_arr_naive.cpp
+
     nt/t_cipher.cpp
     nt/t_rc4.cpp
     nt/t_primes.cpp
@@ -157,9 +159,9 @@ if(LCOV)
     add_custom_command(TARGET RUN_CPP_TESTS POST_BUILD
         COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "[==========]"
         COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --cyan --bold "[Generating Coverage Reports]"
-        COMMAND lcov --directory .. --capture --output-file lcov.info --rc geninfo_unexecuted_blocks=1 #--ignore-errors mismatch,mismatch
+        COMMAND lcov --directory .. --capture --output-file lcov.info --rc geninfo_unexecuted_blocks=1 --ignore-errors mismatch,mismatch
         COMMAND lcov --remove lcov.info "*c++*" "*11*" "*/googletest/*" "*/gtest*" "/usr/*" "/src" "/build" -o lcov.info --ignore-errors unused
-        COMMAND mv lcov.info ../../.coverage
+        #COMMAND mv lcov.info ../../.coverage
     )
     endif()
 else()
diff --git a/tests/linalg/t_matrix.hpp b/tests/linalg/t_matrix.hpp
index b1aa69a65..fb68293e0 100644
--- a/tests/linalg/t_matrix.hpp
+++ b/tests/linalg/t_matrix.hpp
@@ -65,6 +65,7 @@ bool mtx_verif(const T *A, const T *B, int rows, int cols) {
     }
     return true;
 }
+
 template <typename T> void print_matrix(const T *matrix, int rows, int cols) {
     for (int i = 0; i < rows; ++i) {
         for (int j = 0; j < cols; ++j) {
diff --git a/tests/linalg/t_matrix_arr_i16.cpp b/tests/linalg/t_matrix_arr_i16.cpp
index 9299ae254..1292a387b 100644
--- a/tests/linalg/t_matrix_arr_i16.cpp
+++ b/tests/linalg/t_matrix_arr_i16.cpp
@@ -143,4 +143,110 @@ TEST(MatrixArrayTestI16, AdditionPerformanceComparison) {
     delete[] result;
 }
 
+TEST(MatrixArrayTestI16, SubtractionPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int16_t *A = new int16_t[mtx_size * mtx_size];
+    int16_t *B = new int16_t[mtx_size * mtx_size];
+    int16_t *expected = new int16_t[mtx_size * mtx_size];
+    int16_t *result = new int16_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int16_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int16_t>(distribution(gen));
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_sub(A, B, result, mtx_size, mtx_size);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Subtraction Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Subtraction Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
+TEST(MatrixArrayTestI16, MultiplicationPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int16_t *A = new int16_t[mtx_size * mtx_size];
+    int16_t *B = new int16_t[mtx_size * mtx_size];
+    int16_t *expected = new int16_t[mtx_size * mtx_size];
+    int16_t *result = new int16_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 15);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int16_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int16_t>(distribution(gen));
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Multiplication Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Multiplication Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
 } // namespace
diff --git a/tests/linalg/t_matrix_arr_i32.cpp b/tests/linalg/t_matrix_arr_i32.cpp
index e489df4f6..6a9d01307 100644
--- a/tests/linalg/t_matrix_arr_i32.cpp
+++ b/tests/linalg/t_matrix_arr_i32.cpp
@@ -172,4 +172,57 @@ TEST(MatrixArrayTestI32, AdditionPerformanceComparison) {
     delete[] expected;
     delete[] result;
 }
+
+TEST(MatrixArrayTestI32, MultiplicationPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int *A = new int[mtx_size * mtx_size];
+    int *B = new int[mtx_size * mtx_size];
+    int *expected = new int[mtx_size * mtx_size];
+    int *result = new int[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = distribution(gen);
+            B[i * mtx_size + j] = distribution(gen);
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Multiplication Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Multiplication Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
 } // namespace
diff --git a/tests/linalg/t_matrix_arr_i8.cpp b/tests/linalg/t_matrix_arr_i8.cpp
index 54a04e14a..782f248ea 100644
--- a/tests/linalg/t_matrix_arr_i8.cpp
+++ b/tests/linalg/t_matrix_arr_i8.cpp
@@ -142,4 +142,185 @@ TEST(MatrixArrayTestI8, AdditionPerformanceComparison) {
     delete[] expected;
     delete[] result;
 }
+
+TEST(MatrixArrayTestI8, SubtractionComparisonSmall) {
+    int mtx_size = 400;
+    // define input matrices A and B
+    int8_t *A = new int8_t[mtx_size * mtx_size];
+    int8_t *B = new int8_t[mtx_size * mtx_size];
+    int8_t *expected = new int8_t[mtx_size * mtx_size];
+    int8_t *result = new int8_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+    // expected result using the naive implementation
+    mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size);
+
+    // result using the intrinsics implementation
+    mtx.mtx_sub(A, B, result, mtx_size, mtx_size);
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
+TEST(MatrixArrayTestI8, SubtractionComparisonLarge) {
+    int mtx_size = 1024;
+    // define input matrices A and B
+    int8_t *A = new int8_t[mtx_size * mtx_size];
+    int8_t *B = new int8_t[mtx_size * mtx_size];
+    int8_t *expected = new int8_t[mtx_size * mtx_size];
+    int8_t *result = new int8_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+    // expected result using the naive implementation
+    mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size);
+
+    // result using the intrinsics implementation
+    mtx.mtx_sub(A, B, result, mtx_size, mtx_size);
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
+TEST(MatrixArrayTestI8, SubtractionPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int8_t *A = new int8_t[mtx_size * mtx_size];
+    int8_t *B = new int8_t[mtx_size * mtx_size];
+    int8_t *expected = new int8_t[mtx_size * mtx_size];
+    int8_t *result = new int8_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+
+    mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size);
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_sub(A, B, result, mtx_size, mtx_size);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Subtraction Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Subtraction Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
+/*
+TEST(MatrixArrayTestI8, MultiplicationPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int8_t *A = new int8_t[mtx_size * mtx_size];
+    int8_t *B = new int8_t[mtx_size * mtx_size];
+    int8_t *expected = new int8_t[mtx_size * mtx_size];
+    int8_t *result = new int8_t[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd; 
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 5);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+            B[i * mtx_size + j] = static_cast<int8_t>(distribution(gen));
+        }
+    }   
+
+    gpmp::linalg::Mtx mtx;
+
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Multiplication Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Multiplication Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+*/
+
 } // namespace
diff --git a/tests/linalg/t_matrix_arr_naive.cpp b/tests/linalg/t_matrix_arr_naive.cpp
new file mode 100644
index 000000000..7475366ea
--- /dev/null
+++ b/tests/linalg/t_matrix_arr_naive.cpp
@@ -0,0 +1,43 @@
+#include "../../include/linalg/mtx.hpp"
+#include "../../include/linalg/mtx_tmpl.hpp"
+#include "t_matrix.hpp"
+#include <chrono>
+#include <cmath>
+#include <cstdint>
+#include <gtest/gtest.h>
+#include <iostream>
+#include <limits.h>
+#include <string>
+#include <vector>
+
+using namespace gpmp;
+#define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
+
+namespace {
+TEST(MatrixArrayTest, BasicTest) {
+    gpmp::linalg::Mtx mtx;
+    const int rowsA = 2;
+    const int colsA = 3;
+    const int colsB = 2;
+
+    // Define matrices A, B, and expected result C
+    int A[rowsA * colsA] = {1, 2, 3, 4, 5, 6};
+    int B[colsA * colsB] = {7, 8, 9, 10, 11, 12};
+
+    int expectedC[rowsA * colsB] = {58, 64, 139, 154};
+
+    // Initialize result matrix C
+    int C[rowsA * colsB];
+
+    // Perform matrix multiplication
+    mtx.std_mtx_mult(A, B, C, rowsA, colsA, colsB);
+
+    // Check if result matrix C matches expectedC
+    for (int i = 0; i < rowsA; ++i) {
+        for (int j = 0; j < colsB; ++j) {
+            EXPECT_EQ(C[i * colsB + j], expectedC[i * colsB + j]);
+        }
+    }
+}
+
+} // namespace
diff --git a/tests/linalg/t_matrix_vector_i32.cpp b/tests/linalg/t_matrix_vector_i32.cpp
index 79a5032cd..85c3373e3 100644
--- a/tests/linalg/t_matrix_vector_i32.cpp
+++ b/tests/linalg/t_matrix_vector_i32.cpp
@@ -175,4 +175,56 @@ TEST(MatrixVectorTestI32, AdditionPerformanceComparison) {
     // compare the results
     ASSERT_TRUE(mtx_verif(expected, result));
 }
+
+TEST(MatrixVectorTestI32, MultiplicationPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+
+    // define input matrices A and B
+    std::vector<std::vector<int>> A(mtx_size, std::vector<int>(mtx_size));
+    std::vector<std::vector<int>> B(mtx_size, std::vector<int>(mtx_size));
+    std::vector<std::vector<int>> expected(mtx_size,
+                                           std::vector<int>(mtx_size));
+    std::vector<std::vector<int>> result(mtx_size, std::vector<int>(mtx_size));
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i][j] = distribution(gen);
+            B[i][j] = distribution(gen);
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_mult(A, B, expected);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
+    auto start_intrin = std::chrono::high_resolution_clock::now();
+
+    // result using the intrinsics implementation
+    mtx.mtx_mult(A, B, result);
+    auto end_intrin = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_intrin =
+        end_intrin - start_intrin;
+
+    TEST_COUT << "INTRINSIC Matrix Multiplication Time      : "
+              << elapsed_seconds_intrin.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Multiplication Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result));
+}
+
 } // namespace