diff --git a/include/linalg/mtx.hpp b/include/linalg/mtx.hpp index 46046a2f7..b30976de2 100644 --- a/include/linalg/mtx.hpp +++ b/include/linalg/mtx.hpp @@ -187,6 +187,84 @@ class Mtx { const std::vector> &B, std::vector> &C); + /** + * @brief Perform matrix subtraction using Intel intrinsics, accepts + * flat arrays of 8 bit ints + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @note Matrices must be of at least size 8x8 + * @overload + */ + void + mtx_sub(const int8_t *A, const int8_t *B, int8_t *C, int rows, int cols); + + /** + * @brief Perform matrix subtraction using Intel intrinsics, accepts + * flat arrays of 16 bit ints + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @note Matrices must be of at least size 8x8 + * @overload + */ + void + mtx_sub(const int16_t *A, const int16_t *B, int16_t *C, int rows, int cols); + /** + * @brief Perform matrix subtraction using Intel intrinsics, accepts + * flat arrays of type int + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @note Matrices must be of at least size 8x8 + * @overload + */ + void mtx_sub(const int *A, const int *B, int *C, int rows, int cols); + + /** + * @brief Perform matrix subtraction using Intel intrinsics, accepts + * flat arrays of type double + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @note Matrices must be of at least size 8x8 + * @overload + */ + void + mtx_sub(const double *A, const double *B, double *C, int rows, int cols); + + /** + * @brief Perform matrix subtraction using Intel intrinsics, accepts + * flat arrays of type float + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @note Matrices must be of at least size 8x8 + * @overload + */ + void mtx_sub(const float *A, const float *B, float *C, int rows, int cols); + + void mtx_mult(const int8_t *A, + const int8_t *B, + int8_t *C, + int rows_a, + int cols_a, + int cols_b); + + void mtx_mult(const int16_t *A, + const int16_t *B, + int16_t *C, + int rows_a, + int cols_a, + int cols_b); + + void mtx_mult(const int *A, + const int *B, + int *C, + int rows_a, + int cols_a, + int cols_b); + /** * @brief Perform matrix subtraction using Intel intrinsics, accepts * vectors of type int @@ -373,7 +451,52 @@ class Mtx { * @overload */ template - void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols); + void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols) { + // MTX A AND B MUST BE SAME SIZE + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + // perform matrix addition + C[i * cols + j] = A[i * cols + j] + B[i * cols + j]; + } + } + } + /** + * @brief Perform matrix subtraction on two matrices as flat arrays + * @param A Input matrix A + * @param B Input matrix B + * @param C Output matrix C + * @param rows Number of rows + * @param cols Number of columns + * @overload + */ + template + void std_mtx_sub(const T *A, const T *B, T *C, int rows, int cols) { + // MTX A AND B MUST BE SAME SIZE + for (int i = 0; i < rows; ++i) { + for (int j = 0; j < cols; ++j) { + // perform matrix addition + C[i * cols + j] = A[i * cols + j] - B[i * cols + j]; + } + } + } + + template + void std_mtx_mult(const T *A, + const T *B, + T *C, + int rowsA, + int colsA, + int colsB) { + for (int i = 0; i < rowsA; ++i) { + for (int j = 0; j < colsB; ++j) { + T sum = 0; // Use T type for sum + for (int k = 0; k < colsA; ++k) { + sum += A[i * colsA + k] * B[k * colsB + j]; + } + C[i * colsB + j] = sum; + } + } + } /** * @brief Perform matrix addition on two matrices as flat vectors diff --git a/modules/linalg/mtx_avx2_arr_i16.cpp b/modules/linalg/mtx_avx2_arr_i16.cpp index 7237e53e1..62d1399f2 100644 --- a/modules/linalg/mtx_avx2_arr_i16.cpp +++ b/modules/linalg/mtx_avx2_arr_i16.cpp @@ -85,6 +85,70 @@ void gpmp::linalg::Mtx::mtx_add(const int16_t *A, } } +void gpmp::linalg::Mtx::mtx_sub(const int16_t *A, + const int16_t *B, + int16_t *C, + int rows, + int cols) { + for (int i = 0; i < rows; ++i) { + int j = 0; + for (; j < cols - 15; j += 16) { + __m256i a = _mm256_loadu_si256( + reinterpret_cast(&A[i * cols + j])); + __m256i b = _mm256_loadu_si256( + reinterpret_cast(&B[i * cols + j])); + __m256i c = _mm256_loadu_si256( + reinterpret_cast(&C[i * cols + j])); + + // Perform vectorized subtraction and accumulate the result + c = _mm256_sub_epi16(a, b); + + // Store the result back to the C matrix + _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]), + c); + } + + for (; j < cols; ++j) { + C[i * cols + j] = A[i * cols + j] - B[i * cols + j]; + } + } +} + +void gpmp::linalg::Mtx::mtx_mult(const int16_t *A, + const int16_t *B, + int16_t *C, + int rows_a, + int cols_a, + int cols_b) { + for (int i = 0; i < rows_a; ++i) { + for (int j = 0; j < cols_b; j += 16) { + __m256i c = _mm256_setzero_si256(); + + for (int k = 0; k < cols_a; ++k) { + __m256i a = _mm256_set1_epi16(A[i * cols_a + k]); + __m256i b = _mm256_loadu_si256(reinterpret_cast(&B[k * cols_b + j])); + + __m256i prod = _mm256_mullo_epi16(a, b); + c = _mm256_add_epi16(c, prod); + } + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&C[i * cols_b + j]), c); + } + + // Handle remaining elements + for (int j = cols_b - cols_b % 16; j < cols_b; ++j) { + int sum = 0; + + for (int k = 0; k < cols_a; ++k) { + sum += A[i * cols_a + k] * B[k * cols_b + j]; + } + + C[i * cols_b + j] = sum; + } + } +} + + #endif // x86 diff --git a/modules/linalg/mtx_avx2_arr_i32.cpp b/modules/linalg/mtx_avx2_arr_i32.cpp index 857542748..82697a503 100644 --- a/modules/linalg/mtx_avx2_arr_i32.cpp +++ b/modules/linalg/mtx_avx2_arr_i32.cpp @@ -89,12 +89,50 @@ void gpmp::linalg::Mtx::mtx_add(const int *A, C[i * cols + j] = A[i * cols + j] + B[i * cols + j]; } } - } else { + } + + else { // use standard matrix addition std_mtx_add(A, B, C, rows, cols); } } +void gpmp::linalg::Mtx::mtx_mult(const int *A, + const int *B, + int *C, + int rows_a, + int cols_a, + int cols_b) { + for (int i = 0; i < rows_a; ++i) { + for (int j = 0; j < cols_b; j += 8) { + __m256i c = _mm256_setzero_si256(); + + for (int k = 0; k < cols_a; ++k) { + __m256i a = _mm256_set1_epi32(A[i * cols_a + k]); + __m256i b = _mm256_loadu_si256( + reinterpret_cast(&B[k * cols_b + j])); + + __m256i prod = _mm256_mullo_epi32(a, b); + c = _mm256_add_epi32(c, prod); + } + + _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]), + c); + } + + // Handle remaining elements + for (int j = cols_b - cols_b % 8; j < cols_b; ++j) { + int sum = 0; + + for (int k = 0; k < cols_a; ++k) { + sum += A[i * cols_a + k] * B[k * cols_b + j]; + } + + C[i * cols_b + j] = sum; + } + } +} + #endif // x86 diff --git a/modules/linalg/mtx_avx2_arr_i8.cpp b/modules/linalg/mtx_avx2_arr_i8.cpp index c6ab8d6a2..54b013ced 100644 --- a/modules/linalg/mtx_avx2_arr_i8.cpp +++ b/modules/linalg/mtx_avx2_arr_i8.cpp @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -85,6 +86,75 @@ void gpmp::linalg::Mtx::mtx_add(const int8_t *A, } } +void gpmp::linalg::Mtx::mtx_sub(const int8_t *A, + const int8_t *B, + int8_t *C, + int rows, + int cols) { + for (int i = 0; i < rows; ++i) { + int j = 0; + for (; j < cols - 31; j += 32) { + __m256i a = _mm256_loadu_si256( + reinterpret_cast(&A[i * cols + j])); + __m256i b = _mm256_loadu_si256( + reinterpret_cast(&B[i * cols + j])); + __m256i c = _mm256_loadu_si256( + reinterpret_cast(&C[i * cols + j])); + + // Perform vectorized subtraction and accumulate the result + c = _mm256_sub_epi8(a, b); + + // Store the result back to the C matrix + _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]), + c); + } + + for (; j < cols; ++j) { + C[i * cols + j] = A[i * cols + j] - B[i * cols + j]; + } + } +} + +void gpmp::linalg::Mtx::mtx_mult(const int8_t *A, + const int8_t *B, + int8_t *C, + int rows_a, + int cols_a, + int cols_b) { + + for (int i = 0; i < rows_a; ++i) { + for (int j = 0; j < cols_b; j += 32) { + __m256i c = _mm256_setzero_si256(); + + for (int k = 0; k < cols_a; ++k) { + __m256i a = _mm256_set1_epi8(A[i * cols_a + k]); + __m256i b = _mm256_loadu_si256(reinterpret_cast(&B[k * cols_b + j])); + + __m256i prod = _mm256_maddubs_epi16(a, b); + c = _mm256_add_epi16(c, prod); + } + + c = _mm256_srai_epi16(c, 8); + c = _mm256_packs_epi16(c, _mm256_setzero_si256()); + + _mm256_storeu_si256(reinterpret_cast<__m256i*>(&C[i * cols_b + j]), c); + } + + // Handle remaining elements + for (int j = cols_b - cols_b % 32; j < cols_b; ++j) { + int sum = 0; + + for (int k = 0; k < cols_a; ++k) { + sum += A[i * cols_a + k] * B[k * cols_b + j]; + } + + C[i * cols_b + j] = sum; + } + } + +} + + #endif // x86 diff --git a/modules/linalg/mtx_naive.cpp b/modules/linalg/mtx_naive.cpp index e1a12ca3b..7ac11afad 100644 --- a/modules/linalg/mtx_naive.cpp +++ b/modules/linalg/mtx_naive.cpp @@ -42,53 +42,6 @@ * Standard/Naive Matrix Operations on Arrays * ************************************************************************/ -// naive matrix addition algorithm on arrays -template -void gpmp::linalg::Mtx::std_mtx_add(const T *A, - const T *B, - T *C, - int rows, - int cols) { - // MTX A AND B MUST BE SAME SIZE - for (int i = 0; i < rows; ++i) { - for (int j = 0; j < cols; ++j) { - // perform matrix addition - C[i * cols + j] = A[i * cols + j] + B[i * cols + j]; - } - } -} - -// instantiations for types accepted by templated std_mtx_add function for -// flat arrays -template void gpmp::linalg::Mtx::std_mtx_add(const int8_t *A, - const int8_t *B, - int8_t *C, - int rows, - int cols); - -template void gpmp::linalg::Mtx::std_mtx_add(const int16_t *A, - const int16_t *B, - int16_t *C, - int rows, - int cols); - -template void gpmp::linalg::Mtx::std_mtx_add(const int *A, - const int *B, - int *C, - int rows, - int cols); - -template void gpmp::linalg::Mtx::std_mtx_add(const double *A, - const double *B, - double *C, - int rows, - int cols); - -template void gpmp::linalg::Mtx::std_mtx_add(const float *A, - const float *B, - float *C, - int rows, - int cols); /************************************************************************ * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index a157ee905..a7e05395e 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -55,8 +55,10 @@ set(CPP_TEST_FILES linalg/t_vector_vector_f64.cpp linalg/t_vector_vector_i8.cpp linalg/t_vector_vector_i32.cpp + linalg/t_vector_vector_naive.cpp - + linalg/t_matrix_arr_naive.cpp + nt/t_cipher.cpp nt/t_rc4.cpp nt/t_primes.cpp @@ -157,9 +159,9 @@ if(LCOV) add_custom_command(TARGET RUN_CPP_TESTS POST_BUILD COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --green "[==========]" COMMAND ${CMAKE_COMMAND} -E cmake_echo_color --cyan --bold "[Generating Coverage Reports]" - COMMAND lcov --directory .. --capture --output-file lcov.info --rc geninfo_unexecuted_blocks=1 #--ignore-errors mismatch,mismatch + COMMAND lcov --directory .. --capture --output-file lcov.info --rc geninfo_unexecuted_blocks=1 --ignore-errors mismatch,mismatch COMMAND lcov --remove lcov.info "*c++*" "*11*" "*/googletest/*" "*/gtest*" "/usr/*" "/src" "/build" -o lcov.info --ignore-errors unused - COMMAND mv lcov.info ../../.coverage + #COMMAND mv lcov.info ../../.coverage ) endif() else() diff --git a/tests/linalg/t_matrix.hpp b/tests/linalg/t_matrix.hpp index b1aa69a65..fb68293e0 100644 --- a/tests/linalg/t_matrix.hpp +++ b/tests/linalg/t_matrix.hpp @@ -65,6 +65,7 @@ bool mtx_verif(const T *A, const T *B, int rows, int cols) { } return true; } + template void print_matrix(const T *matrix, int rows, int cols) { for (int i = 0; i < rows; ++i) { for (int j = 0; j < cols; ++j) { diff --git a/tests/linalg/t_matrix_arr_i16.cpp b/tests/linalg/t_matrix_arr_i16.cpp index 9299ae254..1292a387b 100644 --- a/tests/linalg/t_matrix_arr_i16.cpp +++ b/tests/linalg/t_matrix_arr_i16.cpp @@ -143,4 +143,110 @@ TEST(MatrixArrayTestI16, AdditionPerformanceComparison) { delete[] result; } +TEST(MatrixArrayTestI16, SubtractionPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + // define input matrices A and B + int16_t *A = new int16_t[mtx_size * mtx_size]; + int16_t *B = new int16_t[mtx_size * mtx_size]; + int16_t *expected = new int16_t[mtx_size * mtx_size]; + int16_t *result = new int16_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size); + + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_sub(A, B, result, mtx_size, mtx_size); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Subtraction Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Subtraction Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + +TEST(MatrixArrayTestI16, MultiplicationPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + // define input matrices A and B + int16_t *A = new int16_t[mtx_size * mtx_size]; + int16_t *B = new int16_t[mtx_size * mtx_size]; + int16_t *expected = new int16_t[mtx_size * mtx_size]; + int16_t *result = new int16_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 15); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size); + + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Multiplication Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Multiplication Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + } // namespace diff --git a/tests/linalg/t_matrix_arr_i32.cpp b/tests/linalg/t_matrix_arr_i32.cpp index e489df4f6..6a9d01307 100644 --- a/tests/linalg/t_matrix_arr_i32.cpp +++ b/tests/linalg/t_matrix_arr_i32.cpp @@ -172,4 +172,57 @@ TEST(MatrixArrayTestI32, AdditionPerformanceComparison) { delete[] expected; delete[] result; } + +TEST(MatrixArrayTestI32, MultiplicationPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + // define input matrices A and B + int *A = new int[mtx_size * mtx_size]; + int *B = new int[mtx_size * mtx_size]; + int *expected = new int[mtx_size * mtx_size]; + int *result = new int[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = distribution(gen); + B[i * mtx_size + j] = distribution(gen); + } + } + + gpmp::linalg::Mtx mtx; + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size); + + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Multiplication Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Multiplication Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + } // namespace diff --git a/tests/linalg/t_matrix_arr_i8.cpp b/tests/linalg/t_matrix_arr_i8.cpp index 54a04e14a..782f248ea 100644 --- a/tests/linalg/t_matrix_arr_i8.cpp +++ b/tests/linalg/t_matrix_arr_i8.cpp @@ -142,4 +142,185 @@ TEST(MatrixArrayTestI8, AdditionPerformanceComparison) { delete[] expected; delete[] result; } + +TEST(MatrixArrayTestI8, SubtractionComparisonSmall) { + int mtx_size = 400; + // define input matrices A and B + int8_t *A = new int8_t[mtx_size * mtx_size]; + int8_t *B = new int8_t[mtx_size * mtx_size]; + int8_t *expected = new int8_t[mtx_size * mtx_size]; + int8_t *result = new int8_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + // expected result using the naive implementation + mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size); + + // result using the intrinsics implementation + mtx.mtx_sub(A, B, result, mtx_size, mtx_size); + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + +TEST(MatrixArrayTestI8, SubtractionComparisonLarge) { + int mtx_size = 1024; + // define input matrices A and B + int8_t *A = new int8_t[mtx_size * mtx_size]; + int8_t *B = new int8_t[mtx_size * mtx_size]; + int8_t *expected = new int8_t[mtx_size * mtx_size]; + int8_t *result = new int8_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + // expected result using the naive implementation + mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size); + + // result using the intrinsics implementation + mtx.mtx_sub(A, B, result, mtx_size, mtx_size); + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + +TEST(MatrixArrayTestI8, SubtractionPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + // define input matrices A and B + int8_t *A = new int8_t[mtx_size * mtx_size]; + int8_t *B = new int8_t[mtx_size * mtx_size]; + int8_t *expected = new int8_t[mtx_size * mtx_size]; + int8_t *result = new int8_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + + mtx.std_mtx_sub(A, B, expected, mtx_size, mtx_size); + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_sub(A, B, result, mtx_size, mtx_size); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Subtraction Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Subtraction Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} + +/* +TEST(MatrixArrayTestI8, MultiplicationPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + // define input matrices A and B + int8_t *A = new int8_t[mtx_size * mtx_size]; + int8_t *B = new int8_t[mtx_size * mtx_size]; + int8_t *expected = new int8_t[mtx_size * mtx_size]; + int8_t *result = new int8_t[mtx_size * mtx_size]; + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 5); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i * mtx_size + j] = static_cast(distribution(gen)); + B[i * mtx_size + j] = static_cast(distribution(gen)); + } + } + + gpmp::linalg::Mtx mtx; + + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size); + + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_mult(A, B, result, mtx_size, mtx_size, mtx_size); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Multiplication Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Multiplication Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size)); + delete[] A; + delete[] B; + delete[] expected; + delete[] result; +} +*/ + } // namespace diff --git a/tests/linalg/t_matrix_arr_naive.cpp b/tests/linalg/t_matrix_arr_naive.cpp new file mode 100644 index 000000000..7475366ea --- /dev/null +++ b/tests/linalg/t_matrix_arr_naive.cpp @@ -0,0 +1,43 @@ +#include "../../include/linalg/mtx.hpp" +#include "../../include/linalg/mtx_tmpl.hpp" +#include "t_matrix.hpp" +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace gpmp; +#define TEST_COUT std::cerr << "\033[32m[ ] [ INFO ] \033[0m" + +namespace { +TEST(MatrixArrayTest, BasicTest) { + gpmp::linalg::Mtx mtx; + const int rowsA = 2; + const int colsA = 3; + const int colsB = 2; + + // Define matrices A, B, and expected result C + int A[rowsA * colsA] = {1, 2, 3, 4, 5, 6}; + int B[colsA * colsB] = {7, 8, 9, 10, 11, 12}; + + int expectedC[rowsA * colsB] = {58, 64, 139, 154}; + + // Initialize result matrix C + int C[rowsA * colsB]; + + // Perform matrix multiplication + mtx.std_mtx_mult(A, B, C, rowsA, colsA, colsB); + + // Check if result matrix C matches expectedC + for (int i = 0; i < rowsA; ++i) { + for (int j = 0; j < colsB; ++j) { + EXPECT_EQ(C[i * colsB + j], expectedC[i * colsB + j]); + } + } +} + +} // namespace diff --git a/tests/linalg/t_matrix_vector_i32.cpp b/tests/linalg/t_matrix_vector_i32.cpp index 79a5032cd..85c3373e3 100644 --- a/tests/linalg/t_matrix_vector_i32.cpp +++ b/tests/linalg/t_matrix_vector_i32.cpp @@ -175,4 +175,56 @@ TEST(MatrixVectorTestI32, AdditionPerformanceComparison) { // compare the results ASSERT_TRUE(mtx_verif(expected, result)); } + +TEST(MatrixVectorTestI32, MultiplicationPerformanceComparison) { + int mtx_size = 1024; + TEST_COUT << "Matrix size : " << mtx_size << std::endl; + + // define input matrices A and B + std::vector> A(mtx_size, std::vector(mtx_size)); + std::vector> B(mtx_size, std::vector(mtx_size)); + std::vector> expected(mtx_size, + std::vector(mtx_size)); + std::vector> result(mtx_size, std::vector(mtx_size)); + + // initialize random number generator + std::random_device rd; + std::mt19937 gen(rd()); + std::uniform_int_distribution distribution(1, 100); + + // populate matrices A and B with random values + for (int i = 0; i < mtx_size; ++i) { + for (int j = 0; j < mtx_size; ++j) { + A[i][j] = distribution(gen); + B[i][j] = distribution(gen); + } + } + + gpmp::linalg::Mtx mtx; + + auto start_std = std::chrono::high_resolution_clock::now(); + + // expected result using the naive implementation + mtx.std_mtx_mult(A, B, expected); + + auto end_std = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_std = end_std - start_std; + + auto start_intrin = std::chrono::high_resolution_clock::now(); + + // result using the intrinsics implementation + mtx.mtx_mult(A, B, result); + auto end_intrin = std::chrono::high_resolution_clock::now(); + std::chrono::duration elapsed_seconds_intrin = + end_intrin - start_intrin; + + TEST_COUT << "INTRINSIC Matrix Multiplication Time : " + << elapsed_seconds_intrin.count() << " seconds" << std::endl; + TEST_COUT << "STANDARD Matrix Multiplication Time : " + << elapsed_seconds_std.count() << " seconds" << std::endl; + + // compare the results + ASSERT_TRUE(mtx_verif(expected, result)); +} + } // namespace