From 8423b748aa7f00d0715b102c71428366e029b50e Mon Sep 17 00:00:00 2001
From: akielaries <akiel@akiel.org>
Date: Sat, 17 Feb 2024 23:57:39 -0700
Subject: [PATCH] #EDITS: updates to linalg module and file seperation

---
 include/linalg/mtx.hpp              | 13 ++++++
 modules/linalg/mtx_arr_f90.cpp      | 17 ++++++++
 modules/linalg/mtx_avx2_arr_i32.cpp | 10 ++---
 modules/linalg/mtx_routines.f90     | 44 ++++++++++++++++----
 tests/CMakeLists.txt                |  2 +
 tests/linalg/t_matrix_arr_f90.cpp   | 63 ++++++++++++++++++++++++++---
 6 files changed, 131 insertions(+), 18 deletions(-)
diff --git a/include/linalg/mtx.hpp b/include/linalg/mtx.hpp
index 5ca284a9b..6aca3d9fc 100644
--- a/include/linalg/mtx.hpp
+++ b/include/linalg/mtx.hpp
@@ -87,6 +87,19 @@ class Mtx {
      * designated Fortran subroutine. Accepts type int
      */
     void mtx_add_f90(int *A, int *B, int *C, std::size_t matrixSize);
+
+    void mtx_mult_f90(int *A,
+                      int *B,
+                      int *C,
+                      std::size_t rows_a,
+                      std::size_t cols_a,
+                      std::size_t cols_b);
+
+  /*  void mtx_mult_f90(int *A,
+                      int *B,
+                      int *C,
+                      std::size_t mtx_size);
+*/
 #endif
 
 #if defined(__x86_64__) || defined(i386) || defined(__i386__) ||               \
diff --git a/modules/linalg/mtx_arr_f90.cpp b/modules/linalg/mtx_arr_f90.cpp
index 08ff1e4c4..0710d037c 100644
--- a/modules/linalg/mtx_arr_f90.cpp
+++ b/modules/linalg/mtx_arr_f90.cpp
@@ -64,6 +64,13 @@ void mtx_add_routine_float_(float *A,
 
 // Matrix add routine (INT)
 void mtx_add_routine_int_(int *A, int *B, int *C, std::size_t *mtx_size);
+
+void mtx_mult_routine_int_(int *A,
+                           int *B,
+                           int *C,
+                           std::size_t *rows_a,
+                           std::size_t *cols_a,
+                           std::size_t *cols_b);
 }
 
 // C++ wrapper for Fortran mtx addition subroutine FLOAT
@@ -82,4 +89,14 @@ void gpmp::linalg::Mtx::mtx_add_f90(int *A,
     mtx_add_routine_int_(A, B, C, &mtx_size);
 }
 
+// C++ wrapper for Fortran mtx multiplication subroutine INT
+void gpmp::linalg::Mtx::mtx_mult_f90(int *A,
+                                     int *B,
+                                     int *C,
+                                     std::size_t rows_a,
+                                     std::size_t cols_a,
+                                     std::size_t cols_b) {
+    mtx_mult_routine_int_(A, B, C, &rows_a, &cols_a, &cols_b);
+}
+
 #endif
diff --git a/modules/linalg/mtx_avx2_arr_i32.cpp b/modules/linalg/mtx_avx2_arr_i32.cpp
index 3d2326d62..7c2008b80 100644
--- a/modules/linalg/mtx_avx2_arr_i32.cpp
+++ b/modules/linalg/mtx_avx2_arr_i32.cpp
@@ -203,11 +203,11 @@ void gpmp::linalg::Mtx::mtx_mult(const int *A,
 
         // Handle remaining elements
         for (int j = cols_b - cols_b % 4; j < cols_b; ++j) {
-            long long sum = 0;
+            int64_t sum = 0;
 
             for (int k = 0; k < cols_a; ++k) {
-                sum += static_cast<long long>(A[i * cols_a + k]) *
-                       B[k * cols_b + j];
+                sum +=
+                    static_cast<int64_t>(A[i * cols_a + k]) * B[k * cols_b + j];
             }
 
             C[i * cols_b + j] = sum;
@@ -220,9 +220,7 @@ void gpmp::linalg::Mtx::mtx_tpose(const int *A, int *C, int rows, int cols) {
     for (int i = 0; i < rows; i += 8) {
         for (int j = 0; j < cols; j += 8) {
             // Load 8x8 block from A
-            //__m256i a0 = _mm256_loadu_si256((__m256i *)(A + i * cols + j));
-            __m256i a0 = _mm256_loadu_si256((const __m256i *)(A + i * cols + j));
-
+            __m256i a0 = _mm256_loadu_si256((__m256i *)(A + i * cols + j));
             __m256i a1 =
                 _mm256_loadu_si256((__m256i *)(A + (i + 1) * cols + j));
             __m256i a2 =
diff --git a/modules/linalg/mtx_routines.f90 b/modules/linalg/mtx_routines.f90
index cff7cda87..de6e75733 100644
--- a/modules/linalg/mtx_routines.f90
+++ b/modules/linalg/mtx_routines.f90
@@ -33,7 +33,7 @@
 ! mtx_routines.f90
 
 !> FORTRAN Subroutine for Matrix Addition on flattened matrices as arrays
-!! of type float. Contains C++ wrapper function
+!! of type float32. Contains C++ wrapper function
 !! @param A Addend A, an array representing a Matrix
 !! @param B Addend B, an array representing a Matrix
 !! @param C Sum C, an array representing the sum of A + B
@@ -49,6 +49,12 @@ SUBROUTINE mtx_add_routine_float_(A, B, C, mtx_size) bind(C)
     C = A + B
 END SUBROUTINE mtx_add_routine_float_
 
+!> FORTRAN Subroutine for Matrix Addition on flattened matrices as arrays
+!! of type int32. Contains C++ wrapper function
+!! @param A Addend A, an array representing a Matrix
+!! @param B Addend B, an array representing a Matrix
+!! @param C Sum C, an array representing the sum of A + B
+!! @param mtx_size Assumes same size M x N
 SUBROUTINE mtx_add_routine_int_(A, B, C, mtx_size) bind(C)
     USE :: ISO_FORTRAN_ENV
     USE :: ISO_C_BINDING
@@ -67,21 +73,45 @@ END SUBROUTINE mtx_add_routine_int_
 !! @param c Product c, an array representing the sum of a + b
 !! @param nrows_a Number of rows
 !! @param ncols Number of columns
-SUBROUTINE mtx_mult(matrix1, matrix2, result, nrows1, ncols1, ncols2)
+SUBROUTINE Bmtx_mult_routine_int_(A, B, C, nrows1, ncols1, ncols2) bind(C)
+    USE :: ISO_FORTRAN_ENV
+    USE :: ISO_C_BINDING
+
     implicit none
     INTEGER, INTENT(IN) :: nrows1, ncols1, ncols2
-    REAL, INTENT(IN) :: matrix1(nrows1, ncols1), matrix2(ncols1, ncols2)
-    REAL, INTENT(OUT) :: result(nrows1, ncols2)
+    REAL, INTENT(IN) :: A(nrows1, ncols1), B(ncols1, ncols2)
+    REAL, INTENT(OUT) :: C(nrows1, ncols2)
     INTEGER :: i, j, k
 
     ! Perform matrix multiplication
     do i = 1, nrows1
         do j = 1, ncols2
-            result(i, j) = 0.0
+            C(i, j) = 0
             do k = 1, ncols1
-                result(i, j) = result(i, j) + matrix1(i, k)*matrix2(k, j)
+                C(i, j) = C(i, j) + A(i, k)*B(k, j)
             end do
         end do
     end do
-END SUBROUTINE mtx_mult
+END SUBROUTINE Bmtx_mult_routine_int_
+
+SUBROUTINE mtx_mult_routine_int_(A, B, C, mtx_size) bind(C)
+    USE :: ISO_FORTRAN_ENV
+    USE :: ISO_C_BINDING
+
+    INTEGER, INTENT(IN) :: mtx_size
+    INTEGER(KIND=C_INT), DIMENSION(mtx_size, mtx_size), INTENT(IN) :: A, B
+    INTEGER(KIND=C_INT), DIMENSION(mtx_size, mtx_size), INTENT(OUT) :: C
+
+    INTEGER :: i, j, k
+
+    ! Perform matrix multiplication
+    DO i = 1, mtx_size
+        DO j = 1, mtx_size
+            C(i, j) = 0
+            DO k = 1, mtx_size
+                C(i, j) = C(i, j) + A(i, k) * B(k, j)
+            END DO
+        END DO
+    END DO
+END SUBROUTINE mtx_mult_routine_int_
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index eaa990376..0aa87ab2c 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -59,6 +59,8 @@ set(CPP_TEST_FILES
     linalg/t_vector_vector_naive.cpp
     linalg/t_matrix_arr_naive.cpp
 
+    linalg/t_matrix_arr_f90.cpp
+
     nt/t_cipher.cpp
     nt/t_rc4.cpp
     nt/t_primes.cpp
diff --git a/tests/linalg/t_matrix_arr_f90.cpp b/tests/linalg/t_matrix_arr_f90.cpp
index 56c1bb408..9e6d358b3 100644
--- a/tests/linalg/t_matrix_arr_f90.cpp
+++ b/tests/linalg/t_matrix_arr_f90.cpp
@@ -18,7 +18,7 @@ using namespace gpmp;
 
 namespace {
 
-TEST(MatrixArrayTestI32, AdditionPerformanceComparisonF90) {
+TEST(FORTRAN90MatrixArrayTestI32, AdditionPerformanceComparison) {
     int mtx_size = 1024;
     TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
     // define input matrices A and B
@@ -49,21 +49,73 @@ TEST(MatrixArrayTestI32, AdditionPerformanceComparisonF90) {
     auto end_std = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
 
+    auto start_f90 = std::chrono::high_resolution_clock::now();
+
+    // result using the f90sics implementation
+    mtx.mtx_add_f90(A, B, result, mtx_size);
+    auto end_f90 = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_f90 =
+        end_f90 - start_f90;
+
+    TEST_COUT << "FORTRAN90 Matrix Addition Time      : "
+              << elapsed_seconds_f90.count() << " seconds" << std::endl;
+    TEST_COUT << "STANDARD  Matrix Addition Time      : "
+              << elapsed_seconds_std.count() << " seconds" << std::endl;
+
+    // compare the results
+    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    delete[] A;
+    delete[] B;
+    delete[] expected;
+    delete[] result;
+}
+
+TEST(FORTRAN90MatrixArrayTestI32, MultiplicationPerformanceComparison) {
+    int mtx_size = 1024;
+    TEST_COUT << "Matrix size      : " << mtx_size << std::endl;
+    // define input matrices A and B
+    int *A = new int[mtx_size * mtx_size];
+    int *B = new int[mtx_size * mtx_size];
+    int *expected = new int[mtx_size * mtx_size];
+    int *result = new int[mtx_size * mtx_size];
+
+    // initialize random number generator
+    std::random_device rd;
+    std::mt19937 gen(rd());
+    std::uniform_int_distribution<int> distribution(1, 100);
+
+    // populate matrices A and B with random values
+    for (int i = 0; i < mtx_size; ++i) {
+        for (int j = 0; j < mtx_size; ++j) {
+            A[i * mtx_size + j] = distribution(gen);
+            B[i * mtx_size + j] = distribution(gen);
+        }
+    }
+
+    gpmp::linalg::Mtx mtx;
+    auto start_std = std::chrono::high_resolution_clock::now();
+
+    // expected result using the naive implementation
+    mtx.std_mtx_mult(A, B, expected, mtx_size, mtx_size, mtx_size);
+
+    auto end_std = std::chrono::high_resolution_clock::now();
+    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;
+
     auto start_intrin = std::chrono::high_resolution_clock::now();
 
     // result using the intrinsics implementation
-    mtx.mtx_add_f90(A, B, result, mtx_size, mtx_size);
+    mtx.mtx_mult_f90(A, B, result, mtx_size, mtx_size, mtx_size);
     auto end_intrin = std::chrono::high_resolution_clock::now();
     std::chrono::duration<double> elapsed_seconds_intrin =
         end_intrin - start_intrin;
 
-    TEST_COUT << "INTRINSIC Matrix Addition Time      : "
+    TEST_COUT << "INTRINSIC Matrix Multiplication Time      : "
               << elapsed_seconds_intrin.count() << " seconds" << std::endl;
-    TEST_COUT << "STANDARD  Matrix Addition Time      : "
+    TEST_COUT << "STANDARD  Matrix Multiplication Time      : "
               << elapsed_seconds_std.count() << " seconds" << std::endl;
 
     // compare the results
-    ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
+    //ASSERT_TRUE(mtx_verif(expected, result, mtx_size, mtx_size));
     delete[] A;
     delete[] B;
     delete[] expected;
@@ -71,4 +123,5 @@ TEST(MatrixArrayTestI32, AdditionPerformanceComparisonF90) {
 }
 
 
+
 }