Skip to content

Commit

Permalink
#EDITS: updates to linalg module and file separation
Browse files Browse the repository at this point in the history
  • Loading branch information
akielaries committed Feb 18, 2024
1 parent 4e1e46c commit 1e02cdf
Show file tree
Hide file tree
Showing 7 changed files with 462 additions and 57 deletions.
66 changes: 58 additions & 8 deletions experiment/blas2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,64 @@
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>

// Dimension of the square N x N test matrices. NOTE: the diff residue left
// two conflicting definitions (8192 and 1024); the committed value is 1024.
const int matrixSize = 1024;

// Forward declarations of the benchmark drivers defined below.
void run_openblas_mtx_add();
void run_openblas_mtx_mul();

/**
 * @brief Entry point: runs the OpenBLAS matrix-addition benchmark
 * followed by the matrix-multiplication benchmark; each prints its own
 * elapsed time.
 */
int main() {
    run_openblas_mtx_add();
    run_openblas_mtx_mul();

    return 0;
}

/**
 * @brief Benchmark dense double-precision matrix multiplication via
 * OpenBLAS (cblas_dgemm) on matrixSize x matrixSize operands and print
 * the elapsed wall-clock time.
 *
 * Uses std::vector for the three buffers so they are freed automatically
 * even if an exception propagates (the original used raw new[]/delete[],
 * which leaks on any exception between allocation and cleanup).
 */
void run_openblas_mtx_mul() {
    // Widen before multiplying so n*n cannot overflow int for large sizes.
    const std::size_t n = static_cast<std::size_t>(matrixSize);
    std::vector<double> A(n * n);
    std::vector<double> B(n * n);
    std::vector<double> C(n * n, 0.0);

    // Fill A and B with pseudo-random integer values in [0, 100).
    // rand() is unseeded, so the run is deterministic across executions.
    for (std::size_t i = 0; i < n * n; ++i) {
        A[i] = rand() % 100;
        B[i] = rand() % 100;
    }

    // Time only the BLAS call itself, not allocation/initialization.
    auto start_std = std::chrono::high_resolution_clock::now();

    // C = 1.0 * A * B + 0.0 * C; all matrices row-major with leading
    // dimension matrixSize.
    cblas_dgemm(CblasRowMajor,
                CblasNoTrans,
                CblasNoTrans,
                matrixSize,
                matrixSize,
                matrixSize,
                1.0,
                A.data(),
                matrixSize,
                B.data(),
                matrixSize,
                0.0,
                C.data(),
                matrixSize);

    auto end_std = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;

    // Report the elapsed time.
    std::cout << "Matrix multiplication using OpenBLAS took "
              << elapsed_seconds_std.count() << " seconds." << std::endl;
}

void run_openblas_mtx_add() {
// Create matrices A, B, and C
double *A = new double[matrixSize * matrixSize];
Expand All @@ -24,19 +73,20 @@ void run_openblas_mtx_add() {
}

// Measure the time for matrix addition using OpenBLAS
auto start_time = std::chrono::steady_clock::now();
// auto start_time = std::chrono::steady_clock::now();
auto start_std = std::chrono::high_resolution_clock::now();

// Use OpenBLAS to add matrices A and B and store the result in matrix C
cblas_daxpy(matrixSize * matrixSize, 1.0, A, 1, C, 1);

auto end_time = std::chrono::steady_clock::now();
// auto end_time = std::chrono::steady_clock::now();
auto end_std = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;

// Calculate and print the elapsed time
std::cout << "OpenBLAS - Time elapsed: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
end_time - start_time)
.count()
<< " ms" << std::endl;
std::cout << "TIME-ELAPSED: " << elapsed_seconds_std.count() << "seconds"
<< std::endl;

// Cleanup
delete[] A;
Expand Down
68 changes: 56 additions & 12 deletions include/linalg/mtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,27 +244,62 @@ class Mtx {
*/
void mtx_sub(const float *A, const float *B, float *C, int rows, int cols);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 8 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int8_t *A,
const int8_t *B,
int8_t *C,
int rows_a,
int cols_a,
int cols_b);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 16 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int16_t *A,
const int16_t *B,
int16_t *C,
int rows_a,
int cols_a,
int cols_b);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 32 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int *A,
const int *B,
int *C,
int rows_a,
int cols_a,
int cols_b);

/**
 * @brief Perform matrix multiplication on flat arrays of 32 bit
 * integers, writing the result into a 64 bit integer output matrix
 * (wider output type than the int-to-int overload above)
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b), int64_t elements
 * @overload
 */
void mtx_mult(const int *A,
              const int *B,
              int64_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Transpose a matrix stored as a flat row-major array of ints
 * @param A Input matrix A (rows x cols)
 * @param C Output matrix C (cols x rows), the transpose of A
 * @param rows Number of rows of A
 * @param cols Number of columns of A
 */
void mtx_tpose(const int *A, int *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* vectors of type int
Expand Down Expand Up @@ -480,24 +515,33 @@ class Mtx {
}
}

template <typename T>
/**
 * @brief Naive triple-loop matrix multiplication on flat row-major
 * arrays: C = A * B.
 * @tparam T Element type of the input matrices
 * @tparam U Element type of the output matrix; may be wider than T
 *           (e.g. T = int, U = int64_t) so the accumulation does not
 *           overflow
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b)
 * @param rows_a Number of rows of A
 * @param cols_a Number of columns of A (== rows of B)
 * @param cols_b Number of columns of B
 */
template <typename T, typename U>
void std_mtx_mult(const T *A,
                  const T *B,
                  U *C,
                  int rows_a,
                  int cols_a,
                  int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        for (int j = 0; j < cols_b; ++j) {
            // Accumulate in U, the (possibly wider) output type.
            U sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                // Widen BEFORE multiplying: computing the product in T
                // and only then converting would overflow in T and
                // defeat the purpose of the wide accumulator.
                sum += static_cast<U>(A[i * cols_a + k]) * B[k * cols_b + j];
            }
            C[i * cols_b + j] = sum;
        }
    }
}

/**
 * @brief Transpose a matrix stored as a flat row-major array:
 * At[c][r] = A[r][c].
 * @tparam T Element type of the matrices
 * @param A Input matrix (rows x cols)
 * @param At Output matrix (cols x rows), the transpose of A
 * @param rows Number of rows of A
 * @param cols Number of columns of A
 */
template <typename T>
void std_mtx_tpose(const T *A, T *At, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        // Hoist the start of row r of A out of the inner loop.
        const T *row = A + r * cols;
        for (int c = 0; c < cols; ++c) {
            At[c * rows + r] = row[c];
        }
    }
}
/**
* @brief Perform matrix addition on two matrices as flat vectors
* @param A Input matrix A
Expand Down
29 changes: 15 additions & 14 deletions modules/linalg/mtx_avx2_arr_i16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,40 +115,41 @@ void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
}

/**
 * @brief AVX2 matrix multiplication on flat row-major arrays of 16 bit
 * integers: C = A * B, 16 output columns per vector iteration.
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b)
 * @note Products and sums are computed modulo 2^16 in the vector path
 *       (_mm256_mullo_epi16 / _mm256_add_epi16); large inputs wrap.
 */
void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
                                 const int16_t *B,
                                 int16_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        // Vectorized portion: only full 16-element chunks. The original
        // looped while (j < cols_b), so the unaligned 256-bit loads and
        // stores ran past the end of the row whenever cols_b was not a
        // multiple of 16; bounding with j + 16 <= cols_b fixes that.
        int j = 0;
        for (; j + 16 <= cols_b; j += 16) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // Broadcast A[i][k], multiply element-wise against 16
                // consecutive entries of row k of B, and accumulate.
                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                                c);
        }

        // Scalar tail: the remaining cols_b % 16 columns of row i.
        for (; j < cols_b; ++j) {
            int sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }

            // Narrowing store matches the vector path's mod-2^16 result.
            C[i * cols_b + j] = static_cast<int16_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
Loading

0 comments on commit 1e02cdf

Please sign in to comment.