#EDITS: DGEMM implementation using SSE and ASM is broken
akielaries committed Mar 1, 2024
1 parent 9f7edff commit e1c7e63
Showing 7 changed files with 121 additions and 22 deletions.
18 changes: 18 additions & 0 deletions include/linalg/_dgemm.hpp
@@ -146,6 +146,24 @@ class DGEMM {
int incRowC,
int incColC);

/**
 * @brief Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM)
 *
 * This function implements a micro-kernel for DGEMM, used as a building block in the larger
 * DGEMM routines. The micro-kernel performs the basic block matrix multiplication, with
 * optimizations tailored for the SSE (Streaming SIMD Extensions) x86 instruction set.
 *
 * @param kc Size of the shared inner dimension of the packed panels of A and B
 * @param alpha Scaling factor applied to the product of A and B
 * @param A Pointer to the packed panel of the first input matrix A
 * @param B Pointer to the packed panel of the second input matrix B
 * @param beta Scaling factor applied to the existing contents of C
 * @param C Pointer to the output matrix C
 * @param incRowC Increment for moving between rows of C
 * @param incColC Increment for moving between columns of C
 * @param nextA Pointer to the next panel of A, forwarded to the assembly kernel
 * @param nextB Pointer to the next panel of B, forwarded to the assembly kernel
 */
void dgemm_micro_kernel(long kc,
double alpha,
const double *A,
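For reference, the operation this micro-kernel declaration describes can be written as a plain scalar loop. The sketch below is an illustration only, not code from the commit and not the SSE kernel itself; the MR/NR block sizes and the packed-panel layout (A holding MR values per k step, B holding NR values per k step) are assumptions modeled on the macro-kernel code in dgemm_arr.cpp further down.

// Scalar reference sketch of an MR x NR DGEMM micro-kernel (illustration only).
// Assumes packed panels: A holds kc slices of MR values, B holds kc slices of
// NR values; C is addressed through its row/column strides.
void dgemm_micro_kernel_ref(long kc, double alpha, const double *A,
                            const double *B, double beta, double *C,
                            long incRowC, long incColC) {
    const int MR = 4, NR = 4;   // assumed register-block sizes
    double AB[MR * NR] = {0.0}; // accumulator for the unscaled product

    // AB += A(:, l) * B(l, :), one rank-1 update per step of the shared
    // dimension kc
    for (long l = 0; l < kc; ++l) {
        for (int j = 0; j < NR; ++j) {
            for (int i = 0; i < MR; ++i) {
                AB[i + j * MR] += A[i + l * MR] * B[j + l * NR];
            }
        }
    }

    // C := beta * C + alpha * AB (beta == 0 is special-cased so an
    // uninitialized C is never read)
    for (int j = 0; j < NR; ++j) {
        for (int i = 0; i < MR; ++i) {
            double c = (beta == 0.0) ? 0.0 : beta * C[i * incRowC + j * incColC];
            C[i * incRowC + j * incColC] = c + alpha * AB[i + j * MR];
        }
    }
}

The SSE assembly path added in this commit is expected to compute the same quantity, with the inner rank-1 updates vectorized and unrolled.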
77 changes: 76 additions & 1 deletion modules/linalg/dgemm_arr.cpp
@@ -188,6 +188,39 @@ void gpmp::linalg::DGEMM::pack_buffer_B(int kc,
}
}


// use assembly SSE kernel
#if defined (__SSE__)

void gpmp::linalg::DGEMM::dgemm_micro_kernel(long kc,
double alpha,
const double *A,
const double *B,
double beta,
double *C,
long incRowC,
long incColC,
const double *nextA,
const double *nextB) {
long kb = kc / 4; // number of complete groups of four k-iterations
long kl = kc % 4; // leftover k-iterations

dgemm_kernel_asm(A,
B,
C,
nextA,
nextB,
kl,
kb,
incRowC,
incColC,
alpha,
beta);
}

// otherwise fall back to the naive micro kernel (no SSE assembly kernel)
#else

// micro kernel that multiplies panels from A and B
void gpmp::linalg::DGEMM::dgemm_micro_kernel(int kc,
double alpha,
@@ -250,6 +283,8 @@ void gpmp::linalg::DGEMM::dgemm_micro_kernel(int kc,
}
}

#endif

// Compute Y += alpha*X (double precision AX + Y)
void gpmp::linalg::DGEMM::dgeaxpy(int m,
int n,
@@ -327,13 +362,34 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
int mr, nr;
int i, j;

#if defined (__SSE__)

const double *nextA;
const double *nextB;

#endif

for (j = 0; j < np; ++j) {
nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;

for (i = 0; i < mp; ++i) {
mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;

if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
#if defined (__SSE__)
dgemm_micro_kernel(
kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
&DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
beta,
&C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
incRowC,
incColC,
nextA,
nextB);

#else
dgemm_micro_kernel(
kc,
alpha,
@@ -343,7 +399,24 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
&C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
incRowC,
incColC);
} else {

#endif
}

else {

#if defined (__SSE__)
dgemm_micro_kernel(kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
&DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
0.0,
DGEMM_BUFF_C,
1,
BLOCK_SZ_MR,
nextA,
nextB);
#else
dgemm_micro_kernel(kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
@@ -352,6 +425,8 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
DGEMM_BUFF_C,
1,
BLOCK_SZ_MR);

#endif
dgescal(
mr,
nr,
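The edge-tile branch above computes a full-size result into the local buffer DGEMM_BUFF_C with beta = 0, then merges only the valid mr x nr corner into C via dgescal and dgeaxpy. The sketch below restates that merge as a single loop; it illustrates the assumed semantics of those two helpers and is not code from the commit.

// Illustration (assumed semantics of dgescal + dgeaxpy combined): merge a
// buffered edge tile into C. The buffer already carries the alpha factor,
// since the micro-kernel above was called with alpha and beta = 0.
void merge_edge_tile(int mr, int nr, double beta,
                     const double *buff, int incRowBuff, int incColBuff,
                     double *C, int incRowC, int incColC) {
    for (int j = 0; j < nr; ++j) {
        for (int i = 0; i < mr; ++i) {
            double c = (beta == 0.0) ? 0.0 : beta * C[i * incRowC + j * incColC];
            C[i * incRowC + j * incColC] =
                c + buff[i * incRowBuff + j * incColBuff];
        }
    }
}

With the call pattern above, incRowBuff would be 1 and incColBuff would be BLOCK_SZ_MR, matching the strides passed for DGEMM_BUFF_C.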
39 changes: 20 additions & 19 deletions modules/linalg/dgemm_kernel_sse.S
@@ -59,21 +59,20 @@ dgemm_kernel_asm:
*/

/***************************************************************************
* CPP FUNC:
* dgemm_kernel_asm(const double *A,
* const double *B,
* double *C,
* const double *nextA,
* const double *nextB,
* long kl,
* long kb,
* long incRowC,
* long incColC,
* double alpha,
* double beta);
* INPUT PARAMETERS:
*
* - kl : %rdi
* - kb : %rsi
* - A : %rdx
* - B : %rcx
* - nextA : %r8
* - nextB : %r9
* <--STACK-->
* - alpha : %rsp + 88
* - beta : %rsp + 64
* - C : %rsp + 56
* - incRowC : %rsp + 48
* - incColC : %rsp + 40
*
* <--ARGUMENT REGISTERS-->
* - A : %rdi
* - B : %rsi
* - C : %rdx
@@ -82,10 +81,10 @@ dgemm_kernel_asm:
* - kl : %r9
* <--STACK-->
* - kb : %rsp + 8
* - alpha : %rsp + 88
* - beta : %rsp + 64
* - incRowC : %rsp + 48
* - incColC : %rsp + 40
* - alpha : %rsp + 80
* - beta : %rsp + 102
* - incRowC : %rsp + 16
* - incColC : %rsp + 24
***************************************************************************/

// Address of A stored in %rax (result register)
@@ -595,7 +594,9 @@ dgemm_kernel_asm:

//movsd 4, %xmm0 // load alpha
//movsd 88(%rsp), %xmm0 // load alpha
movsd 88(%rsp), %xmm0
// THIS WORKS IN NON-OOP ENV
//movsd 88(%rsp), %xmm0
movsd 80(%rsp), %xmm0

//movsd 5, %xmm1 // load beta
//movsd 64(%rsp), %xmm1 // load beta
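To make the argument order behind the updated register/stack map explicit, here is the C++-side signature the comment block documents, restated for illustration. It is not a line from the commit; the actual declaration lives elsewhere in the project (presumably extern "C" so the symbol stays unmangled), and the register notes on the first six parameters follow the System V AMD64 convention the comment uses.

// Restated signature of the assembly kernel (illustration only). The first
// six pointer/integer arguments arrive in registers; the remaining integer
// arguments are passed on the stack, as the comment block above documents.
extern "C" void dgemm_kernel_asm(const double *A,     // %rdi
                                 const double *B,     // %rsi
                                 double *C,           // %rdx
                                 const double *nextA, // %rcx
                                 const double *nextB, // %r8
                                 long kl,             // %r9
                                 long kb,             // stack
                                 long incRowC,        // stack
                                 long incColC,        // stack
                                 double alpha,
                                 double beta);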
2 changes: 1 addition & 1 deletion tests/linalg/t_dgemm_arr.cpp
@@ -18,7 +18,7 @@ using namespace gpmp;

namespace {
TEST(GEMMArrayTest, DGEMMPerformanceComparison) {
int mtx_size = 1024;
int mtx_size = 128;
TEST_COUT << "Matrix size : " << mtx_size << std::endl;
// define input matrices A and B
double *A = new double[mtx_size * mtx_size];
3 changes: 3 additions & 0 deletions tests/linalg/t_igemm_arr.cpp
@@ -15,9 +15,12 @@ const double TOLERANCE = 1e-3;

using namespace gpmp;
#define TEST_COUT std::cerr << "\033[32m[ ] [ INFO ] \033[0m"
#define INFO_COUT \
std::cerr << "\033[32m[ ] [ INFO ] \033[0m\033[1;34m\033[1m"

namespace {
TEST(GEMMArrayTest, IGEMMPerformanceComparison) {
INFO_COUT << "GEMM ROUTINES" << std::endl;
int mtx_size = 1024;
TEST_COUT << "Matrix size : " << mtx_size << std::endl;
// define input matrices A and B
2 changes: 2 additions & 0 deletions tests/linalg/t_matrix.hpp
@@ -66,6 +66,8 @@ bool mtx_verif(const T *A, const T *B, int rows, int cols) {
return true;
}



template <typename T> void print_matrix(const T *matrix, int rows, int cols) {
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
2 changes: 1 addition & 1 deletion tests/linalg/t_vector_vector_f64.cpp
@@ -13,7 +13,7 @@

const double TOLERANCE = 1e-3;

#define TEST_COUT std::cout << "\033[32m[ ] [ INFO ] \033[0m"
#define TEST_COUT std::cerr << "\033[32m[ ] [ INFO ] \033[0m"
#define INFO_COUT \
std::cerr << "\033[32m[ ] [ INFO ] \033[0m\033[1;34m\033[1m"
TEST(VectorVectorTestF64, Addition) {
