#EDITS: adding optimized WO intrinsics or processor accel DGEMM matrix multiplication class
akielaries · Feb 20, 2024 · efe99cc · efe99cc
1 parent 8a40bde
commit efe99cc
Show file tree

Hide file tree

Showing 9 changed files with 910 additions and 0 deletions.
diff --git a/include/linalg/_dgemm.hpp b/include/linalg/_dgemm.hpp
@@ -0,0 +1,236 @@
+/*************************************************************************
+ *
+ *  Project
+ *                         _____ _____  __  __ _____
+ *                        / ____|  __ \|  \/  |  __ \
+ *  ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
+ * / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
+ *| (_) | |_) |  __/ | | | |__| | |    | |  | | |
+ * \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
+ *      | |
+ *      |_|
+ *
+ * Copyright (C) Akiel Aries, <[email protected]>, et al.
+ *
+ * This software is licensed as described in the file LICENSE, which
+ * you should have received as part of this distribution. The terms
+ * among other details are referenced in the official documentation
+ * seen here : https://akielaries.github.io/openGPMP/ along with
+ * important files seen in this project.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute
+ * and/or sell copies of the Software, and permit persons to whom
+ * the Software is furnished to do so, under the terms of the
+ * LICENSE file. As this is an Open Source effort, all implementations
+ * must be of the same methodology.
+ *
+ *
+ *
+ * This software is distributed on an AS IS basis, WITHOUT
+ * WARRANTY OF ANY KIND, either express or implied.
+ *
+ ************************************************************************/
+
+/** BLOCK SIZES (counted in matrix elements, not bytes) */
+#define BLOCK_SZ_M 384  /**< Rows per block of A and C */
+#define BLOCK_SZ_K 384  /**< Columns per block of A and rows per block of B */
+#define BLOCK_SZ_N 4096 /**< Columns per block of B and C */
+#define BLOCK_SZ_MR 4   /**< Rows of the micro-panel of A and C */
+#define BLOCK_SZ_NR 4   /**< Columns of the micro-panel of B and C */
+
+ namespace gpmp {
+ namespace linalg {
+
+/**
+ * @class DGEMM
+ * @brief Performs matrix multiplication (C <- beta*C + alpha*A*B) on double arrays
+ */
+class DGEMM {
+  public:
+    /** Buffer for storing packed micro panels of A (BLOCK_SZ_M * BLOCK_SZ_K doubles) */
+    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K];
+    /** Buffer for storing packed micro panels of B (BLOCK_SZ_K * BLOCK_SZ_N doubles) */
+    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N];
+    /** Buffer for storing intermediate results (BLOCK_SZ_MR * BLOCK_SZ_NR doubles) */
+    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR];
+
+    /**
+     * @brief Packs micro panels of size BLOCK_SZ_MR rows by k columns from A
+     * without padding
+     *
+     * @param k Number of columns to pack
+     * @param A Pointer to the source matrix A
+     * @param incRowA Increment between consecutive rows of A
+     * @param incColA Increment between consecutive columns of A
+     * @param buffer Pointer to the buffer to store the packed micro panels
+     */
+    void pack_micro_A(int k,
+                      const double *A,
+                      int incRowA,
+                      int incColA,
+                      double *buffer);
+
+    /**
+     * @brief Packs panels from A with padding if needed
+     *
+     * @param mc Number of rows to pack
+     * @param kc Number of columns to pack
+     * @param A Pointer to the source matrix A
+     * @param incRowA Increment between consecutive rows of A
+     * @param incColA Increment between consecutive columns of A
+     * @param buffer Pointer to the buffer to store the packed panels
+     */
+    void pack_buffer_A(int mc,
+                       int kc,
+                       const double *A,
+                       int incRowA,
+                       int incColA,
+                       double *buffer);
+
+    /**
+     * @brief Packs micro panels of size BLOCK_SZ_NR columns by k rows from B
+     * without padding
+     *
+     * @param k Number of rows to pack
+     * @param B Pointer to the source matrix B
+     * @param incRowB Increment between consecutive rows of B
+     * @param incColB Increment between consecutive columns of B
+     * @param buffer Pointer to the buffer to store the packed micro panels
+     */
+    void pack_micro_B(int k,
+                      const double *B,
+                      int incRowB,
+                      int incColB,
+                      double *buffer);
+
+    /**
+     * @brief Packs panels from B with padding if needed
+     *
+     * @param kc Number of rows to pack
+     * @param nc Number of columns to pack
+     * @param B Pointer to the source matrix B
+     * @param incRowB Increment between consecutive rows of B
+     * @param incColB Increment between consecutive columns of B
+     * @param buffer Pointer to the buffer to store the packed panels
+     */
+    void pack_buffer_B(int kc,
+                       int nc,
+                       const double *B,
+                       int incRowB,
+                       int incColB,
+                       double *buffer);
+
+    /**
+     * @brief Computes the micro kernel that multiplies panels from A and B
+     *
+     * @param kc Number of columns of panel A (presumably rows of panel B; confirm)
+     * @param alpha Scalar alpha
+     * @param A Pointer to the packed panel A
+     * @param B Pointer to the packed panel B
+     * @param beta Scalar beta
+     * @param C Pointer to the output matrix C
+     * @param incRowC Increment between consecutive rows of C
+     * @param incColC Increment between consecutive columns of C
+     */
+    void dgemm_micro_kernel(int kc,
+                            double alpha,
+                            const double *A,
+                            const double *B,
+                            double beta,
+                            double *C,
+                            int incRowC,
+                            int incColC);
+
+    /**
+     * @brief Computes Y <- Y + alpha*X on m x n matrices (double precision AXPY)
+     *
+     * @param m Number of rows
+     * @param n Number of columns
+     * @param alpha Scalar alpha
+     * @param X Pointer to matrix X
+     * @param incRowX Increment between consecutive rows of X
+     * @param incColX Increment between consecutive columns of X
+     * @param Y Pointer to matrix Y
+     * @param incRowY Increment between consecutive rows of Y
+     * @param incColY Increment between consecutive columns of Y
+     */
+    void dgeaxpy(int m,
+                 int n,
+                 double alpha,
+                 const double *X,
+                 int incRowX,
+                 int incColX,
+                 double *Y,
+                 int incRowY,
+                 int incColY);
+
+    /**
+     * @brief Scales the elements of the m x n matrix X by alpha
+     *
+     * @param m Number of rows
+     * @param n Number of columns
+     * @param alpha Scalar alpha
+     * @param X Pointer to matrix X
+     * @param incRowX Increment between consecutive rows of X
+     * @param incColX Increment between consecutive columns of X
+     */
+    void
+    dgescal(int m, int n, double alpha, double *X, int incRowX, int incColX);
+
+    /**
+     * @brief Macro kernel for the multiplication of blocks of A and B
+     *
+     * @param mc Number of rows in the block of C
+     * @param nc Number of columns in the block of C
+     * @param kc Number of columns in the blocks of A and rows of B
+     * @param alpha Scalar alpha
+     * @param beta Scalar beta
+     * @param C Pointer to the output matrix C
+     * @param incRowC Increment between consecutive rows of C
+     * @param incColC Increment between consecutive columns of C
+     */
+    void dgemm_macro_kernel(int mc,
+                            int nc,
+                            int kc,
+                            double alpha,
+                            double beta,
+                            double *C,
+                            int incRowC,
+                            int incColC);
+
+    /**
+     * @brief Main DGEMM entrypoint, computes C <- beta*C + alpha*A*B
+     *
+     * @param m Number of rows of A and rows of C
+     * @param n Number of columns of B and columns of C
+     * @param k Number of columns of A and rows of B
+     * @param alpha Scalar alpha
+     * @param A Pointer to matrix A
+     * @param incRowA Increment between consecutive rows of A
+     * @param incColA Increment between consecutive columns of A
+     * @param B Pointer to matrix B
+     * @param incRowB Increment between consecutive rows of B
+     * @param incColB Increment between consecutive columns of B
+     * @param beta Scalar beta
+     * @param C Pointer to matrix C
+     * @param incRowC Increment between consecutive rows of C
+     * @param incColC Increment between consecutive columns of C
+     */
+    void dgemm_nn(int m,
+                  int n,
+                  int k,
+                  double alpha,
+                  const double *A,
+                  int incRowA,
+                  int incColA,
+                  const double *B,
+                  int incRowB,
+                  int incColB,
+                  double beta,
+                  double *C,
+                  int incRowC,
+                  int incColC);
+};
+
+} // namespace linalg
+} // namespace gpmp
diff --git a/include/linalg/mtx.hpp b/include/linalg/mtx.hpp
@@ -312,6 +312,13 @@ class Mtx {
                   int cols_a,
                   int cols_b);
 
+    void mtx_mult(const double *A,
+                  const double *B,
+                  double *C,
+                  int rows_a,
+                  int cols_a,
+                  int cols_b);
+
     void mtx_tpose(const int *A, int *C, int rows, int cols);
 
     /**

diff --git a/modules/linalg/CMakeLists.txt b/modules/linalg/CMakeLists.txt
@@ -14,6 +14,7 @@ set(SOURCE_FILES
     tensor.cpp
     svd.cpp
     vector_naive.cpp
+    dgemm_arr.cpp
 )
 
 # Add files depending on the detected SIMD ISA
@@ -33,6 +34,7 @@ if (SIMD_ISA STREQUAL "AVX2")
         vector_avx2_i16.cpp
         vector_avx2_i32.cpp
         vector_avx2_f64.cpp
+
     )
 elseif (SIMD_ISA STREQUAL "ARM_NEON")
     list(APPEND SOURCE_FILES