#EDITS: some mass updates to linear algebra module and tests

akielaries · Feb 27, 2024 · d0465af · d0465af
1 parent 97926a0
commit d0465af
Show file tree

Hide file tree

Showing 27 changed files with 275 additions and 5,383 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -37,7 +37,7 @@ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -Wno-unused-result -Wparentheses -Ws
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DNDEBUG -Wall -Wextra -Wfloat-equal -Wcast-qual")
 set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wshadow -Wunreachable-code -D __GPMP_CPP_API__")
 
-project(openGPMP LANGUAGES CXX C Fortran)
+project(openGPMP LANGUAGES CXX C Fortran ASM)
 set(PROJECT_VERSION "1.0")
 
 include(CheckIncludeFileCXX)

diff --git a/experiment/test.c b/experiment/test.c
@@ -1,9 +1,20 @@
 #include <stdio.h>
 
 // Declare the assembly function as an external function
-extern int asm_function(int a, int b, int c, int d, int e, int f, int g, int h, int i, int j, int k, double l);
-
-int add (int a, int b, int c) {
+extern int asm_function(int a,
+                        int b,
+                        int c,
+                        int d,
+                        int e,
+                        int f,
+                        int g,
+                        int h,
+                        int i,
+                        int j,
+                        int k,
+                        double l);
+
+int add(int a, int b, int c) {
 
     // performs:
     //
@@ -20,9 +31,8 @@ int main() {
     int a = 5;
     int b = 7;
     int c = 10;
-    //int result = asm_function(5, 7);
+    // int result = asm_function(5, 7);
     int result = add(a, b, c);
     printf("Result: %d\n", result);
     return 0;
 }
-
diff --git a/include/linalg/_dgemm.hpp b/include/linalg/_dgemm.hpp
@@ -50,11 +50,47 @@ namespace linalg {
 class DGEMM {
   public:
     /**< Buffer for storing packed micro panels of A  */
-    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K]__attribute__ ((aligned (16)));
+    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K]
+        __attribute__((aligned(16)));
     /**< Buffer for storing packed micro panels of B  */
-    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N]__attribute__ ((aligned (16)));
+    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N]
+        __attribute__((aligned(16)));
     /**< Buffer for storing intermediate results  */
-    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR]__attribute__ ((aligned (16)));
+    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR]
+        __attribute__((aligned(16)));
+
+    /**
+     * @brief Performs matrix-matrix multiplication (DGEMM) using an
+     * assembly implementation. It computes the product of matrices A and B,
+     * scaled by alpha and beta, and stores the result in matrix C.
+     *
+     * @param A Pointer to the first matrix (A) in row-major order
+     * @param B Pointer to the second matrix (B) in row-major order
+     * @param C Pointer to the result matrix (C) in row-major order
+     * @param nextA Pointer to the next matrix A
+     * @param nextB Pointer to the next matrix B
+     * @param kl Value representing the remaining columns of matrix A
+     * @param kb Value representing the remaining rows of matrix B
+     * @param incRowC Increment for moving to the next row of matrix C
+     * @param incColC Increment for moving to the next column of matrix C
+     * @param alpha Scalar value to scale the product of matrices A and B
+     * @param beta Scalar value to scale matrix C before adding the product
+     *
+     * @note This calls an assembly implementation chosen for the detected
+     * host system. x86 (SSE, AVX2) and ARM NEON are supported.
+     */
+    /*void dgemm_kernel_asm(const double *A,
+                          const double *B,
+                          double *C,
+                          const double *nextA,
+                          const double *nextB,
+                          long kl,
+                          long kb,
+                          long incRowC,
+                          long incColC,
+                          double alpha,
+                          double beta);
+*/
 
     /**
      * @brief Packs micro panels of size BLOCK_SZ_MR rows by k columns from A
@@ -143,6 +179,16 @@ class DGEMM {
                             int incRowC,
                             int incColC);
 
+    void dgemm_micro_kernel(long kc,
+                            double alpha,
+                            const double *A,
+                            const double *B,
+                            double beta,
+                            double *C,
+                            long incRowC,
+                            long incColC,
+                            const double *nextA,
+                            const double *nextB);
     /**
      * @brief Computes Y += alpha*X (double precision AX + Y)
      *

diff --git a/modules/linalg/CMakeLists.txt b/modules/linalg/CMakeLists.txt
@@ -15,8 +15,11 @@ set(SOURCE_FILES
     svd.cpp
     vector_naive.cpp
     igemm_arr.cpp
+
     sgemm_arr.cpp
+
     dgemm_arr.cpp
+    dgemm_kernel_sse.S
 )
 
 # Add files depending on the detected SIMD ISA