#EDITS: DGEMM implementation using SSE and ASM is broken
akielaries committed Mar 1, 2024
1 parent 9f7edff commit e1c7e63
Showing 7 changed files with 121 additions and 22 deletions.
18 changes: 18 additions & 0 deletions include/linalg/_dgemm.hpp
@@ -146,6 +146,24 @@ class DGEMM {
int incRowC,
int incColC);

/**
 * @brief Perform a micro-kernel operation for double-precision matrix-matrix multiplication (DGEMM)
 *
 * This function implements a micro-kernel for DGEMM, used as a building block in the larger
 * DGEMM routines. The micro-kernel performs the basic block matrix multiplication, with
 * optimizations tailored for the SSE (Streaming SIMD Extensions) x86 instruction set.
 *
 * @param kc Size of the shared inner dimension of the packed panels of A and B
 * @param alpha Scaling factor applied to the product of A and B
 * @param A Pointer to the packed panel of the first input matrix A
 * @param B Pointer to the packed panel of the second input matrix B
 * @param beta Scaling factor applied to the existing contents of C
 * @param C Pointer to the output matrix C
 * @param incRowC Increment for moving between rows of C
 * @param incColC Increment for moving between columns of C
 * @param nextA Pointer to the next panel of A, forwarded to the assembly kernel
 * @param nextB Pointer to the next panel of B, forwarded to the assembly kernel
 */
void dgemm_micro_kernel(long kc,
double alpha,
const double *A,
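For reference, the operation this micro-kernel declaration describes can be written as a plain scalar loop. The sketch below is an illustration only, not code from the commit and not the SSE kernel itself; the MR/NR block sizes and the packed-panel layout (A holding MR values per k step, B holding NR values per k step) are assumptions modeled on the macro-kernel code in dgemm_arr.cpp further down.

// Scalar reference sketch of an MR x NR DGEMM micro-kernel (illustration only).
// Assumes packed panels: A holds kc slices of MR values, B holds kc slices of
// NR values; C is addressed through its row/column strides.
void dgemm_micro_kernel_ref(long kc, double alpha, const double *A,
                            const double *B, double beta, double *C,
                            long incRowC, long incColC) {
    const int MR = 4, NR = 4;   // assumed register-block sizes
    double AB[MR * NR] = {0.0}; // accumulator for the unscaled product

    // AB += A(:, l) * B(l, :), one rank-1 update per step of the shared
    // dimension kc
    for (long l = 0; l < kc; ++l) {
        for (int j = 0; j < NR; ++j) {
            for (int i = 0; i < MR; ++i) {
                AB[i + j * MR] += A[i + l * MR] * B[j + l * NR];
            }
        }
    }

    // C := beta * C + alpha * AB (beta == 0 is special-cased so an
    // uninitialized C is never read)
    for (int j = 0; j < NR; ++j) {
        for (int i = 0; i < MR; ++i) {
            double c = (beta == 0.0) ? 0.0 : beta * C[i * incRowC + j * incColC];
            C[i * incRowC + j * incColC] = c + alpha * AB[i + j * MR];
        }
    }
}

The SSE assembly path added in this commit is expected to compute the same quantity, with the inner rank-1 updates vectorized and unrolled.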
77 changes: 76 additions & 1 deletion modules/linalg/dgemm_arr.cpp
@@ -188,6 +188,39 @@ void gpmp::linalg::DGEMM::pack_buffer_B(int kc,
}
}


// use assembly SSE kernel
#if defined (__SSE__)

void gpmp::linalg::DGEMM::dgemm_micro_kernel(long kc,
double alpha,
const double *A,
const double *B,
double beta,
double *C,
long incRowC,
long incColC,
const double *nextA,
const double *nextB) {
long kb = kc / 4; // number of complete groups of four k-iterations
long kl = kc % 4; // leftover k-iterations

dgemm_kernel_asm(A,
B,
C,
nextA,
nextB,
kl,
kb,
incRowC,
incColC,
alpha,
beta);
}

// otherwise fall back to the naive micro kernel (no SSE assembly kernel)
#else

// micro kernel that multiplies panels from A and B
void gpmp::linalg::DGEMM::dgemm_micro_kernel(int kc,
double alpha,
@@ -250,6 +283,8 @@ void gpmp::linalg::DGEMM::dgemm_micro_kernel(int kc,
}
}

#endif

// Compute Y += alpha*X (double precision AX + Y)
void gpmp::linalg::DGEMM::dgeaxpy(int m,
int n,
@@ -327,13 +362,34 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
int mr, nr;
int i, j;

#if defined (__SSE__)

const double *nextA;
const double *nextB;

#endif

for (j = 0; j < np; ++j) {
nr = (j != np - 1 || _nr == 0) ? BLOCK_SZ_NR : _nr;

for (i = 0; i < mp; ++i) {
mr = (i != mp - 1 || _mr == 0) ? BLOCK_SZ_MR : _mr;

if (mr == BLOCK_SZ_MR && nr == BLOCK_SZ_NR) {
#if defined (__SSE__)
dgemm_micro_kernel(
kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
&DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
beta,
&C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
incRowC,
incColC,
nextA,
nextB);

#else
dgemm_micro_kernel(
kc,
alpha,
@@ -343,7 +399,24 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
&C[i * BLOCK_SZ_MR * incRowC + j * BLOCK_SZ_NR * incColC],
incRowC,
incColC);
} else {

#endif
}

else {

#if defined (__SSE__)
dgemm_micro_kernel(kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
&DGEMM_BUFF_B[j * kc * BLOCK_SZ_NR],
0.0,
DGEMM_BUFF_C,
1,
BLOCK_SZ_MR,
nextA,
nextB);
#else
dgemm_micro_kernel(kc,
alpha,
&DGEMM_BUFF_A[i * kc * BLOCK_SZ_MR],
@@ -352,6 +425,8 @@ void gpmp::linalg::DGEMM::dgemm_macro_kernel(int mc,
DGEMM_BUFF_C,
1,
BLOCK_SZ_MR);

#endif
dgescal(
mr,
nr,
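The edge-tile branch above computes a full-size result into the local buffer DGEMM_BUFF_C with beta = 0, then merges only the valid mr x nr corner into C via dgescal and dgeaxpy. The sketch below restates that merge as a single loop; it illustrates the assumed semantics of those two helpers and is not code from the commit.

// Illustration (assumed semantics of dgescal + dgeaxpy combined): merge a
// buffered edge tile into C. The buffer already carries the alpha factor,
// since the micro-kernel above was called with alpha and beta = 0.
void merge_edge_tile(int mr, int nr, double beta,
                     const double *buff, int incRowBuff, int incColBuff,
                     double *C, int incRowC, int incColC) {
    for (int j = 0; j < nr; ++j) {
        for (int i = 0; i < mr; ++i) {
            double c = (beta == 0.0) ? 0.0 : beta * C[i * incRowC + j * incColC];
            C[i * incRowC + j * incColC] =
                c + buff[i * incRowBuff + j * incColBuff];
        }
    }
}

With the call pattern above, incRowBuff would be 1 and incColBuff would be BLOCK_SZ_MR, matching the strides passed for DGEMM_BUFF_C.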
39 changes: 20 additions & 19 deletions modules/linalg/dgemm_kernel_sse.S
@@ -59,21 +59,20 @@ dgemm_kernel_asm:
*/

/***************************************************************************
* CPP FUNC:
* dgemm_kernel_asm(const double *A,
* const double *B,
* double *C,
* const double *nextA,
* const double *nextB,
* long kl,
* long kb,
* long incRowC,
* long incColC,
* double alpha,
* double beta);
* INPUT PARAMETERS:
*
* - kl : %rdi
* - kb : %rsi
* - A : %rdx
* - B : %rcx
* - nextA : %r8
* - nextB : %r9
* <--STACK-->
* - alpha : %rsp + 88
* - beta : %rsp + 64
* - C : %rsp + 56
* - incRowC : %rsp + 48
* - incColC : %rsp + 40
*
* <--ARGUMENT REGISTERS-->
* - A : %rdi
* - B : %rsi
* - C : %rdx
@@ -82,10 +81,10 @@ dgemm_kernel_asm:
* - kl : %r9
* <--STACK-->
* - kb : %rsp + 8
* - alpha : %rsp + 88
* - beta : %rsp + 64
* - incRowC : %rsp + 48
* - incColC : %rsp + 40
* - alpha : %rsp + 80
* - beta : %rsp + 102
* - incRowC : %rsp + 16
* - incColC : %rsp + 24
***************************************************************************/

// Address of A stored in %rax (result register)
@@ -595,7 +594,9 @@ dgemm_kernel_asm:

//movsd 4, %xmm0 // load alpha
//movsd 88(%rsp), %xmm0 // load alpha
movsd 88(%rsp), %xmm0
// THIS WORKS IN NON-OOP ENV
//movsd 88(%rsp), %xmm0
movsd 80(%rsp), %xmm0

//movsd 5, %xmm1 // load beta
//movsd 64(%rsp), %xmm1 // load beta
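To make the argument order behind the updated register/stack map explicit, here is the C++-side signature the comment block documents, restated for illustration. It is not a line from the commit; the actual declaration lives elsewhere in the project (presumably extern "C" so the symbol stays unmangled), and the register notes on the first six parameters follow the System V AMD64 convention the comment uses.

// Restated signature of the assembly kernel (illustration only). The first
// six pointer/integer arguments arrive in registers; the remaining integer
// arguments are passed on the stack, as the comment block above documents.
extern "C" void dgemm_kernel_asm(const double *A,     // %rdi
                                 const double *B,     // %rsi
                                 double *C,           // %rdx
                                 const double *nextA, // %rcx
                                 const double *nextB, // %r8
                                 long kl,             // %r9
                                 long kb,             // stack
                                 long incRowC,        // stack
                                 long incColC,        // stack
                                 double alpha,
                                 double beta);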
2 changes: 1 addition & 1 deletion tests/linalg/t_dgemm_arr.cpp
@@ -18,7 +18,7 @@ using namespace gpmp;

namespace {
TEST(GEMMArrayTest, DGEMMPerformanceComparison) {
int mtx_size = 1024;
int mtx_size = 128;
TEST_COUT << "Matrix size : " << mtx_size << std::endl;
// define input matrices A and B
double *A = new double[mtx_size * mtx_size];
3 changes: 3 additions & 0 deletions tests/linalg/t_igemm_arr.cpp
@@ -15,9 +15,12 @@ const double TOLERANCE = 1e-3;

using namespace gpmp;
#define TEST_COUT std::cerr << "\033[32m[ ] [ INFO ] \033[0m"
#define INFO_COUT \
std::cerr << "\033[32m[ ] [ INFO ] \033[0m\033[1;34m\033[1m"

namespace {
TEST(GEMMArrayTest, IGEMMPerformanceComparison) {
INFO_COUT << "GEMM ROUTINES" << std::endl;
int mtx_size = 1024;
TEST_COUT << "Matrix size : " << mtx_size << std::endl;
// define input matrices A and B
2 changes: 2 additions & 0 deletions tests/linalg/t_matrix.hpp
@@ -66,6 +66,8 @@ bool mtx_verif(const T *A, const T *B, int rows, int cols) {
return true;
}



template <typename T> void print_matrix(const T *matrix, int rows, int cols) {
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
2 changes: 1 addition & 1 deletion tests/linalg/t_vector_vector_f64.cpp
@@ -13,7 +13,7 @@

const double TOLERANCE = 1e-3;

#define TEST_COUT std::cout << "\033[32m[ ] [ INFO ] \033[0m"
#define TEST_COUT std::cerr << "\033[32m[ ] [ INFO ] \033[0m"
#define INFO_COUT \
std::cerr << "\033[32m[ ] [ INFO ] \033[0m\033[1;34m\033[1m"
TEST(VectorVectorTestF64, Addition) {
