diff --git a/experiment/test.S b/experiment/test.S
new file mode 100644
index 000000000..818a78713
--- /dev/null
+++ b/experiment/test.S
@@ -0,0 +1,19 @@
+.globl asm_function
+asm_function:
+    // Assembly code to compute a + b + c - d
+    // Parameters are passed in registers: a in %edi, b in %esi, c in %edx, d in %ecx
+    // The result is returned in %eax
+    // Load a into %eax
+    mov %edi, %eax
+
+    // Add b to %eax
+    add %esi, %eax
+
+    // Add c to %eax
+    add %edx, %eax
+
+    // Subtract d from %eax
+    sub %ecx, %eax
+
+    ret // Return
+
diff --git a/experiment/test.c b/experiment/test.c
new file mode 100644
index 000000000..750550f75
--- /dev/null
+++ b/experiment/test.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+
+// Declare the assembly function as an external function
+extern int asm_function(int a, int b, int c, int d);
+
+int add(int a, int b, int c) {
+    int d = 2;
+
+    return asm_function(a, b, c, d);
+}
+
+int main() {
+    int a = 5;
+    int b = 7;
+    int c = 10;
+    //int result = asm_function(5, 7);
+    int result = add(a, b, c);
+    printf("Result: %d\n", result);
+    return 0;
+}
+
diff --git a/include/linalg/_dgemm.hpp b/include/linalg/_dgemm.hpp
index 6142f6597..95c962564 100644
--- a/include/linalg/_dgemm.hpp
+++ b/include/linalg/_dgemm.hpp
@@ -50,11 +50,11 @@ namespace linalg {
 class DGEMM {
   public:
     /**< Buffer for storing packed micro panels of A */
-    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K];
+    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K] __attribute__((aligned(16)));
     /**< Buffer for storing packed micro panels of B */
-    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N];
+    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N] __attribute__((aligned(16)));
     /**< Buffer for storing intermediate results */
-    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR];
+    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR] __attribute__((aligned(16)));
 
     /**
      * @brief Packs micro panels of size BLOCK_SZ_MR rows by k columns from A
diff --git a/modules/linalg/dgemm_asm.h b/modules/linalg/dgemm_asm.h
new file mode 100644
index 000000000..63d847157
--- /dev/null
+++ b/modules/linalg/dgemm_asm.h
@@ -0,0 +1,7 @@
+#ifndef DGEMM_ASM_H
+#define DGEMM_ASM_H
+
+
+
+#endif
+
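The packing buffers in _dgemm.hpp gain __attribute__((aligned(16))) because the SSE2 kernel added below reads them with 16-byte aligned loads (movapd, the instruction behind _mm_load_pd), which fault on unaligned addresses. A minimal sketch of that pairing, assuming only <emmintrin.h>; the buffer and function names here are illustrative and not part of the patch:

    #include <emmintrin.h>

    /* Illustrative only: a 16-byte aligned buffer mirroring the DGEMM_BUFF_*
     * declarations.  _mm_load_pd (movapd) requires this alignment, while
     * _mm_storeu_pd tolerates any address. */
    static double packed[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};

    static void aligned_load_demo(double out[2]) {
        __m128d lo = _mm_load_pd(packed);       /* loads packed[0], packed[1] */
        __m128d hi = _mm_load_pd(packed + 2);   /* loads packed[2], packed[3] */
        _mm_storeu_pd(out, _mm_add_pd(lo, hi)); /* out = {1+3, 2+4} */
    }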
diff --git a/modules/linalg/dgemm_kernel.S b/modules/linalg/dgemm_kernel.S
new file mode 100644
index 000000000..475ad3c56
--- /dev/null
+++ b/modules/linalg/dgemm_kernel.S
@@ -0,0 +1,617 @@
+/*************************************************************************
+ *
+ * Project
+ *                         _____ _____  __  __ _____
+ *                        / ____|  __ \|  \/  |  __ \
+ *   ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
+ *  / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
+ * | (_) | |_) |  __/ | | | |__| | |    | |  | | |
+ *  \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
+ *       | |
+ *       |_|
+ *
+ * Copyright (C) Akiel Aries, , et al.
+ *
+ * This software is licensed as described in the file LICENSE, which
+ * you should have received as part of this distribution. The terms
+ * among other details are referenced in the official documentation
+ * seen here : https://akielaries.github.io/openGPMP/ along with
+ * important files seen in this project.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute
+ * and/or sell copies of the Software, and permit persons to whom
+ * the Software is furnished to do so, under the terms of the
+ * LICENSE file. As this is an Open Source effort, all implementations
+ * must be of the same methodology.
+ *
+ *
+ *
+ * This software is distributed on an AS IS basis, WITHOUT
+ * WARRANTY OF ANY KIND, either express or implied.
+ *
+ ************************************************************************/
+
+.globl dgemm_kernel_asm
+
+dgemm_kernel_asm:
+
+    /*
+    // kb (32 bit) stored in %rsi
+    movq 0, %rsi
+    // kl (32 bit) stored in %rdi
+    movq 1, %rdi
+    // Address of A stored in %rax
+    movq 2, %rax
+    // Address of B stored in %rbx
+    movq 3, %rbx
+    // Address of nextA stored in %r9
+    movq 9, %r9
+    // Address of nextB stored in %r10
+    movq 10, %r10
+
+    */
+
+    // Input parameters:
+    // kb (32 bit) stored in %rsi
+    movq %rsi, %rsi     // Move kb into %rsi
+    // kl (32 bit) stored in %rdi
+    movq %rdi, %rdi     // Move kl into %rdi
+    // Address of A stored in %rax
+    movq %rax, %rax     // Move A into %rax
+    // Address of B stored in %rbx
+    movq %rbx, %rbx     // Move B into %rbx
+    // Address of nextA stored in %r9
+    movq %r9, %r9       // Move nextA into %r9
+    // Address of nextB stored in %r10
+    movq %r10, %r10     // Move nextB into %r10
+
+    // Bias A and B by 128 bytes so the unrolled loop below can address the
+    // packed panels with short negative displacements
+    addq $128, %rax
+    addq $128, %rbx
+
+    // Load data into XMM registers
+    // tmp0 = _mm_load_pd(A)
+    movapd -128(%rax), %xmm0
+    // tmp1 = _mm_load_pd(A+2)
+    movapd -112(%rax), %xmm1
+    // tmp2 = _mm_load_pd(B)
+    movapd -128(%rbx), %xmm2
+
+    // Initialize XMM registers
+    // ab_00_11 = _mm_setzero_pd()
+    xorpd %xmm8, %xmm8
+    // ab_20_31 = _mm_setzero_pd()
+    xorpd %xmm9, %xmm9
+    // ab_01_10 = _mm_setzero_pd()
+    xorpd %xmm10, %xmm10
+    // ab_21_30 = _mm_setzero_pd()
+    xorpd %xmm11, %xmm11
+    // ab_02_13 = _mm_setzero_pd()
+    xorpd %xmm12, %xmm12
+    // ab_22_33 = _mm_setzero_pd()
+    xorpd %xmm13, %xmm13
+    // ab_03_12 = _mm_setzero_pd()
+    xorpd %xmm14, %xmm14
+    // ab_23_32 = _mm_setzero_pd()
+    xorpd %xmm15, %xmm15
+
+    // TESTS K1
+    // check if kb==0 to handle remaining kl
+    testq %rsi, %rsi
+
+    // jump to label .DCONSIDERLEFT if kb is zero
+    je .DCONSIDERLEFT
+
+    // Loop label
+    .DLOOP:
+
+    // Prefetch the memory location computed from the base address in %rax.
+    // Prefetching loads data into the cache before it is actually accessed,
+    // which can help hide memory latency.
+    prefetcht0 (4*35+1)*8(%rax)
+
+    // *** 1.
update + + // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + addpd %xmm3, %xmm12 + // tmp3 = _mm_load_pd(B+2) + movaps -112(%rbx), %xmm3 + // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + addpd %xmm6, %xmm13 + // tmp6 = tmp2 + movaps %xmm2, %xmm6 + // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + pshufd $78, %xmm2, %xmm4 + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+4) into tmp2 + movaps -96(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+4) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+4) into tmp0 + movaps -96(%rax), %xmm0 // tmp0 = _mm_load_pd(A+4) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+6) into tmp1 + movaps -80(%rax), %xmm1 // tmp1 = _mm_load_pd(A+6) + + // *** 2. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+6) into tmp3 + movaps -80(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+6) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+8) into tmp2 + movaps -64(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+8) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+8) into tmp0 + movaps -64(%rax), %xmm0 // tmp0 = _mm_load_pd(A+8) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+10) into tmp1 + movaps -48(%rax), %xmm1 // tmp1 = _mm_load_pd(A+10) + + // Prefetch memory location calculated using base address in %rax + prefetcht0 (4*37+1)*8(%rax) + + // *** 3. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+10) into tmp3 + movaps -48(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+10) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+12) into tmp2 + movaps -32(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+12) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+12) into tmp0 + movaps -32(%rax), %xmm0 // tmp0 = _mm_load_pd(A+12) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+14) into tmp1 + movaps -16(%rax), %xmm1 // tmp1 = _mm_load_pd(A+14) + + + // *** 4. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+14) into tmp3 + movaps -16(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+14) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Adjust pointer A + subq $-32*4, %rax // A += 16; + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Adjust pointer nextB + subq $-128, %r10 // nextB += 16 + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+16) into tmp2 + movaps (%rbx),%xmm2 // tmp2 = _mm_load_pd(B+16) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Adjust pointer B + subq $-32*4, %rbx // B += 16; + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+16) into tmp0 + movaps -128(%rax),%xmm0 // tmp0 = _mm_load_pd(A+16) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+18) into tmp1 + movaps -112(%rax), %xmm1 // tmp1 = _mm_load_pd(A+18) + + + // prefetch nextB[0] + prefetcht2 0(%r10) + // prefetch nextB[8] + prefetcht2 64(%r10) + + // --l decrement + decq %rsi + // if l>= 1 go back + jne .DLOOP + + // If kl==0, writeback to AB + .DCONSIDERLEFT: + + testq %rdi, %rdi // if kl==0 writeback to AB + je .DPOSTACCUMULATE + + + // Loop for kl remaining iterations + .DLOOPLEFT: // for l = kl,..,1 do + + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + movapd -112(%rbx),%xmm3 // tmp3 = _mm_load_pd(B+2) + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + movapd %xmm2, %xmm6 // tmp6 = tmp2 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + movapd %xmm4, %xmm7 // tmp7 = tmp4 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + addpd %xmm2, %xmm8 // 
ab_00_11 = _mm_add_pd(ab_00_11, tmp2)
+    movapd -96(%rbx), %xmm2          // tmp2 = _mm_load_pd(B+4)
+    addpd %xmm6, %xmm9               // ab_20_31 = _mm_add_pd(ab_20_31, tmp6)
+    movapd %xmm3, %xmm6              // tmp6 = tmp3
+    pshufd $78,%xmm3, %xmm5          // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1))
+
+    mulpd %xmm0, %xmm3               // tmp3 = _mm_mul_pd(tmp3, tmp0)
+    mulpd %xmm1, %xmm6               // tmp6 = _mm_mul_pd(tmp6, tmp1)
+
+    addpd %xmm4, %xmm10              // ab_01_10 = _mm_add_pd(ab_01_10, tmp4)
+    addpd %xmm7, %xmm11              // ab_21_30 = _mm_add_pd(ab_21_30, tmp7)
+    movapd %xmm5, %xmm7              // tmp7 = tmp5
+    mulpd %xmm0, %xmm5               // tmp5 = _mm_mul_pd(tmp5, tmp0)
+    movapd -96(%rax), %xmm0          // tmp0 = _mm_load_pd(A+4)
+    mulpd %xmm1, %xmm7               // tmp7 = _mm_mul_pd(tmp7, tmp1)
+    movapd -80(%rax), %xmm1          // tmp1 = _mm_load_pd(A+6)
+
+    addq $32, %rax                   // A += 4;
+    addq $32, %rbx                   // B += 4;
+
+    decq %rdi                        // --l
+    jne .DLOOPLEFT                   // if l>= 1 go back
+
+    .DPOSTACCUMULATE:                // Update remaining ab_*_* registers
+
+    addpd %xmm3, %xmm12              // ab_02_13 = _mm_add_pd(ab_02_13, tmp3)
+    addpd %xmm6, %xmm13              // ab_22_33 = _mm_add_pd(ab_22_33, tmp6)
+
+    addpd %xmm5, %xmm14              // ab_03_12 = _mm_add_pd(ab_03_12, tmp5)
+    addpd %xmm7, %xmm15              // ab_23_32 = _mm_add_pd(ab_23_32, tmp7)
+
+    // Update C <- beta*C + alpha*AB
+
+/*
+    movsd 4, %xmm0                   // load alpha
+    movsd 5, %xmm1                   // load beta
+    movq 6, %rcx                     // Address of C stored in %rcx
+
+    movq 7, %r8                      // load incRowC
+    leaq (,%r8,8), %r8               // incRowC *= sizeof(double)
+    movq 8, %r9                      // load incColC
+    leaq (,%r9,8), %r9               // incColC *= sizeof(double)
+*/
+    // Load alpha
+    movsd %xmm0, %xmm0               // Move alpha into %xmm0
+    // Load beta
+    movsd %xmm1, %xmm1               // Move beta into %xmm1
+    // Address of C stored in %rcx
+    movq %rdx, %rcx                  // Move C into %rcx
+
+    // Load incRowC
+    movq %r8, %r8                    // Move incRowC into %r8
+    leaq (,%r8,8), %r8               // incRowC *= sizeof(double)
+    // Load incColC
+    movq %r9, %r9                    // Move incColC into %r9
+    leaq (,%r9,8), %r9               // incColC *= sizeof(double)
+
+    leaq (%rcx,%r9), %r10            // Store addr of C01 in %r10
+    leaq (%rcx,%r8,2), %rdx          // Store addr of C20 in %rdx
+    leaq (%rdx,%r9), %r11            // Store addr of C21 in %r11
+
+    unpcklpd %xmm0, %xmm0            // duplicate alpha
+    unpcklpd %xmm1, %xmm1            // duplicate beta
+
+    movlpd (%rcx), %xmm3             // load (C00,
+    movhpd (%r10,%r8), %xmm3         //       C11)
+    mulpd %xmm0, %xmm8               // scale ab_00_11 by alpha
+    mulpd %xmm1, %xmm3               // scale (C00, C11) by beta
+    addpd %xmm8, %xmm3               // add results
+
+    movlpd %xmm3, (%rcx)             // write back (C00,
+    movhpd %xmm3, (%r10,%r8)         //             C11)
+
+    movlpd (%rdx), %xmm4             // load (C20,
+    movhpd (%r11,%r8), %xmm4         //       C31)
+    mulpd %xmm0, %xmm9               // scale ab_20_31 by alpha
+    mulpd %xmm1, %xmm4               // scale (C20, C31) by beta
+    addpd %xmm9, %xmm4               // add results
+    movlpd %xmm4, (%rdx)             // write back (C20,
+    movhpd %xmm4, (%r11,%r8)         //             C31)
+
+    movlpd (%r10), %xmm3             // load (C01,
+    movhpd (%rcx,%r8), %xmm3         //       C10)
+    mulpd %xmm0, %xmm10              // scale ab_01_10 by alpha
+    mulpd %xmm1, %xmm3               // scale (C01, C10) by beta
+    addpd %xmm10, %xmm3              // add results
+    movlpd %xmm3, (%r10)             // write back (C01,
+    movhpd %xmm3, (%rcx,%r8)         //             C10)
+
+    movlpd (%r11), %xmm4             // load (C21,
+    movhpd (%rdx,%r8), %xmm4         //       C30)
+    mulpd %xmm0, %xmm11              // scale ab_21_30 by alpha
+    mulpd %xmm1, %xmm4               // scale (C21, C30) by beta
+    addpd %xmm11, %xmm4              // add results
+    movlpd %xmm4, (%r11)             // write back (C21,
+    movhpd %xmm4, (%rdx,%r8)         //             C30)
+
+    leaq (%rcx,%r9,2), %rcx          // Store addr of C02 in %rcx
+    leaq (%r10,%r9,2), %r10          // Store addr of C03 in %r10
+    leaq (%rdx,%r9,2), %rdx          // Store addr of C22 in %rdx
+    leaq (%r11,%r9,2), %r11          // Store addr of C23 in %r11
+
+ movlpd (%rcx), %xmm3 // load (C02, + movhpd (%r10,%r8), %xmm3 // C13) + mulpd %xmm0, %xmm12 // scale ab_02_13 by alpha + mulpd %xmm1, %xmm3 // scale (C02, C13) by beta + addpd %xmm12, %xmm3 // add results + movlpd %xmm3, (%rcx) // write back (C02, + movhpd %xmm3, (%r10,%r8) // C13) + + movlpd (%rdx), %xmm4 // load (C22, + movhpd (%r11,%r8), %xmm4 // C33) + mulpd %xmm0, %xmm13 // scale ab_22_33 by alpha + mulpd %xmm1, %xmm4 // scale (C22, C33) by beta + addpd %xmm13, %xmm4 // add results + movlpd %xmm4, (%rdx) // write back (C22, + movhpd %xmm4, (%r11,%r8) // C33) + + movlpd (%r10), %xmm3 // load (C03, + movhpd (%rcx,%r8), %xmm3 // C12) + mulpd %xmm0, %xmm14 // scale ab_03_12 by alpha + mulpd %xmm1, %xmm3 // scale (C03, C12) by beta + addpd %xmm14, %xmm3 // add results + movlpd %xmm3, (%r10) // write back (C03, + movhpd %xmm3, (%rcx,%r8) // C12) + + movlpd (%r11), %xmm4 // load (C23, + movhpd (%rdx,%r8), %xmm4 // C32) + mulpd %xmm0, %xmm15 // scale ab_23_32 by alpha + mulpd %xmm1, %xmm4 // scale (C23, C32) by beta + addpd %xmm15, %xmm4 // add results + movlpd %xmm4, (%r11) // write back (C23, + movhpd %xmm4, (%rdx,%r8) // C32) + + // return + ret + + diff --git a/modules/linalg/dgemm_nn.c b/modules/linalg/dgemm_nn.c new file mode 100644 index 000000000..fab5c44b5 --- /dev/null +++ b/modules/linalg/dgemm_nn.c @@ -0,0 +1,780 @@ +//#include "../ulmblas.h" +#include +#include +#include +#include +#include "dgemm_asm.h" + +#define MC 384 +#define KC 384 +#define NC 4096 + +#define MR 4 +#define NR 4 + +// +// Local buffers for storing panels from A, B and C +// +static double _A[MC*KC] __attribute__ ((aligned (16))); +static double _B[KC*NC] __attribute__ ((aligned (16))); +static double _C[MR*NR] __attribute__ ((aligned (16))); + +// ASM function: +extern void dgemm_kernel_asm(long kb, long kl, const double *A, + const double *B, const double *nextA, + const double *nextB, double alpha, + double beta, double *C, long incRowC, long incColC); + + +// +// Packing complete panels from A (i.e. 
without padding) +// +static void +pack_MRxk(int k, const double *A, int incRowA, int incColA, + double *buffer) +{ + int i, j; + + for (j=0; j0) { + for (j=0; j0) { + for (i=0; i= 1 go back + " \n\t" + " \n\t" + ".DCONSIDERLEFT%=: \n\t" + "testq %%rdi, %%rdi \n\t" // if kl==0 writeback to AB + "je .DPOSTACCUMULATE%=\n\t" + " \n\t" + ".DLOOPLEFT%=: \n\t" // for l = kl,..,1 do + " \n\t" + "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + "movapd -112(%%rbx),%%xmm3 \n\t" // tmp3 = _mm_load_pd(B+2) + "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 + "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, + " \n\t" // _MM_SHUFFLE2(0, 1)) + "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); + "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); + " \n\t" + " \n\t" + "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 + "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) + "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) + " \n\t" + " \n\t" + "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + "movapd -96(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+4) + "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 + "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, + " \n\t" // _MM_SHUFFLE2(0, 1)) + "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) + "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) + " \n\t" + " \n\t" + "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 + "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) + "movapd -96(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+4) + "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) + "movapd -80(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+6) + " \n\t" + " \n\t" + "addq $32, %%rax \n\t" // A += 4; + "addq $32, %%rbx \n\t" // B += 4; + " \n\t" + "decq %%rdi \n\t" // --l + "jne .DLOOPLEFT%= \n\t" // if l>= 1 go back + " \n\t" + ".DPOSTACCUMULATE%=: \n\t" // Update remaining ab_*_* registers + " \n\t" + "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + " \n\t" // + "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + " \n\t" +// +// Update C <- beta*C + alpha*AB +// +// + "movsd %4, %%xmm0 \n\t" // load alpha + "movsd %5, %%xmm1 \n\t" // load beta + "movq %6, %%rcx \n\t" // Address of C stored in %rcx + + "movq %7, %%r8 \n\t" // load incRowC + "leaq (,%%r8,8), %%r8 \n\t" // incRowC *= sizeof(double) + "movq %8, %%r9 \n\t" // load incColC + "leaq (,%%r9,8), %%r9 \n\t" // incRowC *= sizeof(double) + " \n\t" + "leaq (%%rcx,%%r9), %%r10 \n\t" // Store addr of C01 in %r10 + "leaq (%%rcx,%%r8,2), %%rdx \n\t" // Store addr of C20 in %rdx + "leaq (%%rdx,%%r9), %%r11 \n\t" // Store addr of C21 in %r11 + " \n\t" + "unpcklpd %%xmm0, %%xmm0 \n\t" // duplicate alpha + "unpcklpd %%xmm1, %%xmm1 \n\t" // duplicate beta + " \n\t" + " \n\t" + "movlpd (%%rcx), %%xmm3 \n\t" // load (C00, + "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C11) + "mulpd 
%%xmm0, %%xmm8 \n\t" // scale ab_00_11 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C00, C11) by beta + "addpd %%xmm8, %%xmm3 \n\t" // add results + + "movlpd %%xmm3, (%%rcx) \n\t" // write back (C00, + "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C11) + " \n\t" + "movlpd (%%rdx), %%xmm4 \n\t" // load (C20, + "movhpd (%%r11,%%r8), %%xmm4 \n\t" // C31) + "mulpd %%xmm0, %%xmm9 \n\t" // scale ab_20_31 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C20, C31) by beta + "addpd %%xmm9, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%rdx) \n\t" // write back (C20, + "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C31) + " \n\t" + " \n\t" + "movlpd (%%r10), %%xmm3 \n\t" // load (C01, + "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C10) + "mulpd %%xmm0, %%xmm10\n\t" // scale ab_01_10 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C01, C10) by beta + "addpd %%xmm10, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%r10) \n\t" // write back (C01, + "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C10) + " \n\t" + "movlpd (%%r11), %%xmm4 \n\t" // load (C21, + "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C30) + "mulpd %%xmm0, %%xmm11\n\t" // scale ab_21_30 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C21, C30) by beta + "addpd %%xmm11, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%r11) \n\t" // write back (C21, + "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C30) + " \n\t" + " \n\t" + "leaq (%%rcx,%%r9,2), %%rcx \n\t" // Store addr of C02 in %rcx + "leaq (%%r10,%%r9,2), %%r10 \n\t" // Store addr of C03 in %r10 + "leaq (%%rdx,%%r9,2), %%rdx \n\t" // Store addr of C22 in $rdx + "leaq (%%r11,%%r9,2), %%r11 \n\t" // Store addr of C23 in %r11 + " \n\t" + " \n\t" + "movlpd (%%rcx), %%xmm3 \n\t" // load (C02, + "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C13) + "mulpd %%xmm0, %%xmm12\n\t" // scale ab_02_13 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C02, C13) by beta + "addpd %%xmm12, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%rcx) \n\t" // write back (C02, + "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C13) + " \n\t" + "movlpd (%%rdx), %%xmm4 \n\t" // load (C22, + "movhpd (%%r11, %%r8), %%xmm4 \n\t" // C33) + "mulpd %%xmm0, %%xmm13\n\t" // scale ab_22_33 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C22, C33) by beta + "addpd %%xmm13, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%rdx) \n\t" // write back (C22, + "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C33) + " \n\t" + " \n\t" + "movlpd (%%r10), %%xmm3 \n\t" // load (C03, + "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C12) + "mulpd %%xmm0, %%xmm14\n\t" // scale ab_03_12 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C03, C12) by beta + "addpd %%xmm14, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%r10) \n\t" // write back (C03, + "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C12) + " \n\t" + "movlpd (%%r11), %%xmm4 \n\t" // load (C23, + "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C32) + "mulpd %%xmm0, %%xmm15\n\t" // scale ab_23_32 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C23, C32) by beta + "addpd %%xmm15, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%r11) \n\t" // write back (C23, + "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C32) + : // output + : // input + "m" (kb), // 0 + "m" (kl), // 1 + "m" (A), // 2 + "m" (B), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (C), // 6 + "m" (incRowC), // 7 + "m" (incColC), // 8 + "m" (nextA), // 9 + "m" (nextB) // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15" + ); +} 
+*/
+
+//
+// Compute Y += alpha*X
+//
+static void
+dgeaxpy(int m,
+        int n,
+        double alpha,
+        const double *X,
+        int incRowX,
+        int incColX,
+        double *Y,
+        int incRowY,
+        int incColY)
+{
+    int i, j;
+
+
+    if (alpha!=1.0) {
+        for (j=0; j
-#include 
-#include 
-// if source C++ API is being compiled, include these Fortran refs and wrappers
-#elif defined(__GPMP_CPP_API__)
+#if defined(__GPMP_CPP_API__)
 
 extern "C" {
 // Matrix add routine (FLOAT)
diff --git a/modules/linalg/mtx_routines.f90 b/modules/linalg/mtx_routines.f90
index 4cb2eae17..9467405e9 100644
--- a/modules/linalg/mtx_routines.f90
+++ b/modules/linalg/mtx_routines.f90
@@ -86,15 +86,15 @@ SUBROUTINE mtx_mult_routine_int_(A, B, C, rows_a, cols_a, cols_b) bind(C)
 
     INTEGER :: i, j, k
 
-    ! Perform matrix multiplication flipping indexing to keep standard with 
+    ! Perform matrix multiplication flipping indexing to keep standard with
     ! C/C++ calls
     DO i = 1, rows_a
         DO j = 1, cols_b
             C(j, i) = 0
 
             DO k = 1, cols_a
-
-                C(j, i) = C(j, i) + A(k, i) * B(j, k)
+
+                C(j, i) = C(j, i) + A(k, i)*B(j, k)
 
             END DO
diff --git a/scripts/format.sh b/scripts/format.sh
index d9014da10..75d32843d 100755
--- a/scripts/format.sh
+++ b/scripts/format.sh
@@ -2,5 +2,5 @@
 printf "[+] Formatting C, C++, CUDA files...\n"
 cd ../ && find . -regex '.*\.\(c\|cu\|cuh\|cpp\|hpp\|h\|cxx\)' -exec clang-format -style=file -i {} \;
 
-#printf "[+] Formatting FORTRAN files...\n"
-#cd ../ && find . -type f -name "*.f90" -exec fprettify {} --indent 4 \;
+printf "[+] Formatting FORTRAN files...\n"
+cd ../ && find . -type f -name "*.f90" -exec fprettify {} --indent 4 \;
diff --git a/tests/linalg/TEST b/tests/linalg/TEST
new file mode 100755
index 000000000..fb468c683
Binary files /dev/null and b/tests/linalg/TEST differ
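For orientation, the intrinsic-style comments in dgemm_kernel.S describe one rank-1 update of the 4x4 micro-tile per step of the four-times-unrolled loop. A rough C equivalent of a single step, assuming <emmintrin.h> (a sketch for reading the assembly, not the shipped kernel; the names follow the comments, and ab[0..7] stand in for %xmm8-%xmm15):

    #include <emmintrin.h>

    /* A points at a packed 4x1 column of the A panel, B at a packed 1x4 row
     * of the B panel; ab[] holds ab_00_11, ab_20_31, ab_01_10, ab_21_30,
     * ab_02_13, ab_22_33, ab_03_12, ab_23_32 in that order. */
    static void micro_update(const double *A, const double *B, __m128d ab[8]) {
        __m128d tmp0 = _mm_load_pd(A);      /* (a0, a1) */
        __m128d tmp1 = _mm_load_pd(A + 2);  /* (a2, a3) */
        __m128d tmp2 = _mm_load_pd(B);      /* (b0, b1) */
        __m128d tmp3 = _mm_load_pd(B + 2);  /* (b2, b3) */
        /* swapped copies of the B pairs: (b1, b0) and (b3, b2) */
        __m128d tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1));
        __m128d tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1));

        ab[0] = _mm_add_pd(ab[0], _mm_mul_pd(tmp2, tmp0)); /* ab_00_11 += (a0*b0, a1*b1) */
        ab[1] = _mm_add_pd(ab[1], _mm_mul_pd(tmp2, tmp1)); /* ab_20_31 += (a2*b0, a3*b1) */
        ab[2] = _mm_add_pd(ab[2], _mm_mul_pd(tmp4, tmp0)); /* ab_01_10 += (a0*b1, a1*b0) */
        ab[3] = _mm_add_pd(ab[3], _mm_mul_pd(tmp4, tmp1)); /* ab_21_30 += (a2*b1, a3*b0) */
        ab[4] = _mm_add_pd(ab[4], _mm_mul_pd(tmp3, tmp0)); /* ab_02_13 += (a0*b2, a1*b3) */
        ab[5] = _mm_add_pd(ab[5], _mm_mul_pd(tmp3, tmp1)); /* ab_22_33 += (a2*b2, a3*b3) */
        ab[6] = _mm_add_pd(ab[6], _mm_mul_pd(tmp5, tmp0)); /* ab_03_12 += (a0*b3, a1*b2) */
        ab[7] = _mm_add_pd(ab[7], _mm_mul_pd(tmp5, tmp1)); /* ab_23_32 += (a2*b3, a3*b2) */
    }

Each accumulator pairs two entries of the 4x4 tile (ab_00_11 holds the contributions to C00 and C11, and so on), which is why the write-back after .DPOSTACCUMULATE loads C elements in movlpd/movhpd pairs, scales them by beta, and adds alpha times the corresponding accumulator.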