diff --git a/experiment/test.S b/experiment/test.S
new file mode 100644
index 000000000..818a78713
--- /dev/null
+++ b/experiment/test.S
@@ -0,0 +1,19 @@
+.globl asm_function
+asm_function:
+    // Assembly code to compute a + b + c - d
+    // Parameters are passed in registers: a in %edi, b in %esi, c in %edx, d in %ecx
+    // The result is returned in %eax
+    // Load a into %eax
+    mov %edi, %eax
+
+    // Add b to %eax
+    add %esi, %eax
+
+    // Add c to %eax
+    add %edx, %eax
+
+    // Subtract d from %eax
+    sub %ecx, %eax
+
+    ret // Return
+
diff --git a/experiment/test.c b/experiment/test.c
new file mode 100644
index 000000000..750550f75
--- /dev/null
+++ b/experiment/test.c
@@ -0,0 +1,21 @@
+#include <stdio.h>
+
+// Declare the assembly function as an external function
+extern int asm_function(int a, int b, int c, int d);
+
+int add(int a, int b, int c) {
+    int d = 2;
+
+    return asm_function(a, b, c, d);
+}
+
+int main() {
+    int a = 5;
+    int b = 7;
+    int c = 10;
+    //int result = asm_function(5, 7);
+    int result = add(a, b, c);
+    printf("Result: %d\n", result);
+    return 0;
+}
+
diff --git a/include/linalg/_dgemm.hpp b/include/linalg/_dgemm.hpp
index 6142f6597..95c962564 100644
--- a/include/linalg/_dgemm.hpp
+++ b/include/linalg/_dgemm.hpp
@@ -50,11 +50,11 @@ namespace linalg {
 class DGEMM {
   public:
     /**< Buffer for storing packed micro panels of A */
-    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K];
+    static double DGEMM_BUFF_A[BLOCK_SZ_M * BLOCK_SZ_K] __attribute__((aligned(16)));
     /**< Buffer for storing packed micro panels of B */
-    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N];
+    static double DGEMM_BUFF_B[BLOCK_SZ_K * BLOCK_SZ_N] __attribute__((aligned(16)));
     /**< Buffer for storing intermediate results */
-    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR];
+    static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR] __attribute__((aligned(16)));
 
     /**
      * @brief Packs micro panels of size BLOCK_SZ_MR rows by k columns from A
diff --git a/modules/linalg/dgemm_asm.h b/modules/linalg/dgemm_asm.h
new file mode 100644
index 000000000..63d847157
--- /dev/null
+++ b/modules/linalg/dgemm_asm.h
@@ -0,0 +1,7 @@
+#ifndef DGEMM_ASM_H
+#define DGEMM_ASM_H
+
+
+
+#endif
+
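The packing buffers in _dgemm.hpp gain __attribute__((aligned(16))) because the SSE2 kernel added below reads them with 16-byte aligned loads (movapd, the instruction behind _mm_load_pd), which fault on unaligned addresses. A minimal sketch of that pairing, assuming only <emmintrin.h>; the buffer and function names here are illustrative and not part of the patch:

    #include <emmintrin.h>

    /* Illustrative only: a 16-byte aligned buffer mirroring the DGEMM_BUFF_*
     * declarations.  _mm_load_pd (movapd) requires this alignment, while
     * _mm_storeu_pd tolerates any address. */
    static double packed[4] __attribute__((aligned(16))) = {1.0, 2.0, 3.0, 4.0};

    static void aligned_load_demo(double out[2]) {
        __m128d lo = _mm_load_pd(packed);       /* loads packed[0], packed[1] */
        __m128d hi = _mm_load_pd(packed + 2);   /* loads packed[2], packed[3] */
        _mm_storeu_pd(out, _mm_add_pd(lo, hi)); /* out = {1+3, 2+4} */
    }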
diff --git a/modules/linalg/dgemm_kernel.S b/modules/linalg/dgemm_kernel.S
new file mode 100644
index 000000000..475ad3c56
--- /dev/null
+++ b/modules/linalg/dgemm_kernel.S
@@ -0,0 +1,617 @@
+/*************************************************************************
+ *
+ * Project
+ *                         _____ _____  __  __ _____
+ *                        / ____|  __ \|  \/  |  __ \
+ *   ___  _ __   ___ _ __ | |  __| |__) | \  / | |__) |
+ *  / _ \| '_ \ / _ \ '_ \| | |_ |  ___/| |\/| |  ___/
+ * | (_) | |_) |  __/ | | | |__| | |    | |  | | |
+ *  \___/| .__/ \___|_| |_|\_____|_|    |_|  |_|_|
+ *       | |
+ *       |_|
+ *
+ * Copyright (C) Akiel Aries, , et al.
+ *
+ * This software is licensed as described in the file LICENSE, which
+ * you should have received as part of this distribution. The terms
+ * among other details are referenced in the official documentation
+ * seen here : https://akielaries.github.io/openGPMP/ along with
+ * important files seen in this project.
+ *
+ * You may opt to use, copy, modify, merge, publish, distribute
+ * and/or sell copies of the Software, and permit persons to whom
+ * the Software is furnished to do so, under the terms of the
+ * LICENSE file. As this is an Open Source effort, all implementations
+ * must be of the same methodology.
+ *
+ *
+ *
+ * This software is distributed on an AS IS basis, WITHOUT
+ * WARRANTY OF ANY KIND, either express or implied.
+ *
+ ************************************************************************/
+
+.globl dgemm_kernel_asm
+
+dgemm_kernel_asm:
+
+    /*
+    // kb (32 bit) stored in %rsi
+    movq 0, %rsi
+    // kl (32 bit) stored in %rdi
+    movq 1, %rdi
+    // Address of A stored in %rax
+    movq 2, %rax
+    // Address of B stored in %rbx
+    movq 3, %rbx
+    // Address of nextA stored in %r9
+    movq 9, %r9
+    // Address of nextB stored in %r10
+    movq 10, %r10
+
+    */
+
+    // Input parameters:
+    // kb (32 bit) stored in %rsi
+    movq %rsi, %rsi     // Move kb into %rsi
+    // kl (32 bit) stored in %rdi
+    movq %rdi, %rdi     // Move kl into %rdi
+    // Address of A stored in %rax
+    movq %rax, %rax     // Move A into %rax
+    // Address of B stored in %rbx
+    movq %rbx, %rbx     // Move B into %rbx
+    // Address of nextA stored in %r9
+    movq %r9, %r9       // Move nextA into %r9
+    // Address of nextB stored in %r10
+    movq %r10, %r10     // Move nextB into %r10
+
+    // Bias A and B by 128 bytes so the unrolled loop below can address the
+    // packed panels with short negative displacements
+    addq $128, %rax
+    addq $128, %rbx
+
+    // Load data into XMM registers
+    // tmp0 = _mm_load_pd(A)
+    movapd -128(%rax), %xmm0
+    // tmp1 = _mm_load_pd(A+2)
+    movapd -112(%rax), %xmm1
+    // tmp2 = _mm_load_pd(B)
+    movapd -128(%rbx), %xmm2
+
+    // Initialize XMM registers
+    // ab_00_11 = _mm_setzero_pd()
+    xorpd %xmm8, %xmm8
+    // ab_20_31 = _mm_setzero_pd()
+    xorpd %xmm9, %xmm9
+    // ab_01_10 = _mm_setzero_pd()
+    xorpd %xmm10, %xmm10
+    // ab_21_30 = _mm_setzero_pd()
+    xorpd %xmm11, %xmm11
+    // ab_02_13 = _mm_setzero_pd()
+    xorpd %xmm12, %xmm12
+    // ab_22_33 = _mm_setzero_pd()
+    xorpd %xmm13, %xmm13
+    // ab_03_12 = _mm_setzero_pd()
+    xorpd %xmm14, %xmm14
+    // ab_23_32 = _mm_setzero_pd()
+    xorpd %xmm15, %xmm15
+
+    // TESTS K1
+    // check if kb==0 to handle remaining kl
+    testq %rsi, %rsi
+
+    // jump to label .DCONSIDERLEFT if kb is zero
+    je .DCONSIDERLEFT
+
+    // Loop label
+    .DLOOP:
+
+    // Prefetch the memory location computed from the base address in %rax.
+    // Prefetching loads data into the cache before it is actually accessed,
+    // which can help hide memory latency.
+    prefetcht0 (4*35+1)*8(%rax)
+
+    // *** 1.
update + + // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + addpd %xmm3, %xmm12 + // tmp3 = _mm_load_pd(B+2) + movaps -112(%rbx), %xmm3 + // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + addpd %xmm6, %xmm13 + // tmp6 = tmp2 + movaps %xmm2, %xmm6 + // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + pshufd $78, %xmm2, %xmm4 + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+4) into tmp2 + movaps -96(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+4) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+4) into tmp0 + movaps -96(%rax), %xmm0 // tmp0 = _mm_load_pd(A+4) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+6) into tmp1 + movaps -80(%rax), %xmm1 // tmp1 = _mm_load_pd(A+6) + + // *** 2. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+6) into tmp3 + movaps -80(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+6) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+8) into tmp2 + movaps -64(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+8) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+8) into tmp0 + movaps -64(%rax), %xmm0 // tmp0 = _mm_load_pd(A+8) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+10) into tmp1 + movaps -48(%rax), %xmm1 // tmp1 = _mm_load_pd(A+10) + + // Prefetch memory location calculated using base address in %rax + prefetcht0 (4*37+1)*8(%rax) + + // *** 3. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+10) into tmp3 + movaps -48(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+10) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+12) into tmp2 + movaps -32(%rbx), %xmm2 // tmp2 = _mm_load_pd(B+12) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+12) into tmp0 + movaps -32(%rax), %xmm0 // tmp0 = _mm_load_pd(A+12) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+14) into tmp1 + movaps -16(%rax), %xmm1 // tmp1 = _mm_load_pd(A+14) + + + // *** 4. 
update + + // Add tmp3 to ab_02_13 + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + + // Load values from memory (B+14) into tmp3 + movaps -16(%rbx), %xmm3 // tmp3 = _mm_load_pd(B+14) + + // Add tmp6 to ab_22_33 + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + + // Move tmp2 to tmp6 + movaps %xmm2, %xmm6 // tmp6 = tmp2 + + // Shuffle tmp2 and store result in tmp4 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp2 by tmp0 + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + // Adjust pointer A + subq $-32*4, %rax // A += 16; + + // Add tmp5 to ab_03_12 + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + + // Add tmp7 to ab_23_32 + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + + // Move tmp4 to tmp7 + movaps %xmm4, %xmm7 // tmp7 = tmp4 + + // Multiply tmp4 by tmp0 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Adjust pointer nextB + subq $-128, %r10 // nextB += 16 + + // Add tmp2 to ab_00_11 + addpd %xmm2, %xmm8 // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + + // Load values from memory (B+16) into tmp2 + movaps (%rbx),%xmm2 // tmp2 = _mm_load_pd(B+16) + + // Add tmp6 to ab_20_31 + addpd %xmm6, %xmm9 // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + + // Move tmp3 to tmp6 + movaps %xmm3, %xmm6 // tmp6 = tmp3 + + // Shuffle tmp3 and store result in tmp5 + pshufd $78,%xmm3, %xmm5 // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1)) + + // Multiply tmp3 by tmp0 + mulpd %xmm0, %xmm3 // tmp3 = _mm_mul_pd(tmp3, tmp0) + + // Multiply tmp6 by tmp1 + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1) + + // Adjust pointer B + subq $-32*4, %rbx // B += 16; + + // Add tmp4 to ab_01_10 + addpd %xmm4, %xmm10 // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + + // Add tmp7 to ab_21_30 + addpd %xmm7, %xmm11 // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + + // Move tmp5 to tmp7 + movaps %xmm5, %xmm7 // tmp7 = tmp5 + + // Multiply tmp5 by tmp0 + mulpd %xmm0, %xmm5 // tmp5 = _mm_mul_pd(tmp5, tmp0) + + // Load values from memory (A+16) into tmp0 + movaps -128(%rax),%xmm0 // tmp0 = _mm_load_pd(A+16) + + // Multiply tmp7 by tmp1 + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + // Load values from memory (A+18) into tmp1 + movaps -112(%rax), %xmm1 // tmp1 = _mm_load_pd(A+18) + + + // prefetch nextB[0] + prefetcht2 0(%r10) + // prefetch nextB[8] + prefetcht2 64(%r10) + + // --l decrement + decq %rsi + // if l>= 1 go back + jne .DLOOP + + // If kl==0, writeback to AB + .DCONSIDERLEFT: + + testq %rdi, %rdi // if kl==0 writeback to AB + je .DPOSTACCUMULATE + + + // Loop for kl remaining iterations + .DLOOPLEFT: // for l = kl,..,1 do + + addpd %xmm3, %xmm12 // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + movapd -112(%rbx),%xmm3 // tmp3 = _mm_load_pd(B+2) + addpd %xmm6, %xmm13 // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + movapd %xmm2, %xmm6 // tmp6 = tmp2 + pshufd $78,%xmm2, %xmm4 // tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1)) + + mulpd %xmm0, %xmm2 // tmp2 = _mm_mul_pd(tmp2, tmp0); + mulpd %xmm1, %xmm6 // tmp6 = _mm_mul_pd(tmp6, tmp1); + + addpd %xmm5, %xmm14 // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + addpd %xmm7, %xmm15 // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + movapd %xmm4, %xmm7 // tmp7 = tmp4 + mulpd %xmm0, %xmm4 // tmp4 = _mm_mul_pd(tmp4, tmp0) + mulpd %xmm1, %xmm7 // tmp7 = _mm_mul_pd(tmp7, tmp1) + + addpd %xmm2, %xmm8 // 
ab_00_11 = _mm_add_pd(ab_00_11, tmp2)
+    movapd -96(%rbx), %xmm2          // tmp2 = _mm_load_pd(B+4)
+    addpd %xmm6, %xmm9               // ab_20_31 = _mm_add_pd(ab_20_31, tmp6)
+    movapd %xmm3, %xmm6              // tmp6 = tmp3
+    pshufd $78,%xmm3, %xmm5          // tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1))
+
+    mulpd %xmm0, %xmm3               // tmp3 = _mm_mul_pd(tmp3, tmp0)
+    mulpd %xmm1, %xmm6               // tmp6 = _mm_mul_pd(tmp6, tmp1)
+
+    addpd %xmm4, %xmm10              // ab_01_10 = _mm_add_pd(ab_01_10, tmp4)
+    addpd %xmm7, %xmm11              // ab_21_30 = _mm_add_pd(ab_21_30, tmp7)
+    movapd %xmm5, %xmm7              // tmp7 = tmp5
+    mulpd %xmm0, %xmm5               // tmp5 = _mm_mul_pd(tmp5, tmp0)
+    movapd -96(%rax), %xmm0          // tmp0 = _mm_load_pd(A+4)
+    mulpd %xmm1, %xmm7               // tmp7 = _mm_mul_pd(tmp7, tmp1)
+    movapd -80(%rax), %xmm1          // tmp1 = _mm_load_pd(A+6)
+
+    addq $32, %rax                   // A += 4;
+    addq $32, %rbx                   // B += 4;
+
+    decq %rdi                        // --l
+    jne .DLOOPLEFT                   // if l>= 1 go back
+
+    .DPOSTACCUMULATE:                // Update remaining ab_*_* registers
+
+    addpd %xmm3, %xmm12              // ab_02_13 = _mm_add_pd(ab_02_13, tmp3)
+    addpd %xmm6, %xmm13              // ab_22_33 = _mm_add_pd(ab_22_33, tmp6)
+
+    addpd %xmm5, %xmm14              // ab_03_12 = _mm_add_pd(ab_03_12, tmp5)
+    addpd %xmm7, %xmm15              // ab_23_32 = _mm_add_pd(ab_23_32, tmp7)
+
+    // Update C <- beta*C + alpha*AB
+
+/*
+    movsd 4, %xmm0                   // load alpha
+    movsd 5, %xmm1                   // load beta
+    movq 6, %rcx                     // Address of C stored in %rcx
+
+    movq 7, %r8                      // load incRowC
+    leaq (,%r8,8), %r8               // incRowC *= sizeof(double)
+    movq 8, %r9                      // load incColC
+    leaq (,%r9,8), %r9               // incColC *= sizeof(double)
+*/
+    // Load alpha
+    movsd %xmm0, %xmm0               // Move alpha into %xmm0
+    // Load beta
+    movsd %xmm1, %xmm1               // Move beta into %xmm1
+    // Address of C stored in %rcx
+    movq %rdx, %rcx                  // Move C into %rcx
+
+    // Load incRowC
+    movq %r8, %r8                    // Move incRowC into %r8
+    leaq (,%r8,8), %r8               // incRowC *= sizeof(double)
+    // Load incColC
+    movq %r9, %r9                    // Move incColC into %r9
+    leaq (,%r9,8), %r9               // incColC *= sizeof(double)
+
+    leaq (%rcx,%r9), %r10            // Store addr of C01 in %r10
+    leaq (%rcx,%r8,2), %rdx          // Store addr of C20 in %rdx
+    leaq (%rdx,%r9), %r11            // Store addr of C21 in %r11
+
+    unpcklpd %xmm0, %xmm0            // duplicate alpha
+    unpcklpd %xmm1, %xmm1            // duplicate beta
+
+    movlpd (%rcx), %xmm3             // load (C00,
+    movhpd (%r10,%r8), %xmm3         //       C11)
+    mulpd %xmm0, %xmm8               // scale ab_00_11 by alpha
+    mulpd %xmm1, %xmm3               // scale (C00, C11) by beta
+    addpd %xmm8, %xmm3               // add results
+
+    movlpd %xmm3, (%rcx)             // write back (C00,
+    movhpd %xmm3, (%r10,%r8)         //             C11)
+
+    movlpd (%rdx), %xmm4             // load (C20,
+    movhpd (%r11,%r8), %xmm4         //       C31)
+    mulpd %xmm0, %xmm9               // scale ab_20_31 by alpha
+    mulpd %xmm1, %xmm4               // scale (C20, C31) by beta
+    addpd %xmm9, %xmm4               // add results
+    movlpd %xmm4, (%rdx)             // write back (C20,
+    movhpd %xmm4, (%r11,%r8)         //             C31)
+
+    movlpd (%r10), %xmm3             // load (C01,
+    movhpd (%rcx,%r8), %xmm3         //       C10)
+    mulpd %xmm0, %xmm10              // scale ab_01_10 by alpha
+    mulpd %xmm1, %xmm3               // scale (C01, C10) by beta
+    addpd %xmm10, %xmm3              // add results
+    movlpd %xmm3, (%r10)             // write back (C01,
+    movhpd %xmm3, (%rcx,%r8)         //             C10)
+
+    movlpd (%r11), %xmm4             // load (C21,
+    movhpd (%rdx,%r8), %xmm4         //       C30)
+    mulpd %xmm0, %xmm11              // scale ab_21_30 by alpha
+    mulpd %xmm1, %xmm4               // scale (C21, C30) by beta
+    addpd %xmm11, %xmm4              // add results
+    movlpd %xmm4, (%r11)             // write back (C21,
+    movhpd %xmm4, (%rdx,%r8)         //             C30)
+
+    leaq (%rcx,%r9,2), %rcx          // Store addr of C02 in %rcx
+    leaq (%r10,%r9,2), %r10          // Store addr of C03 in %r10
+    leaq (%rdx,%r9,2), %rdx          // Store addr of C22 in %rdx
+    leaq (%r11,%r9,2), %r11          // Store addr of C23 in %r11
+
+ movlpd (%rcx), %xmm3 // load (C02, + movhpd (%r10,%r8), %xmm3 // C13) + mulpd %xmm0, %xmm12 // scale ab_02_13 by alpha + mulpd %xmm1, %xmm3 // scale (C02, C13) by beta + addpd %xmm12, %xmm3 // add results + movlpd %xmm3, (%rcx) // write back (C02, + movhpd %xmm3, (%r10,%r8) // C13) + + movlpd (%rdx), %xmm4 // load (C22, + movhpd (%r11,%r8), %xmm4 // C33) + mulpd %xmm0, %xmm13 // scale ab_22_33 by alpha + mulpd %xmm1, %xmm4 // scale (C22, C33) by beta + addpd %xmm13, %xmm4 // add results + movlpd %xmm4, (%rdx) // write back (C22, + movhpd %xmm4, (%r11,%r8) // C33) + + movlpd (%r10), %xmm3 // load (C03, + movhpd (%rcx,%r8), %xmm3 // C12) + mulpd %xmm0, %xmm14 // scale ab_03_12 by alpha + mulpd %xmm1, %xmm3 // scale (C03, C12) by beta + addpd %xmm14, %xmm3 // add results + movlpd %xmm3, (%r10) // write back (C03, + movhpd %xmm3, (%rcx,%r8) // C12) + + movlpd (%r11), %xmm4 // load (C23, + movhpd (%rdx,%r8), %xmm4 // C32) + mulpd %xmm0, %xmm15 // scale ab_23_32 by alpha + mulpd %xmm1, %xmm4 // scale (C23, C32) by beta + addpd %xmm15, %xmm4 // add results + movlpd %xmm4, (%r11) // write back (C23, + movhpd %xmm4, (%rdx,%r8) // C32) + + // return + ret + + diff --git a/modules/linalg/dgemm_nn.c b/modules/linalg/dgemm_nn.c new file mode 100644 index 000000000..fab5c44b5 --- /dev/null +++ b/modules/linalg/dgemm_nn.c @@ -0,0 +1,780 @@ +//#include "../ulmblas.h" +#include +#include +#include +#include +#include "dgemm_asm.h" + +#define MC 384 +#define KC 384 +#define NC 4096 + +#define MR 4 +#define NR 4 + +// +// Local buffers for storing panels from A, B and C +// +static double _A[MC*KC] __attribute__ ((aligned (16))); +static double _B[KC*NC] __attribute__ ((aligned (16))); +static double _C[MR*NR] __attribute__ ((aligned (16))); + +// ASM function: +extern void dgemm_kernel_asm(long kb, long kl, const double *A, + const double *B, const double *nextA, + const double *nextB, double alpha, + double beta, double *C, long incRowC, long incColC); + + +// +// Packing complete panels from A (i.e. 
without padding) +// +static void +pack_MRxk(int k, const double *A, int incRowA, int incColA, + double *buffer) +{ + int i, j; + + for (j=0; j0) { + for (j=0; j0) { + for (i=0; i= 1 go back + " \n\t" + " \n\t" + ".DCONSIDERLEFT%=: \n\t" + "testq %%rdi, %%rdi \n\t" // if kl==0 writeback to AB + "je .DPOSTACCUMULATE%=\n\t" + " \n\t" + ".DLOOPLEFT%=: \n\t" // for l = kl,..,1 do + " \n\t" + "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + "movapd -112(%%rbx),%%xmm3 \n\t" // tmp3 = _mm_load_pd(B+2) + "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + "movapd %%xmm2, %%xmm6 \n\t" // tmp6 = tmp2 + "pshufd $78,%%xmm2, %%xmm4 \n\t" // tmp4 = _mm_shuffle_pd(tmp2, tmp2, + " \n\t" // _MM_SHUFFLE2(0, 1)) + "mulpd %%xmm0, %%xmm2 \n\t" // tmp2 = _mm_mul_pd(tmp2, tmp0); + "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1); + " \n\t" + " \n\t" + "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + "movapd %%xmm4, %%xmm7 \n\t" // tmp7 = tmp4 + "mulpd %%xmm0, %%xmm4 \n\t" // tmp4 = _mm_mul_pd(tmp4, tmp0) + "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) + " \n\t" + " \n\t" + "addpd %%xmm2, %%xmm8 \n\t" // ab_00_11 = _mm_add_pd(ab_00_11, tmp2) + "movapd -96(%%rbx), %%xmm2 \n\t" // tmp2 = _mm_load_pd(B+4) + "addpd %%xmm6, %%xmm9 \n\t" // ab_20_31 = _mm_add_pd(ab_20_31, tmp6) + "movapd %%xmm3, %%xmm6 \n\t" // tmp6 = tmp3 + "pshufd $78,%%xmm3, %%xmm5 \n\t" // tmp5 = _mm_shuffle_pd(tmp3, tmp3, + " \n\t" // _MM_SHUFFLE2(0, 1)) + "mulpd %%xmm0, %%xmm3 \n\t" // tmp3 = _mm_mul_pd(tmp3, tmp0) + "mulpd %%xmm1, %%xmm6 \n\t" // tmp6 = _mm_mul_pd(tmp6, tmp1) + " \n\t" + " \n\t" + "addpd %%xmm4, %%xmm10 \n\t" // ab_01_10 = _mm_add_pd(ab_01_10, tmp4) + "addpd %%xmm7, %%xmm11 \n\t" // ab_21_30 = _mm_add_pd(ab_21_30, tmp7) + "movapd %%xmm5, %%xmm7 \n\t" // tmp7 = tmp5 + "mulpd %%xmm0, %%xmm5 \n\t" // tmp5 = _mm_mul_pd(tmp5, tmp0) + "movapd -96(%%rax), %%xmm0 \n\t" // tmp0 = _mm_load_pd(A+4) + "mulpd %%xmm1, %%xmm7 \n\t" // tmp7 = _mm_mul_pd(tmp7, tmp1) + "movapd -80(%%rax), %%xmm1 \n\t" // tmp1 = _mm_load_pd(A+6) + " \n\t" + " \n\t" + "addq $32, %%rax \n\t" // A += 4; + "addq $32, %%rbx \n\t" // B += 4; + " \n\t" + "decq %%rdi \n\t" // --l + "jne .DLOOPLEFT%= \n\t" // if l>= 1 go back + " \n\t" + ".DPOSTACCUMULATE%=: \n\t" // Update remaining ab_*_* registers + " \n\t" + "addpd %%xmm3, %%xmm12 \n\t" // ab_02_13 = _mm_add_pd(ab_02_13, tmp3) + "addpd %%xmm6, %%xmm13 \n\t" // ab_22_33 = _mm_add_pd(ab_22_33, tmp6) + " \n\t" // + "addpd %%xmm5, %%xmm14 \n\t" // ab_03_12 = _mm_add_pd(ab_03_12, tmp5) + "addpd %%xmm7, %%xmm15 \n\t" // ab_23_32 = _mm_add_pd(ab_23_32, tmp7) + " \n\t" +// +// Update C <- beta*C + alpha*AB +// +// + "movsd %4, %%xmm0 \n\t" // load alpha + "movsd %5, %%xmm1 \n\t" // load beta + "movq %6, %%rcx \n\t" // Address of C stored in %rcx + + "movq %7, %%r8 \n\t" // load incRowC + "leaq (,%%r8,8), %%r8 \n\t" // incRowC *= sizeof(double) + "movq %8, %%r9 \n\t" // load incColC + "leaq (,%%r9,8), %%r9 \n\t" // incRowC *= sizeof(double) + " \n\t" + "leaq (%%rcx,%%r9), %%r10 \n\t" // Store addr of C01 in %r10 + "leaq (%%rcx,%%r8,2), %%rdx \n\t" // Store addr of C20 in %rdx + "leaq (%%rdx,%%r9), %%r11 \n\t" // Store addr of C21 in %r11 + " \n\t" + "unpcklpd %%xmm0, %%xmm0 \n\t" // duplicate alpha + "unpcklpd %%xmm1, %%xmm1 \n\t" // duplicate beta + " \n\t" + " \n\t" + "movlpd (%%rcx), %%xmm3 \n\t" // load (C00, + "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C11) + "mulpd 
%%xmm0, %%xmm8 \n\t" // scale ab_00_11 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C00, C11) by beta + "addpd %%xmm8, %%xmm3 \n\t" // add results + + "movlpd %%xmm3, (%%rcx) \n\t" // write back (C00, + "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C11) + " \n\t" + "movlpd (%%rdx), %%xmm4 \n\t" // load (C20, + "movhpd (%%r11,%%r8), %%xmm4 \n\t" // C31) + "mulpd %%xmm0, %%xmm9 \n\t" // scale ab_20_31 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C20, C31) by beta + "addpd %%xmm9, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%rdx) \n\t" // write back (C20, + "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C31) + " \n\t" + " \n\t" + "movlpd (%%r10), %%xmm3 \n\t" // load (C01, + "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C10) + "mulpd %%xmm0, %%xmm10\n\t" // scale ab_01_10 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C01, C10) by beta + "addpd %%xmm10, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%r10) \n\t" // write back (C01, + "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C10) + " \n\t" + "movlpd (%%r11), %%xmm4 \n\t" // load (C21, + "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C30) + "mulpd %%xmm0, %%xmm11\n\t" // scale ab_21_30 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C21, C30) by beta + "addpd %%xmm11, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%r11) \n\t" // write back (C21, + "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C30) + " \n\t" + " \n\t" + "leaq (%%rcx,%%r9,2), %%rcx \n\t" // Store addr of C02 in %rcx + "leaq (%%r10,%%r9,2), %%r10 \n\t" // Store addr of C03 in %r10 + "leaq (%%rdx,%%r9,2), %%rdx \n\t" // Store addr of C22 in $rdx + "leaq (%%r11,%%r9,2), %%r11 \n\t" // Store addr of C23 in %r11 + " \n\t" + " \n\t" + "movlpd (%%rcx), %%xmm3 \n\t" // load (C02, + "movhpd (%%r10,%%r8), %%xmm3 \n\t" // C13) + "mulpd %%xmm0, %%xmm12\n\t" // scale ab_02_13 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C02, C13) by beta + "addpd %%xmm12, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%rcx) \n\t" // write back (C02, + "movhpd %%xmm3, (%%r10,%%r8) \n\t" // C13) + " \n\t" + "movlpd (%%rdx), %%xmm4 \n\t" // load (C22, + "movhpd (%%r11, %%r8), %%xmm4 \n\t" // C33) + "mulpd %%xmm0, %%xmm13\n\t" // scale ab_22_33 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C22, C33) by beta + "addpd %%xmm13, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%rdx) \n\t" // write back (C22, + "movhpd %%xmm4, (%%r11,%%r8) \n\t" // C33) + " \n\t" + " \n\t" + "movlpd (%%r10), %%xmm3 \n\t" // load (C03, + "movhpd (%%rcx,%%r8), %%xmm3 \n\t" // C12) + "mulpd %%xmm0, %%xmm14\n\t" // scale ab_03_12 by alpha + "mulpd %%xmm1, %%xmm3 \n\t" // scale (C03, C12) by beta + "addpd %%xmm14, %%xmm3 \n\t" // add results + "movlpd %%xmm3, (%%r10) \n\t" // write back (C03, + "movhpd %%xmm3, (%%rcx,%%r8) \n\t" // C12) + " \n\t" + "movlpd (%%r11), %%xmm4 \n\t" // load (C23, + "movhpd (%%rdx,%%r8), %%xmm4 \n\t" // C32) + "mulpd %%xmm0, %%xmm15\n\t" // scale ab_23_32 by alpha + "mulpd %%xmm1, %%xmm4 \n\t" // scale (C23, C32) by beta + "addpd %%xmm15, %%xmm4 \n\t" // add results + "movlpd %%xmm4, (%%r11) \n\t" // write back (C23, + "movhpd %%xmm4, (%%rdx,%%r8) \n\t" // C32) + : // output + : // input + "m" (kb), // 0 + "m" (kl), // 1 + "m" (A), // 2 + "m" (B), // 3 + "m" (alpha), // 4 + "m" (beta), // 5 + "m" (C), // 6 + "m" (incRowC), // 7 + "m" (incColC), // 8 + "m" (nextA), // 9 + "m" (nextB) // 10 + : // register clobber list + "rax", "rbx", "rcx", "rdx", "rsi", "rdi", "r8", "r9", "r10", "r11", + "xmm0", "xmm1", "xmm2", "xmm3", + "xmm4", "xmm5", "xmm6", "xmm7", + "xmm8", "xmm9", "xmm10", "xmm11", + "xmm12", "xmm13", "xmm14", "xmm15" + ); +} 
+*/
+
+//
+// Compute Y += alpha*X
+//
+static void
+dgeaxpy(int m,
+        int n,
+        double alpha,
+        const double *X,
+        int incRowX,
+        int incColX,
+        double *Y,
+        int incRowY,
+        int incColY)
+{
+    int i, j;
+
+
+    if (alpha!=1.0) {
+        for (j=0; j
-#include 
-#include 
-// if source C++ API is being compiled, include these Fortran refs and wrappers
-#elif defined(__GPMP_CPP_API__)
+#if defined(__GPMP_CPP_API__)
 
 extern "C" {
 // Matrix add routine (FLOAT)
diff --git a/modules/linalg/mtx_routines.f90 b/modules/linalg/mtx_routines.f90
index 4cb2eae17..9467405e9 100644
--- a/modules/linalg/mtx_routines.f90
+++ b/modules/linalg/mtx_routines.f90
@@ -86,15 +86,15 @@ SUBROUTINE mtx_mult_routine_int_(A, B, C, rows_a, cols_a, cols_b) bind(C)
 
     INTEGER :: i, j, k
 
-    ! Perform matrix multiplication flipping indexing to keep standard with 
+    ! Perform matrix multiplication flipping indexing to keep standard with
     ! C/C++ calls
     DO i = 1, rows_a
         DO j = 1, cols_b
             C(j, i) = 0
 
             DO k = 1, cols_a
-
-                C(j, i) = C(j, i) + A(k, i) * B(j, k)
+
+                C(j, i) = C(j, i) + A(k, i)*B(j, k)
 
             END DO
diff --git a/scripts/format.sh b/scripts/format.sh
index d9014da10..75d32843d 100755
--- a/scripts/format.sh
+++ b/scripts/format.sh
@@ -2,5 +2,5 @@
 printf "[+] Formatting C, C++, CUDA files...\n"
 cd ../ && find . -regex '.*\.\(c\|cu\|cuh\|cpp\|hpp\|h\|cxx\)' -exec clang-format -style=file -i {} \;
 
-#printf "[+] Formatting FORTRAN files...\n"
-#cd ../ && find . -type f -name "*.f90" -exec fprettify {} --indent 4 \;
+printf "[+] Formatting FORTRAN files...\n"
+cd ../ && find . -type f -name "*.f90" -exec fprettify {} --indent 4 \;
diff --git a/tests/linalg/TEST b/tests/linalg/TEST
new file mode 100755
index 000000000..fb468c683
Binary files /dev/null and b/tests/linalg/TEST differ
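For orientation, the intrinsic-style comments in dgemm_kernel.S describe one rank-1 update of the 4x4 micro-tile per step of the four-times-unrolled loop. A rough C equivalent of a single step, assuming <emmintrin.h> (a sketch for reading the assembly, not the shipped kernel; the names follow the comments, and ab[0..7] stand in for %xmm8-%xmm15):

    #include <emmintrin.h>

    /* A points at a packed 4x1 column of the A panel, B at a packed 1x4 row
     * of the B panel; ab[] holds ab_00_11, ab_20_31, ab_01_10, ab_21_30,
     * ab_02_13, ab_22_33, ab_03_12, ab_23_32 in that order. */
    static void micro_update(const double *A, const double *B, __m128d ab[8]) {
        __m128d tmp0 = _mm_load_pd(A);      /* (a0, a1) */
        __m128d tmp1 = _mm_load_pd(A + 2);  /* (a2, a3) */
        __m128d tmp2 = _mm_load_pd(B);      /* (b0, b1) */
        __m128d tmp3 = _mm_load_pd(B + 2);  /* (b2, b3) */
        /* swapped copies of the B pairs: (b1, b0) and (b3, b2) */
        __m128d tmp4 = _mm_shuffle_pd(tmp2, tmp2, _MM_SHUFFLE2(0, 1));
        __m128d tmp5 = _mm_shuffle_pd(tmp3, tmp3, _MM_SHUFFLE2(0, 1));

        ab[0] = _mm_add_pd(ab[0], _mm_mul_pd(tmp2, tmp0)); /* ab_00_11 += (a0*b0, a1*b1) */
        ab[1] = _mm_add_pd(ab[1], _mm_mul_pd(tmp2, tmp1)); /* ab_20_31 += (a2*b0, a3*b1) */
        ab[2] = _mm_add_pd(ab[2], _mm_mul_pd(tmp4, tmp0)); /* ab_01_10 += (a0*b1, a1*b0) */
        ab[3] = _mm_add_pd(ab[3], _mm_mul_pd(tmp4, tmp1)); /* ab_21_30 += (a2*b1, a3*b0) */
        ab[4] = _mm_add_pd(ab[4], _mm_mul_pd(tmp3, tmp0)); /* ab_02_13 += (a0*b2, a1*b3) */
        ab[5] = _mm_add_pd(ab[5], _mm_mul_pd(tmp3, tmp1)); /* ab_22_33 += (a2*b2, a3*b3) */
        ab[6] = _mm_add_pd(ab[6], _mm_mul_pd(tmp5, tmp0)); /* ab_03_12 += (a0*b3, a1*b2) */
        ab[7] = _mm_add_pd(ab[7], _mm_mul_pd(tmp5, tmp1)); /* ab_23_32 += (a2*b3, a3*b2) */
    }

Each accumulator pairs two entries of the 4x4 tile (ab_00_11 holds the contributions to C00 and C11, and so on), which is why the write-back after .DPOSTACCUMULATE loads C elements in movlpd/movhpd pairs, scales them by beta, and adds alpha times the corresponding accumulator.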