diff --git a/include/linalg/_dgemm.hpp b/include/linalg/_dgemm.hpp
index 67d2155af..4f10342f3 100644
--- a/include/linalg/_dgemm.hpp
+++ b/include/linalg/_dgemm.hpp
@@ -59,39 +59,6 @@ class DGEMM {
     static double DGEMM_BUFF_C[BLOCK_SZ_MR * BLOCK_SZ_NR]
         __attribute__((aligned(16)));
 
-    /**
-     * @brief Performs matrix-matrix multiplication (DGEMM) using an
-     * assembly implementation It computes the product of matrices A and B,
-     * scaled by alpha and beta, and stores the result in matrix C
-     *
-     * @param A Pointer to the first matrix (A) in row-major order
-     * @param B Pointer to the second matrix (B) in row-major order
-     * @param C Pointer to the result matrix (C) in row-major order
-     * @param nextA Pointer to the next matrix A
-     * @param nextB Pointer to the next matrix B
-     * @param kl Value representing the remaining columns of matrix A
-     * @param kb Value representing the remaining rows of matrix B
-     * @param incRowC Increment for moving to the next row of matrix C
-     * @param incColC Increment for moving to the next column of matrix C
-     * @param alpha Scalar value to scale the product of matrices A and B
-     * @param beta Scalar value to scale matrix C before adding the product
-     *
-     * @note This calls an Assembly implementation depending on detected
-     * host system. x86 (SSE, AVX2) and ARM NEON supported
-     */
-    /*void dgemm_kernel_asm(const double *A,
-                          const double *B,
-                          double *C,
-                          const double *nextA,
-                          const double *nextB,
-                          long kl,
-                          long kb,
-                          long incRowC,
-                          long incColC,
-                          double alpha,
-                          double beta);
-*/
-
     /**
      * @brief Packs micro panels of size BLOCK_SZ_MR rows by k columns from A
      * without padding
diff --git a/modules/linalg/dgemm_arr.cpp b/modules/linalg/dgemm_arr.cpp
index ac732c51d..43531acc5 100644
--- a/modules/linalg/dgemm_arr.cpp
+++ b/modules/linalg/dgemm_arr.cpp
@@ -48,6 +48,26 @@ extern "C" {
 #endif
 
 // ASM micro kernel function
+/**
+ * @brief Performs matrix-matrix multiplication (DGEMM) using an
+ * assembly implementation. It computes the product of matrices A and B,
+ * scaled by alpha and beta, and stores the result in matrix C.
+ *
+ * @param A Pointer to the first matrix (A) in row-major order
+ * @param B Pointer to the second matrix (B) in row-major order
+ * @param C Pointer to the result matrix (C) in row-major order
+ * @param nextA Pointer to the next matrix A
+ * @param nextB Pointer to the next matrix B
+ * @param kl Value representing the remaining columns of matrix A
+ * @param kb Value representing the remaining rows of matrix B
+ * @param incRowC Increment for moving to the next row of matrix C
+ * @param incColC Increment for moving to the next column of matrix C
+ * @param alpha Scalar value to scale the product of matrices A and B
+ * @param beta Scalar value to scale matrix C before adding the product
+ *
+ * @note This calls an assembly implementation that depends on the detected
+ * host system. x86 (SSE, AVX2) and ARM NEON are supported.
+ */
 extern void dgemm_kernel_asm(const double *A,
                              const double *B,
                              double *C,
diff --git a/modules/linalg/dgemm_asm.h b/modules/linalg/dgemm_asm.h
deleted file mode 100644
index 02667c3bc..000000000
--- a/modules/linalg/dgemm_asm.h
+++ /dev/null
@@ -1,4 +0,0 @@
-#ifndef DGEMM_ASM_H
-#define DGEMM_ASM_H
-
-#endif
diff --git a/tests/linalg/t_matrix_arr_f64.cpp b/tests/linalg/t_matrix_arr_f64.cpp
index 95f4a74d9..3430f9190 100644
--- a/tests/linalg/t_matrix_arr_f64.cpp
+++ b/tests/linalg/t_matrix_arr_f64.cpp
@@ -18,7 +18,7 @@ const double TOLERANCE = 1e-3;
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 
 namespace {
 
diff --git a/tests/linalg/t_matrix_arr_f90.cpp b/tests/linalg/t_matrix_arr_f90.cpp
index 2da14b370..0cd49396d 100644
--- a/tests/linalg/t_matrix_arr_f90.cpp
+++ b/tests/linalg/t_matrix_arr_f90.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 TEST(FORTRAN90MatrixArrayTestI32, AdditionPerformanceComparison) {
diff --git a/tests/linalg/t_matrix_arr_i16.cpp b/tests/linalg/t_matrix_arr_i16.cpp
index b9178e768..af52cee8a 100644
--- a/tests/linalg/t_matrix_arr_i16.cpp
+++ b/tests/linalg/t_matrix_arr_i16.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 TEST(MatrixArrayTestI16, AdditionComparisonSmall) {
diff --git a/tests/linalg/t_matrix_arr_i32.cpp b/tests/linalg/t_matrix_arr_i32.cpp
index 8d2f726a5..47824e11a 100644
--- a/tests/linalg/t_matrix_arr_i32.cpp
+++ b/tests/linalg/t_matrix_arr_i32.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 TEST(MatrixArrayTestI32, AdditionComparisonSmall) {
diff --git a/tests/linalg/t_matrix_arr_i8.cpp b/tests/linalg/t_matrix_arr_i8.cpp
index 356192fcd..21c5b0c8c 100644
--- a/tests/linalg/t_matrix_arr_i8.cpp
+++ b/tests/linalg/t_matrix_arr_i8.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 TEST(MatrixArrayTestI8, AdditionComparisonSmall) {
diff --git a/tests/linalg/t_matrix_arr_naive.cpp b/tests/linalg/t_matrix_arr_naive.cpp
index a8bf13fc0..002e0a599 100644
--- a/tests/linalg/t_matrix_arr_naive.cpp
+++ b/tests/linalg/t_matrix_arr_naive.cpp
@@ -13,7 +13,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 TEST(MatrixArrayTest, BasicTest) {
     INFO_COUT << "MATRIX (as Arrays) NAIVE" << std::endl;
diff --git a/tests/linalg/t_matrix_vector_f64.cpp b/tests/linalg/t_matrix_vector_f64.cpp
index a0b8f46ad..399f940c0 100644
--- a/tests/linalg/t_matrix_vector_f64.cpp
+++ b/tests/linalg/t_matrix_vector_f64.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 TEST(MatrixVectorTestF64, AdditionComparisonSmall) {
diff --git a/tests/linalg/t_matrix_vector_i32.cpp b/tests/linalg/t_matrix_vector_i32.cpp
index fac655c5d..9a2e49333 100644
--- a/tests/linalg/t_matrix_vector_i32.cpp
+++ b/tests/linalg/t_matrix_vector_i32.cpp
@@ -16,7 +16,7 @@
 using namespace gpmp;
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 namespace {
 
 // test case to compare the results of the intrinsics implementation with the
diff --git a/tests/linalg/t_mtx.f90 b/tests/linalg/t_mtx.f90
index 986ac461a..beacd71e2 100644
--- a/tests/linalg/t_mtx.f90
+++ b/tests/linalg/t_mtx.f90
@@ -62,7 +62,7 @@ subroutine test_mtx_add()
     do i = 1, nrows
         do j = 1, ncols
             if (c(i, j) /= a(i, j) + b(i, j)) then
-                print *, ''//achar(27)//'[31m [!] LINALG MTX ADD (FLOAT) FAILED'//achar(27)//'[0m'
+                print *, ''//achar(27)//'[34m [!] LINALG MTX ADD (FLOAT) FAILED'//achar(27)//'[0m'
                 failed = .true.
                 exit
             end if
@@ -76,7 +76,7 @@ subroutine test_mtx_add()
     do i = 1, nrows
         do j = 1, ncols
             if (c(i, j) /= a(i, j) + b(i, j)) then
-                print *, ''//achar(27)//'[31m [!] LINALG MTX ADD (INT) FAILED'//achar(27)//'[0m'
+                print *, ''//achar(27)//'[34m [!] LINALG MTX ADD (INT) FAILED'//achar(27)//'[0m'
                 failed = .true.
                 exit
             end if
diff --git a/tests/linalg/t_vector_vector_f64.cpp b/tests/linalg/t_vector_vector_f64.cpp
index dd48ddfdd..c4c5a94a2 100644
--- a/tests/linalg/t_vector_vector_f64.cpp
+++ b/tests/linalg/t_vector_vector_f64.cpp
@@ -13,9 +13,9 @@
 
 const double TOLERANCE = 1e-3;
 
-#define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
+#define TEST_COUT std::cout << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 TEST(VectorVectorTestF64, Addition) {
     INFO_COUT << "Vector (as Vectors) FLOAT64" << std::endl;
 
diff --git a/tests/linalg/t_vector_vector_i16.cpp b/tests/linalg/t_vector_vector_i16.cpp
index 6cba052ca..aba3c432f 100644
--- a/tests/linalg/t_vector_vector_i16.cpp
+++ b/tests/linalg/t_vector_vector_i16.cpp
@@ -15,7 +15,7 @@ const double TOLERANCE = 1e-3;
 
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 /*****************************************************************************/
 /** VECTOR<INT32> TESTS */
 
diff --git a/tests/linalg/t_vector_vector_i32.cpp b/tests/linalg/t_vector_vector_i32.cpp
index 4923d5e7e..1519887d4 100644
--- a/tests/linalg/t_vector_vector_i32.cpp
+++ b/tests/linalg/t_vector_vector_i32.cpp
@@ -15,7 +15,7 @@ const double TOLERANCE = 1e-3;
 
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 /*****************************************************************************/
 /** VECTOR<INT32> TESTS */
 
diff --git a/tests/linalg/t_vector_vector_i64.cpp b/tests/linalg/t_vector_vector_i64.cpp
index e5b4f1df3..c30d1e754 100644
--- a/tests/linalg/t_vector_vector_i64.cpp
+++ b/tests/linalg/t_vector_vector_i64.cpp
@@ -15,7 +15,7 @@ const double TOLERANCE = 1e-3;
 
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 /*****************************************************************************/
 /** VECTOR<INT32> TESTS */
 
diff --git a/tests/linalg/t_vector_vector_i8.cpp b/tests/linalg/t_vector_vector_i8.cpp
index 89e01e0e8..618feadf1 100644
--- a/tests/linalg/t_vector_vector_i8.cpp
+++ b/tests/linalg/t_vector_vector_i8.cpp
@@ -15,7 +15,7 @@ const double TOLERANCE = 1e-3;
 
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 /*****************************************************************************/
 /** VECTOR<INT8> TESTS */
 
diff --git a/tests/linalg/t_vector_vector_naive.cpp b/tests/linalg/t_vector_vector_naive.cpp
index c6d9d4ae9..92174662c 100644
--- a/tests/linalg/t_vector_vector_naive.cpp
+++ b/tests/linalg/t_vector_vector_naive.cpp
@@ -15,7 +15,7 @@ const double TOLERANCE = 1e-3;
 
 #define TEST_COUT std::cerr << "\033[32m[          ] [ INFO ] \033[0m"
 #define INFO_COUT                                                              \
-    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;31m\033[1m"
+    std::cerr << "\033[32m[          ] [ INFO ] \033[0m\033[1;34m\033[1m"
 TEST(VectorVectorTestDouble, Addition) {
     INFO_COUT << "Vector (as Vectors) NAIVE" << std::endl;