Skip to content

Commit

Permalink
#EDITS: updates to linalg module and file separation
Browse files Browse the repository at this point in the history
  • Loading branch information
akielaries committed Feb 18, 2024
1 parent b8cedfb commit 4e1e46c
Show file tree
Hide file tree
Showing 12 changed files with 738 additions and 52 deletions.
125 changes: 124 additions & 1 deletion include/linalg/mtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,84 @@ class Mtx {
const std::vector<std::vector<float>> &B,
std::vector<std::vector<float>> &C);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of 8 bit ints
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const int8_t *A, const int8_t *B, int8_t *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of 16 bit ints
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const int16_t *A, const int16_t *B, int16_t *C, int rows, int cols);
/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type int
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_sub(const int *A, const int *B, int *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type double
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const double *A, const double *B, double *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type float
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_sub(const float *A, const float *B, float *C, int rows, int cols);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of 8 bit ints
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note presumably shares the minimum-size restriction of the sibling
 * mtx_sub overloads above — TODO confirm against the implementation
 * @overload
 */
void mtx_mult(const int8_t *A,
              const int8_t *B,
              int8_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of 16 bit ints
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @overload
 */
void mtx_mult(const int16_t *A,
              const int16_t *B,
              int16_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of type int
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @overload
 */
void mtx_mult(const int *A,
              const int *B,
              int *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* vectors of type int
Expand Down Expand Up @@ -373,7 +451,52 @@ class Mtx {
* @overload
*/
template <typename T>
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols) {
    // A, B, and C are flat row-major buffers of identical rows x cols
    // extent; element order of a single flat loop matches the nested
    // row/column traversal exactly
    const int total = rows * cols;
    for (int idx = 0; idx < total; ++idx) {
        // element-wise sum: C = A + B
        C[idx] = A[idx] + B[idx];
    }
}
/**
* @brief Perform matrix subtraction on two matrices as flat arrays
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @param rows Number of rows
* @param cols Number of columns
* @overload
*/
template <typename T>
void std_mtx_sub(const T *A, const T *B, T *C, int rows, int cols) {
    // MTX A AND B MUST BE SAME SIZE
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            // perform matrix subtraction (comment previously said
            // "addition" — copy-paste error; the code subtracts)
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief Naive matrix multiplication on flat row-major arrays, C = A * B
 * @param A Input matrix A (rowsA x colsA)
 * @param B Input matrix B (colsA x colsB)
 * @param C Output matrix C (rowsA x colsB)
 * @param rowsA Number of rows in A
 * @param colsA Number of columns in A (= rows in B)
 * @param colsB Number of columns in B
 */
template <typename T>
void std_mtx_mult(const T *A,
                  const T *B,
                  T *C,
                  int rowsA,
                  int colsA,
                  int colsB) {
    for (int row = 0; row < rowsA; ++row) {
        for (int col = 0; col < colsB; ++col) {
            // accumulate in the element type, matching the inputs
            T acc{};
            for (int inner = 0; inner < colsA; ++inner) {
                acc += A[row * colsA + inner] * B[inner * colsB + col];
            }
            C[row * colsB + col] = acc;
        }
    }
}

/**
* @brief Perform matrix addition on two matrices as flat vectors
Expand Down
64 changes: 64 additions & 0 deletions modules/linalg/mtx_avx2_arr_i16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,70 @@ void gpmp::linalg::Mtx::mtx_add(const int16_t *A,
}
}

/**
 * @brief AVX2 matrix subtraction on flat arrays of 16 bit ints, C = A - B
 * @param A Input matrix A (rows x cols, row-major)
 * @param B Input matrix B (rows x cols, row-major)
 * @param C Output matrix C (rows x cols, row-major)
 * @param rows Number of rows
 * @param cols Number of columns
 */
void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
                                const int16_t *B,
                                int16_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // vectorized loop: 16 int16 lanes per AVX2 register; the bound
        // guarantees the full 32-byte load/store stays inside the row
        for (; j < cols - 15; j += 16) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // subtraction depends only on A and B; the previous load of
            // C into the result register was dead and has been removed
            __m256i c = _mm256_sub_epi16(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar cleanup for the trailing cols % 16 elements
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of 16 bit ints,
 * C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note products/sums are computed mod 2^16 (mullo/add wrap); the scalar
 * tail truncates its int accumulator identically
 */
void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
                                 const int16_t *B,
                                 int16_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // vectorized loop over full 16-lane chunks only: the previous
        // `j < cols_b` bound loaded and stored 32 bytes past the end of
        // the row whenever cols_b was not a multiple of 16 (OOB write)
        for (; j + 16 <= cols_b; j += 16) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and multiply against 16 columns of B
                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                c);
        }

        // scalar cleanup for the trailing cols_b % 16 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            // truncation to int16_t matches the vector path mod 2^16
            C[i * cols_b + j] = static_cast<int16_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
40 changes: 39 additions & 1 deletion modules/linalg/mtx_avx2_arr_i32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,50 @@ void gpmp::linalg::Mtx::mtx_add(const int *A,
C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
}
}
} else {
}

else {
// use standard matrix addition
std_mtx_add(A, B, C, rows, cols);
}
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of type int, C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note products/sums wrap mod 2^32 (mullo/add), matching the scalar tail
 */
void gpmp::linalg::Mtx::mtx_mult(const int *A,
                                 const int *B,
                                 int *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // vectorized loop over full 8-lane chunks only: the previous
        // `j < cols_b` bound loaded and stored 32 bytes past the end of
        // the row whenever cols_b was not a multiple of 8 (OOB write)
        for (; j + 8 <= cols_b; j += 8) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and multiply against 8 columns of B
                __m256i a = _mm256_set1_epi32(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi32(a, b);
                c = _mm256_add_epi32(c, prod);
            }

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                c);
        }

        // scalar cleanup for the trailing cols_b % 8 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = sum;
        }
    }
}

#endif

// x86
Expand Down
70 changes: 70 additions & 0 deletions modules/linalg/mtx_avx2_arr_i8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

Expand Down Expand Up @@ -85,6 +86,75 @@ void gpmp::linalg::Mtx::mtx_add(const int8_t *A,
}
}

/**
 * @brief AVX2 matrix subtraction on flat arrays of 8 bit ints, C = A - B
 * @param A Input matrix A (rows x cols, row-major)
 * @param B Input matrix B (rows x cols, row-major)
 * @param C Output matrix C (rows x cols, row-major)
 * @param rows Number of rows
 * @param cols Number of columns
 */
void gpmp::linalg::Mtx::mtx_sub(const int8_t *A,
                                const int8_t *B,
                                int8_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // vectorized loop: 32 int8 lanes per AVX2 register; the bound
        // guarantees the full 32-byte load/store stays inside the row
        for (; j < cols - 31; j += 32) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // subtraction depends only on A and B; the previous load of
            // C into the result register was dead and has been removed
            __m256i c = _mm256_sub_epi8(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar cleanup for the trailing cols % 32 elements
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of 8 bit ints,
 * C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note results are the true dot products truncated mod 2^8, identical
 * in the vector and scalar paths. The previous implementation was
 * incorrect: _mm256_maddubs_epi16 treats the first operand as UNSIGNED
 * and saturates pairwise sums, _mm256_packs_epi16 is a saturating,
 * lane-crossing pack, and the `j < cols_b; j += 32` bound stored past
 * the end of the row whenever cols_b was not a multiple of 32.
 */
void gpmp::linalg::Mtx::mtx_mult(const int8_t *A,
                                 const int8_t *B,
                                 int8_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    // mask used to truncate each 16-bit accumulator to its low byte
    const __m128i lowbyte = _mm_set1_epi16(0x00FF);

    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // 16 output columns per iteration: 16 int16 accumulators.
        // int8 * int8 fits exactly in int16, and 16-bit wraparound
        // followed by truncation to 8 bits equals the sum mod 2^8.
        for (; j + 16 <= cols_b; j += 16) {
            __m256i acc = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] sign-extended to 16 bits
                __m256i a = _mm256_set1_epi16(
                    static_cast<int16_t>(A[i * cols_a + k]));
                // load 16 int8 from row k of B and sign-extend to int16
                __m128i b8 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i *>(&B[k * cols_b + j]));
                __m256i b = _mm256_cvtepi8_epi16(b8);

                acc = _mm256_add_epi16(acc, _mm256_mullo_epi16(a, b));
            }

            // truncate the 16 accumulators to their low bytes and pack
            // back to 16 int8 in order (packus cannot saturate after the
            // AND because every word is already in 0..255)
            __m128i lo = _mm256_castsi256_si128(acc);
            __m128i hi = _mm256_extracti128_si256(acc, 1);
            __m128i packed = _mm_packus_epi16(_mm_and_si128(lo, lowbyte),
                                              _mm_and_si128(hi, lowbyte));
            _mm_storeu_si128(reinterpret_cast<__m128i *>(&C[i * cols_b + j]),
                             packed);
        }

        // scalar cleanup for the trailing cols_b % 16 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = static_cast<int8_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
47 changes: 0 additions & 47 deletions modules/linalg/mtx_naive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,53 +42,6 @@
* Standard/Naive Matrix Operations on Arrays
*
************************************************************************/
// naive matrix addition algorithm on arrays
template <typename T>
void gpmp::linalg::Mtx::std_mtx_add(const T *A,
const T *B,
T *C,
int rows,
int cols) {
// MTX A AND B MUST BE SAME SIZE
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
// perform matrix addition
C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
}
}
}

// instantiations for types accepted by templated std_mtx_add function for
// flat arrays
template void gpmp::linalg::Mtx::std_mtx_add(const int8_t *A,
const int8_t *B,
int8_t *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const int16_t *A,
const int16_t *B,
int16_t *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const int *A,
const int *B,
int *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const double *A,
const double *B,
double *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const float *A,
const float *B,
float *C,
int rows,
int cols);

/************************************************************************
*
Expand Down
Loading

0 comments on commit 4e1e46c

Please sign in to comment.