Skip to content

Commit

Permalink
#EDITS: updates to linalg module and file separation
Browse files Browse the repository at this point in the history
  • Loading branch information
akielaries committed Feb 18, 2024
1 parent b8cedfb commit 4e1e46c
Show file tree
Hide file tree
Showing 12 changed files with 738 additions and 52 deletions.
125 changes: 124 additions & 1 deletion include/linalg/mtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,84 @@ class Mtx {
const std::vector<std::vector<float>> &B,
std::vector<std::vector<float>> &C);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of 8 bit ints
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const int8_t *A, const int8_t *B, int8_t *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of 16 bit ints
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const int16_t *A, const int16_t *B, int16_t *C, int rows, int cols);
/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type int
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_sub(const int *A, const int *B, int *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type double
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void
mtx_sub(const double *A, const double *B, double *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* flat arrays of type float
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_sub(const float *A, const float *B, float *C, int rows, int cols);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of 8 bit ints
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note presumably shares the minimum-size restriction of the sibling
 * mtx_sub overloads above — TODO confirm against the implementation
 * @overload
 */
void mtx_mult(const int8_t *A,
              const int8_t *B,
              int8_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of 16 bit ints
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @overload
 */
void mtx_mult(const int16_t *A,
              const int16_t *B,
              int16_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Perform matrix multiplication using Intel intrinsics, accepts
 * flat arrays of type int
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @overload
 */
void mtx_mult(const int *A,
              const int *B,
              int *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* vectors of type int
Expand Down Expand Up @@ -373,7 +451,52 @@ class Mtx {
* @overload
*/
template <typename T>
void std_mtx_add(const T *A, const T *B, T *C, int rows, int cols) {
    // A, B, and C are flat row-major buffers of identical rows x cols
    // extent; element order of a single flat loop matches the nested
    // row/column traversal exactly
    const int total = rows * cols;
    for (int idx = 0; idx < total; ++idx) {
        // element-wise sum: C = A + B
        C[idx] = A[idx] + B[idx];
    }
}
/**
* @brief Perform matrix subtraction on two matrices as flat arrays
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @param rows Number of rows
* @param cols Number of columns
* @overload
*/
template <typename T>
void std_mtx_sub(const T *A, const T *B, T *C, int rows, int cols) {
    // MTX A AND B MUST BE SAME SIZE
    for (int i = 0; i < rows; ++i) {
        for (int j = 0; j < cols; ++j) {
            // perform matrix subtraction (comment previously said
            // "addition" — copy-paste error; the code subtracts)
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief Naive matrix multiplication on flat row-major arrays, C = A * B
 * @param A Input matrix A (rowsA x colsA)
 * @param B Input matrix B (colsA x colsB)
 * @param C Output matrix C (rowsA x colsB)
 * @param rowsA Number of rows in A
 * @param colsA Number of columns in A (= rows in B)
 * @param colsB Number of columns in B
 */
template <typename T>
void std_mtx_mult(const T *A,
                  const T *B,
                  T *C,
                  int rowsA,
                  int colsA,
                  int colsB) {
    for (int row = 0; row < rowsA; ++row) {
        for (int col = 0; col < colsB; ++col) {
            // accumulate in the element type, matching the inputs
            T acc{};
            for (int inner = 0; inner < colsA; ++inner) {
                acc += A[row * colsA + inner] * B[inner * colsB + col];
            }
            C[row * colsB + col] = acc;
        }
    }
}

/**
* @brief Perform matrix addition on two matrices as flat vectors
Expand Down
64 changes: 64 additions & 0 deletions modules/linalg/mtx_avx2_arr_i16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,70 @@ void gpmp::linalg::Mtx::mtx_add(const int16_t *A,
}
}

/**
 * @brief AVX2 matrix subtraction on flat arrays of 16 bit ints, C = A - B
 * @param A Input matrix A (rows x cols, row-major)
 * @param B Input matrix B (rows x cols, row-major)
 * @param C Output matrix C (rows x cols, row-major)
 * @param rows Number of rows
 * @param cols Number of columns
 */
void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
                                const int16_t *B,
                                int16_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // vectorized loop: 16 int16 lanes per AVX2 register; the bound
        // guarantees the full 32-byte load/store stays inside the row
        for (; j < cols - 15; j += 16) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // subtraction depends only on A and B; the previous load of
            // C into the result register was dead and has been removed
            __m256i c = _mm256_sub_epi16(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar cleanup for the trailing cols % 16 elements
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of 16 bit ints,
 * C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note products/sums are computed mod 2^16 (mullo/add wrap); the scalar
 * tail truncates its int accumulator identically
 */
void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
                                 const int16_t *B,
                                 int16_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // vectorized loop over full 16-lane chunks only: the previous
        // `j < cols_b` bound loaded and stored 32 bytes past the end of
        // the row whenever cols_b was not a multiple of 16 (OOB write)
        for (; j + 16 <= cols_b; j += 16) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and multiply against 16 columns of B
                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                c);
        }

        // scalar cleanup for the trailing cols_b % 16 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            // truncation to int16_t matches the vector path mod 2^16
            C[i * cols_b + j] = static_cast<int16_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
40 changes: 39 additions & 1 deletion modules/linalg/mtx_avx2_arr_i32.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -89,12 +89,50 @@ void gpmp::linalg::Mtx::mtx_add(const int *A,
C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
}
}
} else {
}

else {
// use standard matrix addition
std_mtx_add(A, B, C, rows, cols);
}
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of type int, C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note products/sums wrap mod 2^32 (mullo/add), matching the scalar tail
 */
void gpmp::linalg::Mtx::mtx_mult(const int *A,
                                 const int *B,
                                 int *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // vectorized loop over full 8-lane chunks only: the previous
        // `j < cols_b` bound loaded and stored 32 bytes past the end of
        // the row whenever cols_b was not a multiple of 8 (OOB write)
        for (; j + 8 <= cols_b; j += 8) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] and multiply against 8 columns of B
                __m256i a = _mm256_set1_epi32(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi32(a, b);
                c = _mm256_add_epi32(c, prod);
            }

            _mm256_storeu_si256(
                reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                c);
        }

        // scalar cleanup for the trailing cols_b % 8 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = sum;
        }
    }
}

#endif

// x86
Expand Down
70 changes: 70 additions & 0 deletions modules/linalg/mtx_avx2_arr_i8.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

Expand Down Expand Up @@ -85,6 +86,75 @@ void gpmp::linalg::Mtx::mtx_add(const int8_t *A,
}
}

/**
 * @brief AVX2 matrix subtraction on flat arrays of 8 bit ints, C = A - B
 * @param A Input matrix A (rows x cols, row-major)
 * @param B Input matrix B (rows x cols, row-major)
 * @param C Output matrix C (rows x cols, row-major)
 * @param rows Number of rows
 * @param cols Number of columns
 */
void gpmp::linalg::Mtx::mtx_sub(const int8_t *A,
                                const int8_t *B,
                                int8_t *C,
                                int rows,
                                int cols) {
    for (int i = 0; i < rows; ++i) {
        int j = 0;
        // vectorized loop: 32 int8 lanes per AVX2 register; the bound
        // guarantees the full 32-byte load/store stays inside the row
        for (; j < cols - 31; j += 32) {
            __m256i a = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&A[i * cols + j]));
            __m256i b = _mm256_loadu_si256(
                reinterpret_cast<const __m256i *>(&B[i * cols + j]));

            // subtraction depends only on A and B; the previous load of
            // C into the result register was dead and has been removed
            __m256i c = _mm256_sub_epi8(a, b);

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols + j]),
                                c);
        }

        // scalar cleanup for the trailing cols % 32 elements
        for (; j < cols; ++j) {
            C[i * cols + j] = A[i * cols + j] - B[i * cols + j];
        }
    }
}

/**
 * @brief AVX2 matrix multiplication on flat arrays of 8 bit ints,
 * C = A * B
 * @param A Input matrix A (rows_a x cols_a, row-major)
 * @param B Input matrix B (cols_a x cols_b, row-major)
 * @param C Output matrix C (rows_a x cols_b, row-major)
 * @param rows_a Number of rows in A
 * @param cols_a Number of columns in A (= rows in B)
 * @param cols_b Number of columns in B
 * @note results are the true dot products truncated mod 2^8, identical
 * in the vector and scalar paths. The previous implementation was
 * incorrect: _mm256_maddubs_epi16 treats the first operand as UNSIGNED
 * and saturates pairwise sums, _mm256_packs_epi16 is a saturating,
 * lane-crossing pack, and the `j < cols_b; j += 32` bound stored past
 * the end of the row whenever cols_b was not a multiple of 32.
 */
void gpmp::linalg::Mtx::mtx_mult(const int8_t *A,
                                 const int8_t *B,
                                 int8_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    // mask used to truncate each 16-bit accumulator to its low byte
    const __m128i lowbyte = _mm_set1_epi16(0x00FF);

    for (int i = 0; i < rows_a; ++i) {
        int j = 0;
        // 16 output columns per iteration: 16 int16 accumulators.
        // int8 * int8 fits exactly in int16, and 16-bit wraparound
        // followed by truncation to 8 bits equals the sum mod 2^8.
        for (; j + 16 <= cols_b; j += 16) {
            __m256i acc = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // broadcast A[i][k] sign-extended to 16 bits
                __m256i a = _mm256_set1_epi16(
                    static_cast<int16_t>(A[i * cols_a + k]));
                // load 16 int8 from row k of B and sign-extend to int16
                __m128i b8 = _mm_loadu_si128(
                    reinterpret_cast<const __m128i *>(&B[k * cols_b + j]));
                __m256i b = _mm256_cvtepi8_epi16(b8);

                acc = _mm256_add_epi16(acc, _mm256_mullo_epi16(a, b));
            }

            // truncate the 16 accumulators to their low bytes and pack
            // back to 16 int8 in order (packus cannot saturate after the
            // AND because every word is already in 0..255)
            __m128i lo = _mm256_castsi256_si128(acc);
            __m128i hi = _mm256_extracti128_si256(acc, 1);
            __m128i packed = _mm_packus_epi16(_mm_and_si128(lo, lowbyte),
                                              _mm_and_si128(hi, lowbyte));
            _mm_storeu_si128(reinterpret_cast<__m128i *>(&C[i * cols_b + j]),
                             packed);
        }

        // scalar cleanup for the trailing cols_b % 16 columns
        for (; j < cols_b; ++j) {
            int sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }
            C[i * cols_b + j] = static_cast<int8_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
47 changes: 0 additions & 47 deletions modules/linalg/mtx_naive.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -42,53 +42,6 @@
* Standard/Naive Matrix Operations on Arrays
*
************************************************************************/
// naive matrix addition algorithm on arrays
template <typename T>
void gpmp::linalg::Mtx::std_mtx_add(const T *A,
const T *B,
T *C,
int rows,
int cols) {
// MTX A AND B MUST BE SAME SIZE
for (int i = 0; i < rows; ++i) {
for (int j = 0; j < cols; ++j) {
// perform matrix addition
C[i * cols + j] = A[i * cols + j] + B[i * cols + j];
}
}
}

// instantiations for types accepted by templated std_mtx_add function for
// flat arrays
template void gpmp::linalg::Mtx::std_mtx_add(const int8_t *A,
const int8_t *B,
int8_t *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const int16_t *A,
const int16_t *B,
int16_t *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const int *A,
const int *B,
int *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const double *A,
const double *B,
double *C,
int rows,
int cols);

template void gpmp::linalg::Mtx::std_mtx_add(const float *A,
const float *B,
float *C,
int rows,
int cols);

/************************************************************************
*
Expand Down
Loading

0 comments on commit 4e1e46c

Please sign in to comment.