Skip to content

Commit

Permalink
#EDITS: updates to linalg module and file separation
Browse files Browse the repository at this point in the history
  • Loading branch information
akielaries committed Feb 18, 2024
1 parent 4e1e46c commit 1e02cdf
Show file tree
Hide file tree
Showing 7 changed files with 462 additions and 57 deletions.
66 changes: 58 additions & 8 deletions experiment/blas2.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,64 @@
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <vector>

// Dimension of the square N x N test matrices. NOTE: the diff residue left
// two conflicting definitions (8192 and 1024); the committed value is 1024.
const int matrixSize = 1024;

// Forward declarations of the benchmark drivers defined below.
void run_openblas_mtx_add();
void run_openblas_mtx_mul();

/**
 * @brief Entry point: runs the OpenBLAS matrix-addition benchmark
 * followed by the matrix-multiplication benchmark; each prints its own
 * elapsed time.
 */
int main() {
    run_openblas_mtx_add();
    run_openblas_mtx_mul();

    return 0;
}

/**
 * @brief Benchmark dense double-precision matrix multiplication via
 * OpenBLAS (cblas_dgemm) on matrixSize x matrixSize operands and print
 * the elapsed wall-clock time.
 *
 * Uses std::vector for the three buffers so they are freed automatically
 * even if an exception propagates (the original used raw new[]/delete[],
 * which leaks on any exception between allocation and cleanup).
 */
void run_openblas_mtx_mul() {
    // Widen before multiplying so n*n cannot overflow int for large sizes.
    const std::size_t n = static_cast<std::size_t>(matrixSize);
    std::vector<double> A(n * n);
    std::vector<double> B(n * n);
    std::vector<double> C(n * n, 0.0);

    // Fill A and B with pseudo-random integer values in [0, 100).
    // rand() is unseeded, so the run is deterministic across executions.
    for (std::size_t i = 0; i < n * n; ++i) {
        A[i] = rand() % 100;
        B[i] = rand() % 100;
    }

    // Time only the BLAS call itself, not allocation/initialization.
    auto start_std = std::chrono::high_resolution_clock::now();

    // C = 1.0 * A * B + 0.0 * C; all matrices row-major with leading
    // dimension matrixSize.
    cblas_dgemm(CblasRowMajor,
                CblasNoTrans,
                CblasNoTrans,
                matrixSize,
                matrixSize,
                matrixSize,
                1.0,
                A.data(),
                matrixSize,
                B.data(),
                matrixSize,
                0.0,
                C.data(),
                matrixSize);

    auto end_std = std::chrono::high_resolution_clock::now();

    std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;

    // Report the elapsed time.
    std::cout << "Matrix multiplication using OpenBLAS took "
              << elapsed_seconds_std.count() << " seconds." << std::endl;
}

void run_openblas_mtx_add() {
// Create matrices A, B, and C
double *A = new double[matrixSize * matrixSize];
Expand All @@ -24,19 +73,20 @@ void run_openblas_mtx_add() {
}

// Measure the time for matrix addition using OpenBLAS
auto start_time = std::chrono::steady_clock::now();
// auto start_time = std::chrono::steady_clock::now();
auto start_std = std::chrono::high_resolution_clock::now();

// Use OpenBLAS to add matrices A and B and store the result in matrix C
cblas_daxpy(matrixSize * matrixSize, 1.0, A, 1, C, 1);

auto end_time = std::chrono::steady_clock::now();
// auto end_time = std::chrono::steady_clock::now();
auto end_std = std::chrono::high_resolution_clock::now();

std::chrono::duration<double> elapsed_seconds_std = end_std - start_std;

// Calculate and print the elapsed time
std::cout << "OpenBLAS - Time elapsed: "
<< std::chrono::duration_cast<std::chrono::milliseconds>(
end_time - start_time)
.count()
<< " ms" << std::endl;
std::cout << "TIME-ELAPSED: " << elapsed_seconds_std.count() << "seconds"
<< std::endl;

// Cleanup
delete[] A;
Expand Down
68 changes: 56 additions & 12 deletions include/linalg/mtx.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,27 +244,62 @@ class Mtx {
*/
void mtx_sub(const float *A, const float *B, float *C, int rows, int cols);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 8 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int8_t *A,
const int8_t *B,
int8_t *C,
int rows_a,
int cols_a,
int cols_b);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 16 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int16_t *A,
const int16_t *B,
int16_t *C,
int rows_a,
int cols_a,
int cols_b);

/**
* @brief Perform matrix multiplication using Intel intrinsics, accepts
* flat arrays of 32 bit integers
* @param A Input matrix A
* @param B Input matrix B
* @param C Output matrix C
* @note Matrices must be of at least size 8x8
* @overload
*/
void mtx_mult(const int *A,
const int *B,
int *C,
int rows_a,
int cols_a,
int cols_b);

/**
 * @brief Perform matrix multiplication on flat arrays of 32 bit
 * integers, writing the result into a 64 bit integer output matrix
 * (wider output type than the int-to-int overload above)
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b), int64_t elements
 * @overload
 */
void mtx_mult(const int *A,
              const int *B,
              int64_t *C,
              int rows_a,
              int cols_a,
              int cols_b);

/**
 * @brief Transpose a matrix stored as a flat row-major array of ints
 * @param A Input matrix A (rows x cols)
 * @param C Output matrix C (cols x rows), the transpose of A
 * @param rows Number of rows of A
 * @param cols Number of columns of A
 */
void mtx_tpose(const int *A, int *C, int rows, int cols);

/**
* @brief Perform matrix subtraction using Intel intrinsics, accepts
* vectors of type int
Expand Down Expand Up @@ -480,24 +515,33 @@ class Mtx {
}
}

template <typename T>
/**
 * @brief Naive triple-loop matrix multiplication on flat row-major
 * arrays: C = A * B.
 * @tparam T Element type of the input matrices
 * @tparam U Element type of the output matrix; may be wider than T
 *           (e.g. T = int, U = int64_t) so the accumulation does not
 *           overflow
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b)
 * @param rows_a Number of rows of A
 * @param cols_a Number of columns of A (== rows of B)
 * @param cols_b Number of columns of B
 */
template <typename T, typename U>
void std_mtx_mult(const T *A,
                  const T *B,
                  U *C,
                  int rows_a,
                  int cols_a,
                  int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        for (int j = 0; j < cols_b; ++j) {
            // Accumulate in U, the (possibly wider) output type.
            U sum = 0;
            for (int k = 0; k < cols_a; ++k) {
                // Widen BEFORE multiplying: computing the product in T
                // and only then converting would overflow in T and
                // defeat the purpose of the wide accumulator.
                sum += static_cast<U>(A[i * cols_a + k]) * B[k * cols_b + j];
            }
            C[i * cols_b + j] = sum;
        }
    }
}

/**
 * @brief Transpose a matrix stored as a flat row-major array:
 * At[c][r] = A[r][c].
 * @tparam T Element type of the matrices
 * @param A Input matrix (rows x cols)
 * @param At Output matrix (cols x rows), the transpose of A
 * @param rows Number of rows of A
 * @param cols Number of columns of A
 */
template <typename T>
void std_mtx_tpose(const T *A, T *At, int rows, int cols) {
    for (int r = 0; r < rows; ++r) {
        // Hoist the start of row r of A out of the inner loop.
        const T *row = A + r * cols;
        for (int c = 0; c < cols; ++c) {
            At[c * rows + r] = row[c];
        }
    }
}
/**
* @brief Perform matrix addition on two matrices as flat vectors
* @param A Input matrix A
Expand Down
29 changes: 15 additions & 14 deletions modules/linalg/mtx_avx2_arr_i16.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -115,40 +115,41 @@ void gpmp::linalg::Mtx::mtx_sub(const int16_t *A,
}

/**
 * @brief AVX2 matrix multiplication on flat row-major arrays of 16 bit
 * integers: C = A * B, 16 output columns per vector iteration.
 * @param A Input matrix A (rows_a x cols_a)
 * @param B Input matrix B (cols_a x cols_b)
 * @param C Output matrix C (rows_a x cols_b)
 * @note Products and sums are computed modulo 2^16 in the vector path
 *       (_mm256_mullo_epi16 / _mm256_add_epi16); large inputs wrap.
 */
void gpmp::linalg::Mtx::mtx_mult(const int16_t *A,
                                 const int16_t *B,
                                 int16_t *C,
                                 int rows_a,
                                 int cols_a,
                                 int cols_b) {
    for (int i = 0; i < rows_a; ++i) {
        // Vectorized portion: only full 16-element chunks. The original
        // looped while (j < cols_b), so the unaligned 256-bit loads and
        // stores ran past the end of the row whenever cols_b was not a
        // multiple of 16; bounding with j + 16 <= cols_b fixes that.
        int j = 0;
        for (; j + 16 <= cols_b; j += 16) {
            __m256i c = _mm256_setzero_si256();

            for (int k = 0; k < cols_a; ++k) {
                // Broadcast A[i][k], multiply element-wise against 16
                // consecutive entries of row k of B, and accumulate.
                __m256i a = _mm256_set1_epi16(A[i * cols_a + k]);
                __m256i b = _mm256_loadu_si256(
                    reinterpret_cast<const __m256i *>(&B[k * cols_b + j]));

                __m256i prod = _mm256_mullo_epi16(a, b);
                c = _mm256_add_epi16(c, prod);
            }

            _mm256_storeu_si256(reinterpret_cast<__m256i *>(&C[i * cols_b + j]),
                                c);
        }

        // Scalar tail: the remaining cols_b % 16 columns of row i.
        for (; j < cols_b; ++j) {
            int sum = 0;

            for (int k = 0; k < cols_a; ++k) {
                sum += A[i * cols_a + k] * B[k * cols_b + j];
            }

            // Narrowing store matches the vector path's mod-2^16 result.
            C[i * cols_b + j] = static_cast<int16_t>(sum);
        }
    }
}


#endif

// x86
Expand Down
Loading

0 comments on commit 1e02cdf

Please sign in to comment.