Skip to content

Commit

Permalink
#EDITS: updating naming for vector functions
Browse files Browse the repository at this point in the history
  • Loading branch information
akielaries committed Feb 11, 2024
1 parent bc3eeb3 commit f2a5080
Show file tree
Hide file tree
Showing 2 changed files with 297 additions and 71 deletions.
226 changes: 226 additions & 0 deletions modules/linalg/vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -318,6 +318,232 @@ double gpmp::linalg::dot_product(const std::vector<double> &vec1,
// x86
#endif

/************************************************************************
*
* Vector Operations for ARM NEON CPUs
*
************************************************************************/
#if defined(__ARM_ARCH_ISA_A64) || defined(__ARM_NEON) || \
defined(__ARM_ARCH) || defined(__aarch64__)

// ARM intrinsic function header
#include <arm_neon.h>
/************************************************************************
*
* Vector Operations on Vectors
*
************************************************************************/

// Element-wise addition of two integer vectors using ARM NEON intrinsics.
//
// Computes result[i] = vec1[i] + vec2[i] for every index of vec1.
//
// vec1   : first operand; its size determines how many elements are processed
// vec2   : second operand, read at the same indices as vec1
// result : destination, written at the same indices as vec1
//
// No bounds checks are performed: vec2 and result must already hold at least
// vec1.size() elements.
void vector_add(const std::vector<int> &vec1,
                const std::vector<int> &vec2,
                std::vector<int> &result) {
    const size_t size = vec1.size();
    const int *data1 = vec1.data();
    const int *data2 = vec2.data();
    int *result_data = result.data();

    // Largest multiple of 4 not exceeding size; everything below this index
    // is processed four 32-bit lanes at a time, so sizes that are not a
    // multiple of 4 still get the vectorized path for the bulk of the data.
    const size_t simd_end = size - (size % 4);

    size_t i = 0;
    for (; i < simd_end; i += 4) {
        // Load 4 elements from vec1 and vec2
        const int32x4_t a = vld1q_s32(data1 + i);
        const int32x4_t b = vld1q_s32(data2 + i);

        // Perform vectorized addition and store the 4 sums
        vst1q_s32(result_data + i, vaddq_s32(a, b));
    }

    // Scalar tail: at most 3 leftover elements
    for (; i < size; ++i) {
        result_data[i] = data1[i] + data2[i];
    }
}

// Element-wise addition of two double vectors using ARM NEON intrinsics.
//
// Computes result[i] = vec1[i] + vec2[i] for every index of vec1.
//
// vec1   : first operand; its size determines how many elements are processed
// vec2   : second operand, read at the same indices as vec1
// result : destination, written at the same indices as vec1
//
// No bounds checks are performed: vec2 and result must already hold at least
// vec1.size() elements.
void vector_add(const std::vector<double> &vec1,
                const std::vector<double> &vec2,
                std::vector<double> &result) {
    const size_t size = vec1.size();
    const double *data1 = vec1.data();
    const double *data2 = vec2.data();
    double *result_data = result.data();

    // Largest multiple of 2 not exceeding size; an odd-sized input still
    // uses the vectorized path for everything except its final element.
    const size_t simd_end = size - (size % 2);

    size_t i = 0;
    for (; i < simd_end; i += 2) {
        // Load 2 elements from vec1 and vec2
        const float64x2_t a = vld1q_f64(data1 + i);
        const float64x2_t b = vld1q_f64(data2 + i);

        // Perform vectorized addition and store the 2 sums
        vst1q_f64(result_data + i, vaddq_f64(a, b));
    }

    // Scalar tail: at most 1 leftover element
    for (; i < size; ++i) {
        result_data[i] = data1[i] + data2[i];
    }
}

// Element-wise subtraction of two integer vectors using ARM NEON intrinsics.
//
// Writes result[i] = vec1[i] - vec2[i] for every index of vec1. Elements are
// consumed eight at a time (two 4-lane registers per operand); whatever does
// not fill a group of eight is finished with plain scalar subtraction.
//
// No bounds checks are performed: vec2 and result must already hold at least
// vec1.size() elements.
void vector_sub(const std::vector<int> &vec1,
                const std::vector<int> &vec2,
                std::vector<int> &result) {
    const int count = vec1.size();
    // Number of leading elements that form complete groups of 8
    const int blocked = count - (count % 8);

    const int *lhs = vec1.data();
    const int *rhs = vec2.data();
    int *out = result.data();

    int idx = 0;
    while (idx < blocked) {
        // Load both halves of each operand before any store takes place
        const int32x4_t lhs_lo = vld1q_s32(lhs + idx);
        const int32x4_t lhs_hi = vld1q_s32(lhs + idx + 4);
        const int32x4_t rhs_lo = vld1q_s32(rhs + idx);
        const int32x4_t rhs_hi = vld1q_s32(rhs + idx + 4);

        vst1q_s32(out + idx, vsubq_s32(lhs_lo, rhs_lo));
        vst1q_s32(out + idx + 4, vsubq_s32(lhs_hi, rhs_hi));

        idx += 8;
    }

    // Remaining 0..7 elements
    while (idx < count) {
        out[idx] = lhs[idx] - rhs[idx];
        ++idx;
    }
}

// Element-wise subtraction of two double vectors using ARM NEON intrinsics.
//
// Writes result[i] = vec1[i] - vec2[i] for every index of vec1. Elements are
// consumed four at a time (two 2-lane registers per operand); whatever does
// not fill a group of four is finished with plain scalar subtraction.
//
// No bounds checks are performed: vec2 and result must already hold at least
// vec1.size() elements.
void vector_sub(const std::vector<double> &vec1,
                const std::vector<double> &vec2,
                std::vector<double> &result) {
    const int count = vec1.size();
    // Number of leading elements that form complete groups of 4
    const int blocked = count - (count % 4);

    const double *lhs = vec1.data();
    const double *rhs = vec2.data();
    double *out = result.data();

    int idx = 0;
    while (idx < blocked) {
        // Load both halves of each operand before any store takes place
        const float64x2_t lhs_lo = vld1q_f64(lhs + idx);
        const float64x2_t lhs_hi = vld1q_f64(lhs + idx + 2);
        const float64x2_t rhs_lo = vld1q_f64(rhs + idx);
        const float64x2_t rhs_hi = vld1q_f64(rhs + idx + 2);

        vst1q_f64(out + idx, vsubq_f64(lhs_lo, rhs_lo));
        vst1q_f64(out + idx + 2, vsubq_f64(lhs_hi, rhs_hi));

        idx += 4;
    }

    // Remaining 0..3 elements
    while (idx < count) {
        out[idx] = lhs[idx] - rhs[idx];
        ++idx;
    }
}

// Multiplies every element of an integer vector by a scalar using ARM NEON
// intrinsics.
//
// Computes result[i] = vec[i] * scalar for every index of vec.
//
// vec    : input values; its size determines how many elements are processed
// scalar : factor applied to every element
// result : destination, written at the same indices as vec
//
// No bounds checks are performed: result must already hold at least
// vec.size() elements.
void vector_mult(const std::vector<int> &vec,
                 int scalar,
                 std::vector<int> &result) {
    const size_t size = vec.size();
    const int *data = vec.data();
    int *result_data = result.data();

    // Scalar broadcast into all 4 lanes
    const int32x4_t scalarVector = vdupq_n_s32(scalar);

    // Only full groups of 4 may go through the 128-bit load/store; running
    // the SIMD loop up to `size` would read and write past the end of the
    // buffers whenever size is not a multiple of 4.
    const size_t simd_end = size - (size % 4);

    size_t i = 0;
    for (; i < simd_end; i += 4) {
        const int32x4_t vecData = vld1q_s32(data + i);
        vst1q_s32(result_data + i, vmulq_s32(vecData, scalarVector));
    }

    // Scalar tail: at most 3 leftover elements
    for (; i < size; ++i) {
        result_data[i] = data[i] * scalar;
    }
}

// Multiplies every element of a double vector by a scalar using ARM NEON
// intrinsics.
//
// Writes result[i] = vec[i] * scalar for every index of vec. Pairs of
// elements go through a 2-lane multiply; a possible odd final element is
// multiplied on its own.
//
// No bounds checks are performed: result must already hold at least
// vec.size() elements.
void vector_mult(const std::vector<double> &vec,
                 double scalar,
                 std::vector<double> &result) {
    const int count = vec.size();
    // Number of leading elements that form complete pairs
    const int paired = count - (count % 2);

    const double *in = vec.data();
    double *out = result.data();

    // Scalar broadcast into both lanes
    const float64x2_t factor = vdupq_n_f64(scalar);

    int idx = 0;
    while (idx < paired) {
        vst1q_f64(out + idx, vmulq_f64(vld1q_f64(in + idx), factor));
        idx += 2;
    }

    // Remaining 0..1 element
    while (idx < count) {
        out[idx] = in[idx] * scalar;
        ++idx;
    }
}

// Dot product of two integer vectors using ARM NEON intrinsics.
//
// Returns the sum over i of vec1[i] * vec2[i], for every index of vec1.
// Returns 0 when vec1 is empty.
//
// vec1 : first operand; its size determines how many elements are processed
// vec2 : second operand, read at the same indices as vec1
//
// No bounds checks are performed: vec2 must hold at least vec1.size()
// elements. Products and the running sum are 32-bit.
int dot_product(const std::vector<int> &vec1, const std::vector<int> &vec2) {
    const size_t size = vec1.size();
    const int32_t *data1 = vec1.data();
    const int32_t *data2 = vec2.data();

    // The SIMD loop must stop at the last full group of 4. Letting it run
    // up to `size` both reads past the end of the buffers and counts the
    // trailing elements twice, because the tail loop below covers them too.
    const size_t simd_end = size - (size % 4);

    int32x4_t sumVec = vdupq_n_s32(0);

    size_t i = 0;
    for (; i < simd_end; i += 4) {
        const int32x4_t vec1Data = vld1q_s32(data1 + i);
        const int32x4_t vec2Data = vld1q_s32(data2 + i);

        // Accumulate the 4 lane-wise products
        sumVec = vaddq_s32(sumVec, vmulq_s32(vec1Data, vec2Data));
    }

    // Horizontal reduction of the 4 partial sums
    int32_t sum = vgetq_lane_s32(sumVec, 0) + vgetq_lane_s32(sumVec, 1) +
                  vgetq_lane_s32(sumVec, 2) + vgetq_lane_s32(sumVec, 3);

    // Scalar tail: at most 3 leftover elements
    for (; i < size; ++i) {
        sum += data1[i] * data2[i];
    }

    return sum;
}

// Dot product of two double vectors using ARM NEON intrinsics.
//
// Returns the sum over i of vec1[i] * vec2[i], for every index of vec1.
// Returns 0.0 when vec1 is empty.
//
// vec1 : first operand; its size determines how many elements are processed
// vec2 : second operand, read at the same indices as vec1
//
// No bounds checks are performed: vec2 must hold at least vec1.size()
// elements. Even- and odd-indexed products are accumulated in separate
// lanes and combined at the end, so rounding can differ from a strictly
// left-to-right scalar sum.
double dot_product(const std::vector<double> &vec1,
                   const std::vector<double> &vec2) {
    const size_t size = vec1.size();
    const double *data1 = vec1.data();
    const double *data2 = vec2.data();

    // The SIMD loop must stop at the last full pair. Letting it run up to
    // `size` both reads past the end of the buffers and counts the final
    // element twice for odd sizes, because the tail loop covers it too.
    const size_t simd_end = size - (size % 2);

    float64x2_t sumVec = vdupq_n_f64(0);

    size_t i = 0;
    for (; i < simd_end; i += 2) {
        const float64x2_t vec1Data = vld1q_f64(data1 + i);
        const float64x2_t vec2Data = vld1q_f64(data2 + i);

        // Accumulate the 2 lane-wise products
        sumVec = vaddq_f64(sumVec, vmulq_f64(vec1Data, vec2Data));
    }

    // Horizontal reduction of the 2 partial sums
    double sum = vgetq_lane_f64(sumVec, 0) + vgetq_lane_f64(sumVec, 1);

    // Scalar tail: at most 1 leftover element
    for (; i < size; ++i) {
        sum += data1[i] * data2[i];
    }

    return sum;
}

// ARM NEON
#endif
void gpmp::linalg::std_vector_add(const std::vector<double> &vec1,
const std::vector<double> &vec2,
std::vector<double> &result) {
Expand Down
Loading

0 comments on commit f2a5080

Please sign in to comment.