Skip to content

Commit

Permalink
add reduction gpu
Browse files Browse the repository at this point in the history
  • Loading branch information
chenxuhao committed Apr 17, 2023
1 parent 365b0ef commit 8a21b84
Show file tree
Hide file tree
Showing 7 changed files with 181 additions and 24 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
KERNELS = bc bfs blackscholes cc cf fft histogram kmeans lbm nbody pagerank \
saxpy scan sgemm sssp spmv stencil streamcluster tc vc
reduction saxpy scan sgemm sssp spmv stencil streamcluster tc vc

.PHONY: all
all: $(KERNELS)
Expand Down
4 changes: 2 additions & 2 deletions src/common.mk
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ NVLIBS = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs -lcuda -lcudart
MPI_LIBS = -L$(MPI_HOME)/lib -lmpi
CILKFLAGS = -O3 -fopenmp=libiomp5 -fopencilk
CILK_INC = -I$(GCC_HOME)/include -I$(CILK_CLANG)/include

CUINC = -I$(CUDA_HOME)/include
INCLUDES = -I../../include
VPATH += ../common
OBJS = VertexSet.o graph.o
Expand All @@ -53,7 +53,7 @@ endif
$(CXX) $(CXXFLAGS) $(INCLUDES) -c $<

%.o: %.cu
$(NVCC) $(NVFLAGS) $(INCLUDES) -c $<
$(NVCC) $(NVFLAGS) $(INCLUDES) $(CUINC) -c $<

%.o: %.cxx
$(CLANGXX) $(CILKFLAGS) $(INCLUDES) $(CILK_INC) -c $<
Expand Down
8 changes: 6 additions & 2 deletions src/reduction/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,12 @@ include ../common.mk
OBJS = main.o
all: reduce_omp_base

reduce_omp_base: $(OBJS)
$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) -o $@ -lgomp
reduce_omp_base: $(OBJS) omp_base.o
$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) omp_base.o -o $@ -lgomp
mv $@ $(BIN)

reduce_gpu_base: $(OBJS) gpu_base.o
$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) gpu_base.o -o $@ $(NVLIBS)
mv $@ $(BIN)

clean:
Expand Down
141 changes: 141 additions & 0 deletions src/reduction/gpu_base.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#include <stdio.h>
#include <assert.h>
#include <cuda_runtime.h>
//#include <helper_cuda.h>
//#include <helper_functions.h>

#define checkCudaErrors( call) do { \
cudaError err = call; \
if( cudaSuccess != err) { \
fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n", \
__FILE__, __LINE__, cudaGetErrorString( err) ); \
exit(EXIT_FAILURE); \
} } while (0)

// This version uses contiguous threads, but its interleaved addressing results in many shared memory bank conflicts.
template <class T>
__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
extern __shared__ T sdata[];
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
__syncthreads();
// do reduction in shared mem
for (unsigned int s=1; s < blockDim.x; s *= 2) {
int index = 2 * s * tid;
if (index < blockDim.x) {
sdata[index] += sdata[index + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

// This version uses sequential addressing -- no divergence or bank conflicts.
template <class T>
__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
extern __shared__ T sdata[];
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
__syncthreads();
// do reduction in shared mem
for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
__syncthreads();
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}

unsigned int nextPow2(unsigned int x) {
--x;
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
return ++x;
}

#ifndef MIN
#define MIN(x,y) ((x < y) ? x : y)
#endif

void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads) {
threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
blocks = (n + threads - 1) / threads;
}

// Wrapper function for kernel launch
template <class T>
void reduce(int size, int threads, int blocks, T *d_idata, T *d_odata) {
dim3 dimBlock(threads, 1, 1);
dim3 dimGrid(blocks, 1, 1);
// when there is only one warp per block, we need to allocate two warps
// worth of shared memory so that we don't index shared memory out of bounds
int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
reduce2<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
}

typedef int T;
extern "C"
void reduction(int n, T *h_idata, T &sum, T &max_num, T &min_num) {
assert(n > 0);
unsigned int bytes = n * sizeof(T);
int cpuFinalThreshold = 1;
int numThreads = 512;
int numBlocks = (n-1)/numThreads + 1;
if (numBlocks == 1) cpuFinalThreshold = 1;

T *h_odata = (T *) malloc(numBlocks*sizeof(T));
T *d_idata = NULL;
T *d_odata = NULL;
checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));
// copy data directly to device memory
checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));

bool cpuFinalReduction = false;
bool needReadBack = true;
T gpu_result = 0;
reduce<T>(n, numThreads, numBlocks, d_idata, d_odata);
int blocks = 0, threads = 0, maxBlocks = 0, maxThreads = 0;
if (cpuFinalReduction) {
// sum partial sums from each block on CPU
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost));
for (int i=0; i<numBlocks; i++) {
gpu_result += h_odata[i];
}
needReadBack = false;
} else {
int s = numBlocks;
while (s > cpuFinalThreshold) {
int threads = 0, blocks = 0;
getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
reduce<T>(s, threads, blocks, d_odata, d_odata);
s = (s + threads - 1) / threads;
//s = (s + (threads*2-1)) / (threads*2);
}
if (s > 1) {
// copy result from device to host
checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost));
for (int i=0; i < s; i++) {
gpu_result += h_odata[i];
}
needReadBack = false;
}
}
cudaDeviceSynchronize();
if (needReadBack)
checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost));
sum = gpu_result;
}

25 changes: 6 additions & 19 deletions src/reduction/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,35 +3,22 @@
#include <omp.h>

#define SIZE 1000000
int reduction(int n, int *arr, int *max_num, int *min_num);

int main() {
int sum = 0, i;
int *arr = malloc(SIZE * sizeof(int));

// Initialize the array with random values
srand(0);
int i;
for (i = 0; i < SIZE; i++) {
arr[i] = rand() % 100;
}

// Compute sum in parallel using OpenMP
#pragma omp parallel for reduction(+: sum)
for (i = 0; i < SIZE; i++) {
sum += arr[i];
}

int max_num = arr[0];
// Find the maximum value in parallel using OpenMP
#pragma omp parallel for reduction(max:max_num)
for (int i = 1; i < SIZE; i++) {
// Check if the current number is greater than the current maximum value
if (arr[i] > max_num) {
// Update the maximum value if the current number is greater
max_num = arr[i];
}
}

int max = arr[0], min = arr[0];
int sum = reduction(SIZE, arr, &max, &min);
printf("The sum of the array is %d\n", sum);
printf("The max of the array is %d\n", max);
printf("The min of the array is %d\n", min);
free(arr);
return 0;
}
Expand Down
24 changes: 24 additions & 0 deletions src/reduction/omp_base.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
#include <omp.h>

int reduction(int n, int *arr, int *max, int *min) {
int i, sum = 0;
// Compute sum in parallel using OpenMP
#pragma omp parallel for reduction(+: sum)
for (i = 0; i < n; i++) {
sum += arr[i];
}

int max_num = arr[0];
int min_num = arr[0];
// Find the maximum value in parallel using OpenMP
#pragma omp parallel for reduction(max:max_num) reduction(min:min_num)
for (i = 1; i < n; i++) {
// Check if the current number is greater than the current maximum value
// Update the maximum value if the current number is greater
if (arr[i] > max_num) max_num = arr[i];
if (arr[i] < min_num) min_num = arr[i];
}
*max = max_num;
*min = min_num;
return sum;
}
1 change: 1 addition & 0 deletions src/reduction/run-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
../../bin/reduce_omp_base

0 comments on commit 8a21b84

Please sign in to comment.