add reduction gpu

chenxuhao · Apr 17, 2023 · 8a21b84 · 8a21b84
1 parent 365b0ef
commit 8a21b84
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 24 deletions.
diff --git a/Makefile b/Makefile
@@ -1,5 +1,5 @@
 KERNELS = bc bfs blackscholes cc cf fft histogram kmeans lbm nbody pagerank \
-		  saxpy scan sgemm sssp spmv stencil streamcluster tc vc
+		  reduction saxpy scan sgemm sssp spmv stencil streamcluster tc vc
 
 .PHONY: all
 all: $(KERNELS)

diff --git a/src/common.mk b/src/common.mk
@@ -33,7 +33,7 @@ NVLIBS = -L$(CUDA_HOME)/lib64 -L$(CUDA_HOME)/lib64/stubs -lcuda -lcudart
 MPI_LIBS = -L$(MPI_HOME)/lib -lmpi
 CILKFLAGS = -O3 -fopenmp=libiomp5 -fopencilk
 CILK_INC = -I$(GCC_HOME)/include -I$(CILK_CLANG)/include
-
+CUINC = -I$(CUDA_HOME)/include
 INCLUDES = -I../../include
 VPATH += ../common
 OBJS = VertexSet.o graph.o
@@ -53,7 +53,7 @@ endif
 	$(CXX) $(CXXFLAGS) $(INCLUDES) -c $<
 
 %.o: %.cu
-	$(NVCC) $(NVFLAGS) $(INCLUDES) -c $<
+	$(NVCC) $(NVFLAGS) $(INCLUDES) $(CUINC) -c $<
 
 %.o: %.cxx
 	$(CLANGXX) $(CILKFLAGS) $(INCLUDES) $(CILK_INC) -c $<

diff --git a/src/reduction/Makefile b/src/reduction/Makefile
@@ -2,8 +2,12 @@ include ../common.mk
 OBJS = main.o 
 all: reduce_omp_base
 
-reduce_omp_base: $(OBJS)
-	$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) -o $@ -lgomp
+reduce_omp_base: $(OBJS) omp_base.o
+	$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) omp_base.o -o $@ -lgomp
+	mv $@ $(BIN)
+
+reduce_gpu_base: $(OBJS) gpu_base.o
+	$(CC) $(CFLAGS) $(INCLUDES) $(OBJS) gpu_base.o -o $@ $(NVLIBS)
 	mv $@ $(BIN)
 
 clean:

diff --git a/src/reduction/gpu_base.cu b/src/reduction/gpu_base.cu
@@ -0,0 +1,141 @@
+#include <stdio.h>
+#include <assert.h>
+#include <cuda_runtime.h>
+//#include <helper_cuda.h>
+//#include <helper_functions.h>
+
+#define checkCudaErrors( call) do {										\
+   cudaError err = call;												\
+   if( cudaSuccess != err) {											\
+       fprintf(stderr, "Cuda error in file '%s' in line %i : %s.\n",	\
+               __FILE__, __LINE__, cudaGetErrorString( err) );			\
+   exit(EXIT_FAILURE);													\
+   } } while (0)
+
+// This version uses contiguous threads, but its interleaved addressing results in many shared memory bank conflicts.
+template <class T>
+__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
+  extern __shared__ T sdata[];
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+  sdata[tid] = (i < n) ? g_idata[i] : 0;
+  __syncthreads();
+  // do reduction in shared mem
+  for (unsigned int s=1; s < blockDim.x; s *= 2) {
+    int index = 2 * s * tid;
+    if (index < blockDim.x) {
+      sdata[index] += sdata[index + s];
+    }
+    __syncthreads();
+  }
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+// This version uses sequential addressing -- no divergence or bank conflicts.
+template <class T>
+__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
+  extern __shared__ T sdata[];
+  // load shared mem
+  unsigned int tid = threadIdx.x;
+  unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
+  sdata[tid] = (i < n) ? g_idata[i] : 0;
+  __syncthreads();
+  // do reduction in shared mem
+  for (unsigned int s=blockDim.x/2; s>0; s>>=1) {
+    if (tid < s) {
+      sdata[tid] += sdata[tid + s];
+    }
+    __syncthreads();
+  }
+  // write result for this block to global mem
+  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
+}
+
+unsigned int nextPow2(unsigned int x) {
+  --x;
+  x |= x >> 1;
+  x |= x >> 2;
+  x |= x >> 4;
+  x |= x >> 8;
+  x |= x >> 16;
+  return ++x;
+}
+
+#ifndef MIN
+#define MIN(x,y) ((x < y) ? x : y)
+#endif
+
+void getNumBlocksAndThreads(int n, int maxBlocks, int maxThreads, int &blocks, int &threads) {
+  threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
+  blocks = (n + threads - 1) / threads;
+}
+
+// Wrapper function for kernel launch
+template <class T>
+void reduce(int size, int threads, int blocks, T *d_idata, T *d_odata) {
+  dim3 dimBlock(threads, 1, 1);
+  dim3 dimGrid(blocks, 1, 1);
+  // when there is only one warp per block, we need to allocate two warps
+  // worth of shared memory so that we don't index shared memory out of bounds
+  int smemSize = (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
+  reduce2<T><<< dimGrid, dimBlock, smemSize >>>(d_idata, d_odata, size);
+}
+
+typedef int T;
+extern "C"
+void reduction(int n, T *h_idata, T &sum, T &max_num, T &min_num) {
+  assert(n > 0);
+  unsigned int bytes = n * sizeof(T);
+  int cpuFinalThreshold = 1;
+  int numThreads = 512;
+  int numBlocks = (n-1)/numThreads + 1;
+  if (numBlocks == 1) cpuFinalThreshold = 1;
+
+  T *h_odata = (T *) malloc(numBlocks*sizeof(T));
+  T *d_idata = NULL;
+  T *d_odata = NULL;
+  checkCudaErrors(cudaMalloc((void **) &d_idata, bytes));
+  checkCudaErrors(cudaMalloc((void **) &d_odata, numBlocks*sizeof(T)));
+  // copy data directly to device memory
+  checkCudaErrors(cudaMemcpy(d_idata, h_idata, bytes, cudaMemcpyHostToDevice));
+  checkCudaErrors(cudaMemcpy(d_odata, h_idata, numBlocks*sizeof(T), cudaMemcpyHostToDevice));
+
+  bool cpuFinalReduction = false;
+  bool needReadBack = true;
+  T gpu_result = 0;
+  reduce<T>(n, numThreads, numBlocks, d_idata, d_odata);
+  int blocks = 0, threads = 0, maxBlocks = 0, maxThreads = 0;
+  if (cpuFinalReduction) {
+    // sum partial sums from each block on CPU
+    // copy result from device to host
+    checkCudaErrors(cudaMemcpy(h_odata, d_odata, numBlocks*sizeof(T), cudaMemcpyDeviceToHost));
+    for (int i=0; i<numBlocks; i++) {
+      gpu_result += h_odata[i];
+    }
+    needReadBack = false;
+  } else {
+    int s = numBlocks;
+    while (s > cpuFinalThreshold) {
+      int threads = 0, blocks = 0;
+      getNumBlocksAndThreads(s, maxBlocks, maxThreads, blocks, threads);
+      reduce<T>(s, threads, blocks, d_odata, d_odata);
+      s = (s + threads - 1) / threads;
+      //s = (s + (threads*2-1)) / (threads*2);
+    }
+    if (s > 1) {
+      // copy result from device to host
+      checkCudaErrors(cudaMemcpy(h_odata, d_odata, s * sizeof(T), cudaMemcpyDeviceToHost));
+      for (int i=0; i < s; i++) {
+        gpu_result += h_odata[i];
+      }
+      needReadBack = false;
+    }
+  }
+  cudaDeviceSynchronize();
+  if (needReadBack)
+    checkCudaErrors(cudaMemcpy(&gpu_result, d_odata, sizeof(T), cudaMemcpyDeviceToHost));
+  sum = gpu_result;
+}
+
diff --git a/src/reduction/main.c b/src/reduction/main.c
@@ -3,35 +3,22 @@
 #include <omp.h>
 
 #define SIZE 1000000
+int reduction(int n, int *arr, int *max_num, int *min_num);
 
 int main() {
-  int sum = 0, i;
   int *arr = malloc(SIZE * sizeof(int));
 
   // Initialize the array with random values
   srand(0);
+  int i;
   for (i = 0; i < SIZE; i++) {
     arr[i] = rand() % 100;
   }
-
-  // Compute sum in parallel using OpenMP
-  #pragma omp parallel for reduction(+: sum)
-  for (i = 0; i < SIZE; i++) {
-    sum += arr[i];
-  }
-
-  int max_num = arr[0];
-  // Find the maximum value in parallel using OpenMP
-  #pragma omp parallel for reduction(max:max_num)
-  for (int i = 1; i < SIZE; i++) {
-    // Check if the current number is greater than the current maximum value
-    if (arr[i] > max_num) {
-      // Update the maximum value if the current number is greater
-      max_num = arr[i];
-    }
-  }
-
+  int max = arr[0], min = arr[0];
+  int sum = reduction(SIZE, arr, &max, &min);
   printf("The sum of the array is %d\n", sum);
+  printf("The max of the array is %d\n", max);
+  printf("The min of the array is %d\n", min);
   free(arr);
   return 0;
 }

diff --git a/src/reduction/omp_base.c b/src/reduction/omp_base.c
@@ -0,0 +1,24 @@
+#include <omp.h>
+
+int reduction(int n, int *arr, int *max, int *min) {
+  int i, sum = 0;
+  // Compute sum in parallel using OpenMP
+  #pragma omp parallel for reduction(+: sum)
+  for (i = 0; i < n; i++) {
+    sum += arr[i];
+  }
+
+  int max_num = arr[0];
+  int min_num = arr[0];
+  // Find the maximum value in parallel using OpenMP
+  #pragma omp parallel for reduction(max:max_num) reduction(min:min_num)
+  for (i = 1; i < n; i++) {
+    // Check if the current number is greater than the current maximum value
+    // Update the maximum value if the current number is greater
+    if (arr[i] > max_num) max_num = arr[i];
+    if (arr[i] < min_num) min_num = arr[i];
+  }
+  *max = max_num;
+  *min = min_num;
+  return sum;
+}
diff --git a/src/reduction/run-test.sh b/src/reduction/run-test.sh
@@ -0,0 +1 @@
+../../bin/reduce_omp_base