Skip to content

Commit

Permalink
Added version of hemi::parallel_for with ExecutionPolicy argument, and associated tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism committed Aug 25, 2015
1 parent abd7c43 commit 7f6fed5
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 17 deletions.
11 changes: 10 additions & 1 deletion hemi/parallel_for.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "hemi.h"
#include "launch.h"
#include "configure.h"
#include "grid_stride_range.h"

// TODO, add versions with execution policy
Expand All @@ -24,9 +25,17 @@

namespace hemi
{
class ExecutionPolicy; // forward decl

template <typename index_type, typename F>
void parallel_for(index_type first, index_type last, F function) {
hemi::launch([=] HEMI_LAMBDA () {
ExecutionPolicy p;
parallel_for(p, first, last, function);
}

// Run function(i) in parallel for every index i in [first, last) under the
// given ExecutionPolicy. The body is a grid-stride loop, so any grid/block
// configuration the policy specifies still covers the full index range.
template <typename index_type, typename F>
void parallel_for(const ExecutionPolicy &p, index_type first, index_type last, F function) {
	hemi::launch(p, [=] HEMI_LAMBDA () {
		// grid_stride_range partitions [first, last) across all launched
		// threads; on a host build this presumably degenerates to a plain
		// sequential loop -- confirm in grid_stride_range.h.
		for (auto idx : grid_stride_range(first, last)) function(idx);
	});
}
Expand Down
4 changes: 2 additions & 2 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,13 @@ test_portable_launch_device.o: test_portable_launch.cpp
nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@

# Host-side build: compile-only (-c), so linker flags (-L/-lcudart) do not
# belong here; CUDA libraries are linked in the final link rule.
test_parallel_for_host.o: test_parallel_for.cpp
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -o $@

# Device-side build: compile the same test source as CUDA (-x cu) with nvcc.
test_parallel_for_device.o: test_parallel_for.cpp
	nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@

# Host-side build: compile-only (-c), so linker flags (-L/-lcudart) do not
# belong here; CUDA libraries are linked in the final link rule.
test_array_host.o: test_array.cpp
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -o $@

# Device-side build: compile the same test source as CUDA (-x cu) with nvcc.
test_array_device.o: test_array.cpp
	nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@
Expand Down
174 changes: 160 additions & 14 deletions test/test_parallel_for.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,184 @@
// Separate function because __device__ lambda can't be declared
// inside a private member function, and TEST() defines TestBody()
// private to the test class
void runParallelFor(int *result)
void runParallelFor(int *count, int *gdim, int *bdim)
{
hemi::parallel_for(0, 100, [=] HEMI_LAMBDA (int) {
*gdim = hemi::globalBlockCount();
*bdim = hemi::localThreadCount();

#ifdef HEMI_DEV_CODE
atomicAdd(count, 1);
#else
(*count)++;
#endif
});
}

// Same as runParallelFor, but launches under the caller-supplied
// ExecutionPolicy so tests can verify explicit block/grid configuration.
// Counts 100 iterations into *count and reports the launch shape via
// *gdim (block count) and *bdim (threads per block).
void runParallelForEP(const hemi::ExecutionPolicy &ep, int *count, int *gdim, int *bdim)
{
	hemi::parallel_for(ep, 0, 100, [=] HEMI_LAMBDA (int) {
		*gdim = hemi::globalBlockCount();
		*bdim = hemi::localThreadCount();

#ifdef HEMI_DEV_CODE
		atomicAdd(count, 1);
#else
		(*count)++;   // host path is sequential, plain increment is safe
#endif
	});
}

TEST(ParallelForTest, ComputesCorrectSum) {
class ParallelForTest : public ::testing::Test {
protected:
virtual void SetUp() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMalloc(&dCount, sizeof(int)));
ASSERT_SUCCESS(cudaMalloc(&dBdim, sizeof(int)));
ASSERT_SUCCESS(cudaMalloc(&dGdim, sizeof(int)));

int devId;
ASSERT_SUCCESS(cudaGetDevice(&devId));
ASSERT_SUCCESS(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, devId));
#else
dCount = new int;
dBdim = new int;
dGdim = new int;

smCount = 1;
#endif
}

virtual void TearDown() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaFree(dCount));
ASSERT_SUCCESS(cudaFree(dBdim));
ASSERT_SUCCESS(cudaFree(dGdim));
#else
delete dCount;
delete dBdim;
delete dGdim;
#endif
}

void Zero() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMemset(dCount, 0, sizeof(int)));
ASSERT_SUCCESS(cudaMemset(dBdim, 0, sizeof(int)));
ASSERT_SUCCESS(cudaMemset(dGdim, 0, sizeof(int)));
#else
*dCount = 0;
*dBdim = 0;
*dGdim = 0;
#endif
count = 0;
bdim = 0;
gdim = 0;
}

void CopyBack() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMemcpy(&count, dCount, sizeof(int), cudaMemcpyDefault));
ASSERT_SUCCESS(cudaMemcpy(&bdim, dBdim, sizeof(int), cudaMemcpyDefault));
ASSERT_SUCCESS(cudaMemcpy(&gdim, dGdim, sizeof(int), cudaMemcpyDefault));
#else
count = *dCount;
bdim = *dBdim;
gdim = *dGdim;
#endif
}

int smCount;

int *dCount;
int *dBdim;
int *dGdim;

int count;
int bdim;
int gdim;
};

// 100 iterations, each adding exactly 1, must total exactly 100 regardless
// of how the launch was configured. CopyBack()'s blocking copy provides the
// synchronization with the asynchronous launch.
TEST_F(ParallelForTest, ComputesCorrectSum) {
	Zero();
	runParallelFor(dCount, dGdim, dBdim);
	CopyBack();

	ASSERT_EQ(count, 100);
}


// With no explicit policy, hemi should auto-configure a "maximal" launch:
// a whole multiple of the SM count worth of blocks, each at least one warp
// wide on the device. Host builds run one "block" of one thread.
TEST_F(ParallelForTest, AutoConfigMaximalLaunch) {
	Zero();
	runParallelFor(dCount, dGdim, dBdim);
	// NOTE(review): other tests use hemi::deviceSynchronize(); confirm the
	// raw CUDA call is intended here (host builds still compile it).
	ASSERT_SUCCESS(cudaDeviceSynchronize());

	CopyBack();

	ASSERT_GE(gdim, smCount);
	ASSERT_EQ(gdim%smCount, 0);   // blocks evenly distributed over SMs
#ifdef HEMI_CUDA_COMPILER
	ASSERT_GE(bdim, 32);          // at least one warp per block
#else
	ASSERT_EQ(bdim, 1);
#endif
}

// An ExecutionPolicy with only the block size pinned must honor it exactly
// (device builds), while the grid size is still auto-configured maximally.
TEST_F(ParallelForTest, ExplicitBlockSize)
{
	Zero();
	hemi::ExecutionPolicy ep;
	ep.setBlockSize(128);
	runParallelForEP(ep, dCount, dGdim, dBdim);
	ASSERT_SUCCESS(hemi::deviceSynchronize());

	CopyBack();

	ASSERT_GE(gdim, smCount);
	ASSERT_EQ(gdim%smCount, 0);   // grid still auto-sized over the SMs
#ifdef HEMI_CUDA_COMPILER
	ASSERT_EQ(bdim, 128);         // requested block size honored exactly
#else
	ASSERT_EQ(bdim, 1);
#endif
}

// An ExecutionPolicy with only the grid size pinned must honor it exactly
// (device builds), while the block size is still auto-configured.
TEST_F(ParallelForTest, ExplicitGridSize)
{
	Zero();
	hemi::ExecutionPolicy ep;
	ep.setGridSize(100);
	runParallelForEP(ep, dCount, dGdim, dBdim);
	// NOTE(review): other tests use hemi::deviceSynchronize(); confirm the
	// raw CUDA call is intended here (host builds still compile it).
	ASSERT_SUCCESS(cudaDeviceSynchronize());

	CopyBack();

#ifdef HEMI_CUDA_COMPILER
	ASSERT_EQ(gdim, 100);   // requested grid size honored exactly
	ASSERT_GE(bdim, 32);    // block size auto-sized to at least a warp
#else
	ASSERT_EQ(gdim, 1);
	ASSERT_EQ(bdim, 1);
#endif
}

// Deliberately invalid launch configurations must surface as CUDA errors.
// Host builds have no launch configuration, so the assertions are
// device-only; the launches themselves still run (and are harmless) on host.
TEST_F(ParallelForTest, InvalidConfigShouldFail)
{
	Zero();
	// Fail due to block size too large
	hemi::ExecutionPolicy ep;
	ep.setBlockSize(10000);
	runParallelForEP(ep, dCount, dGdim, dBdim);
#ifdef HEMI_CUDA_COMPILER
	ASSERT_FAILURE(checkCudaErrors());
#endif

	// Fail due to excessive shared memory size
	// (block/grid size 0 presumably means "auto" -- confirm in configure.h)
	ep.setBlockSize(0);
	ep.setGridSize(0);
	ep.setSharedMemBytes(1000000);
	runParallelForEP(ep, dCount, dGdim, dBdim);
#ifdef HEMI_CUDA_COMPILER
	ASSERT_FAILURE(checkCudaErrors());
#endif
}

0 comments on commit 7f6fed5

Please sign in to comment.