Skip to content

Commit

Permalink
Added version of hemi::parallel_for with ExecutionPolicy argument, and associated tests.
Browse files Browse the repository at this point in the history
  • Loading branch information
harrism committed Aug 25, 2015
1 parent abd7c43 commit 7f6fed5
Show file tree
Hide file tree
Showing 3 changed files with 172 additions and 17 deletions.
11 changes: 10 additions & 1 deletion hemi/parallel_for.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@

#include "hemi.h"
#include "launch.h"
#include "configure.h"
#include "grid_stride_range.h"

// TODO, add versions with execution policy
Expand All @@ -24,9 +25,17 @@

namespace hemi
{
class ExecutionPolicy; // forward decl

template <typename index_type, typename F>
void parallel_for(index_type first, index_type last, F function) {
hemi::launch([=] HEMI_LAMBDA () {
ExecutionPolicy p;
parallel_for(p, first, last, function);
}

// Run function(i) in parallel for every index i in [first, last) under the
// given ExecutionPolicy. The body is a grid-stride loop, so any grid/block
// configuration the policy specifies still covers the full index range.
template <typename index_type, typename F>
void parallel_for(const ExecutionPolicy &p, index_type first, index_type last, F function) {
	hemi::launch(p, [=] HEMI_LAMBDA () {
		// grid_stride_range partitions [first, last) across all launched
		// threads; on a host build this presumably degenerates to a plain
		// sequential loop -- confirm in grid_stride_range.h.
		for (auto idx : grid_stride_range(first, last)) function(idx);
	});
}
Expand Down
4 changes: 2 additions & 2 deletions test/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,13 @@ test_portable_launch_device.o: test_portable_launch.cpp
nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@

# Host-side build: compile-only (-c), so linker flags (-L/-lcudart) do not
# belong here; CUDA libraries are linked in the final link rule.
test_parallel_for_host.o: test_parallel_for.cpp
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -o $@

# Device-side build: compile the same test source as CUDA (-x cu) with nvcc.
test_parallel_for_device.o: test_parallel_for.cpp
	nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@

# Host-side build: compile-only (-c), so linker flags (-L/-lcudart) do not
# belong here; CUDA libraries are linked in the final link rule.
test_array_host.o: test_array.cpp
	$(CXX) -c $(CPPFLAGS) $(CXXFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -o $@

# Device-side build: compile the same test source as CUDA (-x cu) with nvcc.
test_array_device.o: test_array.cpp
	nvcc -x cu $(CPPFLAGS) $(NVCCFLAGS) -I$(HEMI_DIR) -I$(CUDA_DIR)/include $^ -c -o $@
Expand Down
174 changes: 160 additions & 14 deletions test/test_parallel_for.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,38 +5,184 @@
// Separate function because __device__ lambda can't be declared
// inside a private member function, and TEST() defines TestBody()
// private to the test class
void runParallelFor(int *result)
void runParallelFor(int *count, int *gdim, int *bdim)
{
hemi::parallel_for(0, 100, [=] HEMI_LAMBDA (int) {
*gdim = hemi::globalBlockCount();
*bdim = hemi::localThreadCount();

#ifdef HEMI_DEV_CODE
atomicAdd(count, 1);
#else
(*count)++;
#endif
});
}

// Same as runParallelFor, but launches under the caller-supplied
// ExecutionPolicy so tests can verify explicit block/grid configuration.
// Counts 100 iterations into *count and reports the launch shape via
// *gdim (block count) and *bdim (threads per block).
void runParallelForEP(const hemi::ExecutionPolicy &ep, int *count, int *gdim, int *bdim)
{
	hemi::parallel_for(ep, 0, 100, [=] HEMI_LAMBDA (int) {
		*gdim = hemi::globalBlockCount();
		*bdim = hemi::localThreadCount();

#ifdef HEMI_DEV_CODE
		atomicAdd(count, 1);
#else
		(*count)++;   // host path is sequential, plain increment is safe
#endif
	});
}

TEST(ParallelForTest, ComputesCorrectSum) {
class ParallelForTest : public ::testing::Test {
protected:
virtual void SetUp() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMalloc(&dCount, sizeof(int)));
ASSERT_SUCCESS(cudaMalloc(&dBdim, sizeof(int)));
ASSERT_SUCCESS(cudaMalloc(&dGdim, sizeof(int)));

int devId;
ASSERT_SUCCESS(cudaGetDevice(&devId));
ASSERT_SUCCESS(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, devId));
#else
dCount = new int;
dBdim = new int;
dGdim = new int;

smCount = 1;
#endif
}

virtual void TearDown() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaFree(dCount));
ASSERT_SUCCESS(cudaFree(dBdim));
ASSERT_SUCCESS(cudaFree(dGdim));
#else
delete dCount;
delete dBdim;
delete dGdim;
#endif
}

void Zero() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMemset(dCount, 0, sizeof(int)));
ASSERT_SUCCESS(cudaMemset(dBdim, 0, sizeof(int)));
ASSERT_SUCCESS(cudaMemset(dGdim, 0, sizeof(int)));
#else
*dCount = 0;
*dBdim = 0;
*dGdim = 0;
#endif
count = 0;
bdim = 0;
gdim = 0;
}

void CopyBack() {
#ifdef HEMI_CUDA_COMPILER
ASSERT_SUCCESS(cudaMemcpy(&count, dCount, sizeof(int), cudaMemcpyDefault));
ASSERT_SUCCESS(cudaMemcpy(&bdim, dBdim, sizeof(int), cudaMemcpyDefault));
ASSERT_SUCCESS(cudaMemcpy(&gdim, dGdim, sizeof(int), cudaMemcpyDefault));
#else
count = *dCount;
bdim = *dBdim;
gdim = *dGdim;
#endif
}

int smCount;

int *dCount;
int *dBdim;
int *dGdim;

int count;
int bdim;
int gdim;
};

// 100 iterations, each adding exactly 1, must total exactly 100 regardless
// of how the launch was configured. CopyBack()'s blocking copy provides the
// synchronization with the asynchronous launch.
TEST_F(ParallelForTest, ComputesCorrectSum) {
	Zero();
	runParallelFor(dCount, dGdim, dBdim);
	CopyBack();

	ASSERT_EQ(count, 100);
}


// With no explicit policy, hemi should auto-configure a "maximal" launch:
// a whole multiple of the SM count worth of blocks, each at least one warp
// wide on the device. Host builds run one "block" of one thread.
TEST_F(ParallelForTest, AutoConfigMaximalLaunch) {
	Zero();
	runParallelFor(dCount, dGdim, dBdim);
	// NOTE(review): other tests use hemi::deviceSynchronize(); confirm the
	// raw CUDA call is intended here (host builds still compile it).
	ASSERT_SUCCESS(cudaDeviceSynchronize());

	CopyBack();

	ASSERT_GE(gdim, smCount);
	ASSERT_EQ(gdim%smCount, 0);   // blocks evenly distributed over SMs
#ifdef HEMI_CUDA_COMPILER
	ASSERT_GE(bdim, 32);          // at least one warp per block
#else
	ASSERT_EQ(bdim, 1);
#endif
}

// An ExecutionPolicy with only the block size pinned must honor it exactly
// (device builds), while the grid size is still auto-configured maximally.
TEST_F(ParallelForTest, ExplicitBlockSize)
{
	Zero();
	hemi::ExecutionPolicy ep;
	ep.setBlockSize(128);
	runParallelForEP(ep, dCount, dGdim, dBdim);
	ASSERT_SUCCESS(hemi::deviceSynchronize());

	CopyBack();

	ASSERT_GE(gdim, smCount);
	ASSERT_EQ(gdim%smCount, 0);   // grid still auto-sized over the SMs
#ifdef HEMI_CUDA_COMPILER
	ASSERT_EQ(bdim, 128);         // requested block size honored exactly
#else
	ASSERT_EQ(bdim, 1);
#endif
}

// An ExecutionPolicy with only the grid size pinned must honor it exactly
// (device builds), while the block size is still auto-configured.
TEST_F(ParallelForTest, ExplicitGridSize)
{
	Zero();
	hemi::ExecutionPolicy ep;
	ep.setGridSize(100);
	runParallelForEP(ep, dCount, dGdim, dBdim);
	// NOTE(review): other tests use hemi::deviceSynchronize(); confirm the
	// raw CUDA call is intended here (host builds still compile it).
	ASSERT_SUCCESS(cudaDeviceSynchronize());

	CopyBack();

#ifdef HEMI_CUDA_COMPILER
	ASSERT_EQ(gdim, 100);   // requested grid size honored exactly
	ASSERT_GE(bdim, 32);    // block size auto-sized to at least a warp
#else
	ASSERT_EQ(gdim, 1);
	ASSERT_EQ(bdim, 1);
#endif
}

// Deliberately invalid launch configurations must surface as CUDA errors.
// Host builds have no launch configuration, so the assertions are
// device-only; the launches themselves still run (and are harmless) on host.
TEST_F(ParallelForTest, InvalidConfigShouldFail)
{
	Zero();
	// Fail due to block size too large
	hemi::ExecutionPolicy ep;
	ep.setBlockSize(10000);
	runParallelForEP(ep, dCount, dGdim, dBdim);
#ifdef HEMI_CUDA_COMPILER
	ASSERT_FAILURE(checkCudaErrors());
#endif

	// Fail due to excessive shared memory size
	// (block/grid size 0 presumably means "auto" -- confirm in configure.h)
	ep.setBlockSize(0);
	ep.setGridSize(0);
	ep.setSharedMemBytes(1000000);
	runParallelForEP(ep, dCount, dGdim, dBdim);
#ifdef HEMI_CUDA_COMPILER
	ASSERT_FAILURE(checkCudaErrors());
#endif
}

0 comments on commit 7f6fed5

Please sign in to comment.