Commit 999b6e1

enable multi-GPU cv.wrap()

Jens Glaser committed Apr 17, 2019
1 parent b6802f2 commit 999b6e1

Showing 6 changed files with 247 additions and 71 deletions.
66 changes: 48 additions & 18 deletions metadynamics/CollectiveWrapper.cc
@@ -17,10 +17,14 @@ CollectiveWrapper::CollectiveWrapper(std::shared_ptr<SystemDefinition> sysdef,
#ifdef ENABLE_CUDA
if (m_exec_conf->exec_mode == ExecutionConfiguration::GPU)
{
GPUFlags<Scalar> sum(m_exec_conf);
sum.resetFlags(0);
GlobalArray<Scalar> sum(1,m_exec_conf);
m_sum.swap(sum);
TAG_ALLOCATION(m_sum);
}

cudaDeviceProp dev_prop = m_exec_conf->dev_prop;
m_tuner_reduce.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, name+"_reduce", this->m_exec_conf));
m_tuner_scale.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, name+"_scale", this->m_exec_conf));
#endif
}

@@ -73,22 +77,35 @@ void CollectiveWrapper::computeCVGPU(unsigned int timestep)
{
ArrayHandle<Scalar4> d_force(m_fc->getForceArray(), access_location::device, access_mode::read);

unsigned int block_size = 512;
unsigned int n_blocks = m_pdata->getN() / block_size + 1;

ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocator(), n_blocks);

gpu_reduce_potential_energy(d_scratch.data,
d_force.data,
m_pdata->getN()+m_pdata->getNGhosts(),
m_sum.getDeviceFlags(),
n_blocks,
block_size,
false);

if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// maximum size, per GPU, for a block_size of one
unsigned int scratch_size = (m_pdata->getN()+m_pdata->getNGhosts()+1);
ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocatorManaged(), scratch_size*m_exec_conf->getNumActiveGPUs());

m_energy = m_sum.readFlags();
{
ArrayHandle<Scalar> d_sum(m_sum, access_location::device, access_mode::overwrite);

// reset sum
cudaMemsetAsync(d_sum.data, 0, sizeof(Scalar));

m_exec_conf->beginMultiGPU();
m_tuner_reduce->begin();

gpu_reduce_potential_energy(d_scratch.data,
d_force.data,
m_pdata->getGPUPartition(),
m_pdata->getNGhosts(),
scratch_size,
d_sum.data,
false,
m_tuner_reduce->getParam());
if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

m_tuner_reduce->end();
m_exec_conf->endMultiGPU();
}

ArrayHandle<Scalar> h_sum(m_sum, access_location::host, access_mode::read);
m_energy = *h_sum.data;
}

void CollectiveWrapper::computeBiasForcesGPU(unsigned int timestep)
@@ -100,10 +117,23 @@ void CollectiveWrapper::computeBiasForcesGPU(unsigned int timestep)
unsigned int pitch = m_fc->getVirialArray().getPitch();
Scalar fac = m_bias;

gpu_scale_netforce(d_force.data, d_torque.data, d_virial.data, pitch, fac, m_pdata->getN()+m_pdata->getNGhosts());
m_exec_conf->beginMultiGPU();
m_tuner_scale->begin();

gpu_scale_netforce(d_force.data,
d_torque.data,
d_virial.data,
pitch,
fac,
m_pdata->getGPUPartition(),
m_pdata->getNGhosts(),
m_tuner_scale->getParam());

if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();

m_tuner_scale->end();
m_exec_conf->endMultiGPU();
}
#endif

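For context, the new computeCVGPU path follows a per-GPU pattern: each active GPU reduces its slice of the force array into a scratch area and atomically accumulates into a single managed Scalar, which the host then reads back. Below is a minimal standalone sketch of that pattern using plain CUDA managed memory; the kernel and variable names are illustrative and it does not use HOOMD's GlobalArray, ArrayHandle, or GPUPartition classes. It assumes double precision and devices of compute capability 6.0 or newer (compile with -arch=sm_60 or later) so that atomicAdd_system is available.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void partial_sum_kernel(const double *in, unsigned int n, double *sum)
    {
    extern __shared__ double sdata[];
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = (i < n) ? in[i] : 0.0;
    __syncthreads();

    // tree reduction in shared memory (blockDim.x must be a power of two)
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
        {
        if (threadIdx.x < s)
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
        }

    // one system-wide atomic per block (requires compute capability >= 6.0)
    if (threadIdx.x == 0)
        atomicAdd_system(sum, sdata[0]);
    }

int main()
    {
    int ngpu = 0;
    cudaGetDeviceCount(&ngpu);
    if (ngpu < 1) return 1;

    const unsigned int N = 1 << 20;
    const unsigned int per_gpu = N / ngpu;   // even split; the last GPU takes the remainder

    double *d_in, *d_sum;
    cudaMallocManaged(&d_in, N*sizeof(double));
    cudaMallocManaged(&d_sum, sizeof(double));
    for (unsigned int i = 0; i < N; ++i) d_in[i] = 1.0;
    *d_sum = 0.0;

    const unsigned int block = 256;
    // iterate in reverse so device 0 is the active device afterwards
    for (int dev = ngpu - 1; dev >= 0; --dev)
        {
        cudaSetDevice(dev);
        unsigned int offset = dev*per_gpu;
        unsigned int nwork = (dev == ngpu - 1) ? N - offset : per_gpu;
        unsigned int grid = (nwork + block - 1)/block;
        partial_sum_kernel<<<grid, block, block*sizeof(double)>>>(d_in + offset, nwork, d_sum);
        }

    // synchronize all devices before the host reads the managed accumulator
    for (int dev = 0; dev < ngpu; ++dev)
        {
        cudaSetDevice(dev);
        cudaDeviceSynchronize();
        }
    printf("sum = %f (expected %u)\n", *d_sum, N);

    cudaFree(d_in);
    cudaFree(d_sum);
    return 0;
    }
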
19 changes: 17 additions & 2 deletions metadynamics/CollectiveWrapper.h
@@ -3,7 +3,7 @@

#include "CollectiveVariable.h"

#include <hoomd/GPUFlags.h>
#include <hoomd/GlobalArray.h>

/*! Wrapper to convert a regular ForceCompute into a CollectiveVariable */

@@ -26,12 +26,27 @@ class CollectiveWrapper : public CollectiveVariable
return m_energy;
}

//! Set autotuner parameters
/*! \param enable Enable/disable autotuning
\param period period (approximate) in time steps when retuning occurs
*/
virtual void setAutotunerParams(bool enable, unsigned int period)
{
CollectiveVariable::setAutotunerParams(enable, period);
m_tuner_reduce->setPeriod(period);
m_tuner_reduce->setEnabled(enable);
m_tuner_scale->setPeriod(period);
m_tuner_scale->setEnabled(enable);
}

protected:
std::shared_ptr<ForceCompute> m_fc; //!< The parent force compute
Scalar m_energy; //!< The potential energy

#ifdef ENABLE_CUDA
GPUFlags<Scalar> m_sum; //!< for reading back potential energy from GPU
GlobalArray<Scalar> m_sum; //!< for reading back potential energy from GPU
std::unique_ptr<Autotuner> m_tuner_scale; //!< Autotuner for scaling forces
std::unique_ptr<Autotuner> m_tuner_reduce; //!< Autotuner for collective variable reduction
#endif

virtual void computeCV(unsigned int timestep);
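The two Autotuner members added to this header feed a tuned block size into the kernel launches in the .cc and .cu changes; at launch time that value is additionally capped by the kernel's own limit reported by cudaFuncGetAttributes. A minimal standalone sketch of that capping step follows, with an illustrative kernel and function names (this is not HOOMD's Autotuner itself).

#include <cuda_runtime.h>
#include <algorithm>
#include <climits>

__global__ void scale_kernel(double *x, double fac, unsigned int n)
    {
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= fac;
    }

// return the requested (e.g. autotuned) block size, capped by what the kernel supports
unsigned int capped_block_size(unsigned int requested)
    {
    static unsigned int max_block_size = UINT_MAX;
    if (max_block_size == UINT_MAX)
        {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, (const void *)scale_kernel);
        max_block_size = attr.maxThreadsPerBlock;
        }
    return std::min(requested, max_block_size);
    }

void launch_scale(double *d_x, double fac, unsigned int n, unsigned int tuned_block_size)
    {
    unsigned int run_block_size = capped_block_size(tuned_block_size);
    unsigned int grid = n/run_block_size + 1;
    scale_kernel<<<grid, run_block_size>>>(d_x, fac, n);
    }
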
66 changes: 48 additions & 18 deletions metadynamics/WellTemperedEnsemble.cc
@@ -16,10 +16,14 @@ WellTemperedEnsemble::WellTemperedEnsemble(std::shared_ptr<SystemDefinition> sys
#ifdef ENABLE_CUDA
if (m_exec_conf->exec_mode == ExecutionConfiguration::GPU)
{
GPUFlags<Scalar> sum(m_exec_conf);
sum.resetFlags(0);
GlobalArray<Scalar> sum(1,m_exec_conf);
m_sum.swap(sum);
TAG_ALLOCATION(m_sum);
}

cudaDeviceProp dev_prop = m_exec_conf->dev_prop;
m_tuner_reduce.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, "wte_reduce", this->m_exec_conf));
m_tuner_scale.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, "wte_scale", this->m_exec_conf));
#endif
}

@@ -68,22 +72,35 @@ void WellTemperedEnsemble::computeCVGPU(unsigned int timestep)
{
ArrayHandle<Scalar4> d_net_force(m_pdata->getNetForce(), access_location::device, access_mode::read);

unsigned int block_size = 512;
unsigned int n_blocks = m_pdata->getN() / block_size + 1;

ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocator(), n_blocks);

gpu_reduce_potential_energy(d_scratch.data,
d_net_force.data,
m_pdata->getN(),
m_sum.getDeviceFlags(),
n_blocks,
block_size,
false);

if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// maximum size, per GPU, for a block_size of one
unsigned int scratch_size = (m_pdata->getN()+1);
ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocatorManaged(), scratch_size*m_exec_conf->getNumActiveGPUs());

m_pe = m_sum.readFlags();
{
ArrayHandle<Scalar> d_sum(m_sum, access_location::device, access_mode::overwrite);

// reset sum
cudaMemsetAsync(d_sum.data, 0, sizeof(Scalar));

m_exec_conf->beginMultiGPU();
m_tuner_reduce->begin();

gpu_reduce_potential_energy(d_scratch.data,
d_net_force.data,
m_pdata->getGPUPartition(),
0, // nghost
scratch_size,
d_sum.data,
false,
m_tuner_reduce->getParam());
if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

m_tuner_reduce->end();
m_exec_conf->endMultiGPU();
}

ArrayHandle<Scalar> h_sum(m_sum, access_location::host, access_mode::read);
m_pe = *h_sum.data;
}

void WellTemperedEnsemble::computeBiasForcesGPU(unsigned int timestep)
@@ -95,10 +112,23 @@ void WellTemperedEnsemble::computeBiasForcesGPU(unsigned int timestep)
unsigned int pitch = m_pdata->getNetVirial().getPitch();
Scalar fac = Scalar(1.0)+m_bias;

gpu_scale_netforce(d_net_force.data, d_net_torque.data, d_net_virial.data, pitch, fac, m_pdata->getN());
m_exec_conf->beginMultiGPU();
m_tuner_scale->begin();

gpu_scale_netforce(d_net_force.data,
d_net_torque.data,
d_net_virial.data,
pitch,
fac,
m_pdata->getGPUPartition(),
0, // nghost
m_tuner_scale->getParam());

if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();

m_tuner_scale->end();
m_exec_conf->endMultiGPU();
}
#endif

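Both compute paths above split the particle range across the active GPUs, and the last range also covers the nghost ghost particles, which are stored after the local particles. A small host-only sketch of that bookkeeping, using made-up numbers; it mirrors the launch loops in the .cu file but does not use HOOMD's GPUPartition.

#include <cstdio>
#include <utility>
#include <vector>

int main()
    {
    // pretend 10 local particles split over 2 GPUs, plus 3 ghost particles
    std::vector<std::pair<unsigned int, unsigned int>> ranges = {{0, 5}, {5, 10}};
    const unsigned int nghost = 3;

    // iterate in reverse so the first GPU is the active device afterwards
    for (int idev = (int)ranges.size() - 1; idev >= 0; --idev)
        {
        unsigned int nwork = ranges[idev].second - ranges[idev].first;
        if (idev == (int)ranges.size() - 1)
            nwork += nghost;   // ghosts live at the end of the arrays, after the last local range
        printf("GPU %d: offset %u, nwork %u\n", idev, ranges[idev].first, nwork);
        }
    return 0;
    }
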
125 changes: 100 additions & 25 deletions metadynamics/WellTemperedEnsemble.cu
@@ -1,15 +1,33 @@
#include <hoomd/ParticleData.cuh>
#include "WellTemperedEnsemble.cuh"

// pre-Pascal atomicAdd for doubles
__device__ double atomicAdd_double(double* address, double val) {

unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;

do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
}
while (assumed != old);
return __longlong_as_double(old);
}

__global__ void gpu_scale_netforce_kernel(Scalar4 *d_net_force,
Scalar4 *d_net_torque,
Scalar *d_net_virial,
unsigned int net_virial_pitch,
Scalar fac,
unsigned int N)
const unsigned int nwork,
const unsigned int offset)
{
unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;

if (idx>=N) return;
if (idx>=nwork) return;
idx += offset;

Scalar4 net_force = d_net_force[idx];
net_force.x *= fac;
@@ -37,10 +55,37 @@ void gpu_scale_netforce(Scalar4 *d_net_force,
Scalar *d_net_virial,
unsigned int net_virial_pitch,
Scalar fac,
unsigned int N)
const GPUPartition& gpu_partition,
const unsigned int nghost,
const unsigned int block_size)
{
unsigned int block_size = 256;
gpu_scale_netforce_kernel<<<(N/block_size+1),block_size>>>(d_net_force, d_net_torque, d_net_virial, net_virial_pitch, fac, N);
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
{
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, (const void *)gpu_scale_netforce_kernel);
max_block_size = attr.maxThreadsPerBlock;
}

unsigned int run_block_size = min(block_size, max_block_size);

// iterate over active GPUs in reverse, to end up on first GPU when returning from this function
for (int idev = gpu_partition.getNumActiveGPUs() - 1; idev >= 0; --idev)
{
auto range = gpu_partition.getRangeAndSetGPU(idev);

unsigned int nwork = range.second - range.first;

// process ghosts in final range
if (idev == (int)gpu_partition.getNumActiveGPUs()-1)
nwork += nghost;

// setup the grid to run the kernel
dim3 grid( (nwork/run_block_size) + 1, 1, 1);
dim3 threads(run_block_size, 1, 1);

gpu_scale_netforce_kernel<<<grid, threads>>>(d_net_force, d_net_torque, d_net_virial, net_virial_pitch, fac, nwork, range.first);
}
}

//! Shared memory used in reducing the sums
@@ -51,15 +96,17 @@ extern __shared__ Scalar compute_pe_final_sdata[];

__global__ void gpu_compute_potential_energy_partial(Scalar *d_scratch,
Scalar4 *d_net_force,
unsigned int N,
const unsigned int nwork,
const unsigned int offset,
bool zero_energy)
{
// determine which particle this thread works on
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int work_idx = blockIdx.x * blockDim.x + threadIdx.x;

Scalar my_element; // element of scratch space read in
if (idx < N)
if (work_idx < nwork)
{
const unsigned int idx = work_idx + offset;
Scalar4 net_force = d_net_force[idx];

// compute our contribution to the sum
@@ -137,32 +184,60 @@ __global__ void gpu_compute_potential_energy_final_sums(Scalar *d_scratch,

if (threadIdx.x == 0)
{
*d_sum = final_sum;
#if (__CUDA_ARCH__ >= 600)
atomicAdd_system(d_sum,final_sum);
#else
atomicAdd_double(d_sum,final_sum);
#endif
}
}

void gpu_reduce_potential_energy(Scalar *d_scratch,
Scalar4 *d_net_force,
unsigned int N,
const GPUPartition& gpu_partition,
const unsigned int nghost,
const unsigned int scratch_size,
Scalar *d_sum,
unsigned int n_blocks,
unsigned int block_size,
bool zero_energy)
bool zero_energy,
const unsigned int block_size)
{
dim3 grid(n_blocks, 1, 1);
dim3 threads(block_size, 1, 1);
unsigned int shared_bytes = sizeof(Scalar)*block_size;
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
{
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, (const void *)gpu_compute_potential_energy_partial);
max_block_size = attr.maxThreadsPerBlock;
}

unsigned int run_block_size = min(block_size, max_block_size);

gpu_compute_potential_energy_partial<<<grid, threads, shared_bytes>>>(d_scratch, d_net_force, N, zero_energy);
// iterate over active GPUs in reverse, to end up on first GPU when returning from this function
for (int idev = gpu_partition.getNumActiveGPUs() - 1; idev >= 0; --idev)
{
auto range = gpu_partition.getRangeAndSetGPU(idev);

unsigned int nwork = range.second - range.first;

// process ghosts in final range
if (idev == (int)gpu_partition.getNumActiveGPUs()-1)
nwork += nghost;

int final_block_size = 512;
grid = dim3(1, 1, 1);
threads = dim3(final_block_size, 1, 1);
// setup the grid to run the kernel
unsigned int n_blocks = (nwork/run_block_size) + 1;
dim3 grid(n_blocks, 1, 1);
dim3 threads(run_block_size, 1, 1);

shared_bytes = sizeof(Scalar)*final_block_size;
unsigned int shared_bytes = sizeof(Scalar)*run_block_size;
gpu_compute_potential_energy_partial<<<grid, threads, shared_bytes>>>(d_scratch+idev*scratch_size, d_net_force, nwork, range.first, zero_energy);

// run the kernel
gpu_compute_potential_energy_final_sums<<<grid, threads, shared_bytes>>>(d_scratch,
n_blocks,
d_sum);
int final_block_size = 512;
grid = dim3(1, 1, 1);
threads = dim3(final_block_size, 1, 1);
shared_bytes = sizeof(Scalar)*final_block_size;

// run the kernel
gpu_compute_potential_energy_final_sums<<<grid, threads, shared_bytes>>>(d_scratch+idev*scratch_size,
n_blocks,
d_sum);
}
}
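The final-sum kernel needs a double-precision atomic add: on compute capability 6.0 and newer it can use atomicAdd_system, which is visible across devices sharing a managed allocation, while older hardware falls back to a device-scope compare-and-swap loop like the one defined at the top of this file. A minimal sketch of how such a guarded helper is typically written (the names are illustrative, not the file's own):

#include <cuda_runtime.h>

__device__ double my_atomic_add(double *address, double val)
    {
#if (__CUDA_ARCH__ >= 600)
    // system-wide atomic, visible across GPUs accessing the same managed allocation
    return atomicAdd_system(address, val);
#else
    // CAS fallback; the integer comparison avoids an endless loop if *address is NaN
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
        {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
    return __longlong_as_double(old);
#endif
    }

__global__ void accumulate_kernel(const double *in, unsigned int n, double *sum)
    {
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        my_atomic_add(sum, in[i]);
    }
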
(Diffs for the remaining 2 changed files are not shown.)
