Commit 999b6e1

enable multi-GPU cv.wrap()

Jens Glaser committed Apr 17, 2019
1 parent b6802f2 commit 999b6e1

Showing 6 changed files with 247 additions and 71 deletions.
66 changes: 48 additions & 18 deletions metadynamics/CollectiveWrapper.cc
@@ -17,10 +17,14 @@ CollectiveWrapper::CollectiveWrapper(std::shared_ptr<SystemDefinition> sysdef,
#ifdef ENABLE_CUDA
if (m_exec_conf->exec_mode == ExecutionConfiguration::GPU)
{
GPUFlags<Scalar> sum(m_exec_conf);
sum.resetFlags(0);
GlobalArray<Scalar> sum(1,m_exec_conf);
m_sum.swap(sum);
TAG_ALLOCATION(m_sum);
}

cudaDeviceProp dev_prop = m_exec_conf->dev_prop;
m_tuner_reduce.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, name+"_reduce", this->m_exec_conf));
m_tuner_scale.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, name+"_scale", this->m_exec_conf));
#endif
}

@@ -73,22 +77,35 @@ void CollectiveWrapper::computeCVGPU(unsigned int timestep)
{
ArrayHandle<Scalar4> d_force(m_fc->getForceArray(), access_location::device, access_mode::read);

unsigned int block_size = 512;
unsigned int n_blocks = m_pdata->getN() / block_size + 1;

ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocator(), n_blocks);

gpu_reduce_potential_energy(d_scratch.data,
d_force.data,
m_pdata->getN()+m_pdata->getNGhosts(),
m_sum.getDeviceFlags(),
n_blocks,
block_size,
false);

if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// maximum size, per GPU, for a block_size of one
unsigned int scratch_size = (m_pdata->getN()+m_pdata->getNGhosts()+1);
ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocatorManaged(), scratch_size*m_exec_conf->getNumActiveGPUs());

m_energy = m_sum.readFlags();
{
ArrayHandle<Scalar> d_sum(m_sum, access_location::device, access_mode::overwrite);

// reset sum
cudaMemsetAsync(d_sum.data, 0, sizeof(Scalar));

m_exec_conf->beginMultiGPU();
m_tuner_reduce->begin();

gpu_reduce_potential_energy(d_scratch.data,
d_force.data,
m_pdata->getGPUPartition(),
m_pdata->getNGhosts(),
scratch_size,
d_sum.data,
false,
m_tuner_reduce->getParam());
if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

m_tuner_reduce->end();
m_exec_conf->endMultiGPU();
}

ArrayHandle<Scalar> h_sum(m_sum, access_location::host, access_mode::read);
m_energy = *h_sum.data;
}

void CollectiveWrapper::computeBiasForcesGPU(unsigned int timestep)
@@ -100,10 +117,23 @@ void CollectiveWrapper::computeBiasForcesGPU(unsigned int timestep)
unsigned int pitch = m_fc->getVirialArray().getPitch();
Scalar fac = m_bias;

gpu_scale_netforce(d_force.data, d_torque.data, d_virial.data, pitch, fac, m_pdata->getN()+m_pdata->getNGhosts());
m_exec_conf->beginMultiGPU();
m_tuner_scale->begin();

gpu_scale_netforce(d_force.data,
d_torque.data,
d_virial.data,
pitch,
fac,
m_pdata->getGPUPartition(),
m_pdata->getNGhosts(),
m_tuner_scale->getParam());

if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();

m_tuner_scale->end();
m_exec_conf->endMultiGPU();
}
#endif

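For context, the new computeCVGPU path follows a per-GPU pattern: each active GPU reduces its slice of the force array into a scratch area and atomically accumulates into a single managed Scalar, which the host then reads back. Below is a minimal standalone sketch of that pattern using plain CUDA managed memory; the kernel and variable names are illustrative and it does not use HOOMD's GlobalArray, ArrayHandle, or GPUPartition classes. It assumes double precision and devices of compute capability 6.0 or newer (compile with -arch=sm_60 or later) so that atomicAdd_system is available.

#include <cuda_runtime.h>
#include <cstdio>

__global__ void partial_sum_kernel(const double *in, unsigned int n, double *sum)
    {
    extern __shared__ double sdata[];
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    sdata[threadIdx.x] = (i < n) ? in[i] : 0.0;
    __syncthreads();

    // tree reduction in shared memory (blockDim.x must be a power of two)
    for (unsigned int s = blockDim.x/2; s > 0; s >>= 1)
        {
        if (threadIdx.x < s)
            sdata[threadIdx.x] += sdata[threadIdx.x + s];
        __syncthreads();
        }

    // one system-wide atomic per block (requires compute capability >= 6.0)
    if (threadIdx.x == 0)
        atomicAdd_system(sum, sdata[0]);
    }

int main()
    {
    int ngpu = 0;
    cudaGetDeviceCount(&ngpu);
    if (ngpu < 1) return 1;

    const unsigned int N = 1 << 20;
    const unsigned int per_gpu = N / ngpu;   // even split; the last GPU takes the remainder

    double *d_in, *d_sum;
    cudaMallocManaged(&d_in, N*sizeof(double));
    cudaMallocManaged(&d_sum, sizeof(double));
    for (unsigned int i = 0; i < N; ++i) d_in[i] = 1.0;
    *d_sum = 0.0;

    const unsigned int block = 256;
    // iterate in reverse so device 0 is the active device afterwards
    for (int dev = ngpu - 1; dev >= 0; --dev)
        {
        cudaSetDevice(dev);
        unsigned int offset = dev*per_gpu;
        unsigned int nwork = (dev == ngpu - 1) ? N - offset : per_gpu;
        unsigned int grid = (nwork + block - 1)/block;
        partial_sum_kernel<<<grid, block, block*sizeof(double)>>>(d_in + offset, nwork, d_sum);
        }

    // synchronize all devices before the host reads the managed accumulator
    for (int dev = 0; dev < ngpu; ++dev)
        {
        cudaSetDevice(dev);
        cudaDeviceSynchronize();
        }
    printf("sum = %f (expected %u)\n", *d_sum, N);

    cudaFree(d_in);
    cudaFree(d_sum);
    return 0;
    }
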
19 changes: 17 additions & 2 deletions metadynamics/CollectiveWrapper.h
@@ -3,7 +3,7 @@

#include "CollectiveVariable.h"

#include <hoomd/GPUFlags.h>
#include <hoomd/GlobalArray.h>

/*! Wrapper to convert a regular ForceCompute into a CollectiveVariable */

@@ -26,12 +26,27 @@ class CollectiveWrapper : public CollectiveVariable
return m_energy;
}

//! Set autotuner parameters
/*! \param enable Enable/disable autotuning
\param period period (approximate) in time steps when retuning occurs
*/
virtual void setAutotunerParams(bool enable, unsigned int period)
{
CollectiveVariable::setAutotunerParams(enable, period);
m_tuner_reduce->setPeriod(period);
m_tuner_reduce->setEnabled(enable);
m_tuner_scale->setPeriod(period);
m_tuner_scale->setEnabled(enable);
}

protected:
std::shared_ptr<ForceCompute> m_fc; //!< The parent force compute
Scalar m_energy; //!< The potential energy

#ifdef ENABLE_CUDA
GPUFlags<Scalar> m_sum; //!< for reading back potential energy from GPU
GlobalArray<Scalar> m_sum; //!< for reading back potential energy from GPU
std::unique_ptr<Autotuner> m_tuner_scale; //!< Autotuner for scaling forces
std::unique_ptr<Autotuner> m_tuner_reduce; //!< Autotuner for collective variable reduction
#endif

virtual void computeCV(unsigned int timestep);
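The two Autotuner members added to this header feed a tuned block size into the kernel launches in the .cc and .cu changes; at launch time that value is additionally capped by the kernel's own limit reported by cudaFuncGetAttributes. A minimal standalone sketch of that capping step follows, with an illustrative kernel and function names (this is not HOOMD's Autotuner itself).

#include <cuda_runtime.h>
#include <algorithm>
#include <climits>

__global__ void scale_kernel(double *x, double fac, unsigned int n)
    {
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        x[i] *= fac;
    }

// return the requested (e.g. autotuned) block size, capped by what the kernel supports
unsigned int capped_block_size(unsigned int requested)
    {
    static unsigned int max_block_size = UINT_MAX;
    if (max_block_size == UINT_MAX)
        {
        cudaFuncAttributes attr;
        cudaFuncGetAttributes(&attr, (const void *)scale_kernel);
        max_block_size = attr.maxThreadsPerBlock;
        }
    return std::min(requested, max_block_size);
    }

void launch_scale(double *d_x, double fac, unsigned int n, unsigned int tuned_block_size)
    {
    unsigned int run_block_size = capped_block_size(tuned_block_size);
    unsigned int grid = n/run_block_size + 1;
    scale_kernel<<<grid, run_block_size>>>(d_x, fac, n);
    }
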
66 changes: 48 additions & 18 deletions metadynamics/WellTemperedEnsemble.cc
@@ -16,10 +16,14 @@ WellTemperedEnsemble::WellTemperedEnsemble(std::shared_ptr<SystemDefinition> sys
#ifdef ENABLE_CUDA
if (m_exec_conf->exec_mode == ExecutionConfiguration::GPU)
{
GPUFlags<Scalar> sum(m_exec_conf);
sum.resetFlags(0);
GlobalArray<Scalar> sum(1,m_exec_conf);
m_sum.swap(sum);
TAG_ALLOCATION(m_sum);
}

cudaDeviceProp dev_prop = m_exec_conf->dev_prop;
m_tuner_reduce.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, "wte_reduce", this->m_exec_conf));
m_tuner_scale.reset(new Autotuner(dev_prop.warpSize, dev_prop.maxThreadsPerBlock, dev_prop.warpSize, 5, 100000, "wte_scale", this->m_exec_conf));
#endif
}

@@ -68,22 +72,35 @@ void WellTemperedEnsemble::computeCVGPU(unsigned int timestep)
{
ArrayHandle<Scalar4> d_net_force(m_pdata->getNetForce(), access_location::device, access_mode::read);

unsigned int block_size = 512;
unsigned int n_blocks = m_pdata->getN() / block_size + 1;

ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocator(), n_blocks);

gpu_reduce_potential_energy(d_scratch.data,
d_net_force.data,
m_pdata->getN(),
m_sum.getDeviceFlags(),
n_blocks,
block_size,
false);

if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();
// maximum size, per GPU, for a block_size of one
unsigned int scratch_size = (m_pdata->getN()+1);
ScopedAllocation<Scalar> d_scratch(m_exec_conf->getCachedAllocatorManaged(), scratch_size*m_exec_conf->getNumActiveGPUs());

m_pe = m_sum.readFlags();
{
ArrayHandle<Scalar> d_sum(m_sum, access_location::device, access_mode::overwrite);

// reset sum
cudaMemsetAsync(d_sum.data, 0, sizeof(Scalar));

m_exec_conf->beginMultiGPU();
m_tuner_reduce->begin();

gpu_reduce_potential_energy(d_scratch.data,
d_net_force.data,
m_pdata->getGPUPartition(),
0, // nghost
scratch_size,
d_sum.data,
false,
m_tuner_reduce->getParam());
if (m_exec_conf->isCUDAErrorCheckingEnabled()) CHECK_CUDA_ERROR();

m_tuner_reduce->end();
m_exec_conf->endMultiGPU();
}

ArrayHandle<Scalar> h_sum(m_sum, access_location::host, access_mode::read);
m_pe = *h_sum.data;
}

void WellTemperedEnsemble::computeBiasForcesGPU(unsigned int timestep)
@@ -95,10 +112,23 @@ void WellTemperedEnsemble::computeBiasForcesGPU(unsigned int timestep)
unsigned int pitch = m_pdata->getNetVirial().getPitch();
Scalar fac = Scalar(1.0)+m_bias;

gpu_scale_netforce(d_net_force.data, d_net_torque.data, d_net_virial.data, pitch, fac, m_pdata->getN());
m_exec_conf->beginMultiGPU();
m_tuner_scale->begin();

gpu_scale_netforce(d_net_force.data,
d_net_torque.data,
d_net_virial.data,
pitch,
fac,
m_pdata->getGPUPartition(),
0, // nghost
m_tuner_scale->getParam());

if (m_exec_conf->isCUDAErrorCheckingEnabled())
CHECK_CUDA_ERROR();

m_tuner_scale->end();
m_exec_conf->endMultiGPU();
}
#endif

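Both compute paths above split the particle range across the active GPUs, and the last range also covers the nghost ghost particles, which are stored after the local particles. A small host-only sketch of that bookkeeping, using made-up numbers; it mirrors the launch loops in the .cu file but does not use HOOMD's GPUPartition.

#include <cstdio>
#include <utility>
#include <vector>

int main()
    {
    // pretend 10 local particles split over 2 GPUs, plus 3 ghost particles
    std::vector<std::pair<unsigned int, unsigned int>> ranges = {{0, 5}, {5, 10}};
    const unsigned int nghost = 3;

    // iterate in reverse so the first GPU is the active device afterwards
    for (int idev = (int)ranges.size() - 1; idev >= 0; --idev)
        {
        unsigned int nwork = ranges[idev].second - ranges[idev].first;
        if (idev == (int)ranges.size() - 1)
            nwork += nghost;   // ghosts live at the end of the arrays, after the last local range
        printf("GPU %d: offset %u, nwork %u\n", idev, ranges[idev].first, nwork);
        }
    return 0;
    }
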
125 changes: 100 additions & 25 deletions metadynamics/WellTemperedEnsemble.cu
@@ -1,15 +1,33 @@
#include <hoomd/ParticleData.cuh>
#include "WellTemperedEnsemble.cuh"

// pre-Pascal atomicAdd for doubles
__device__ double atomicAdd_double(double* address, double val) {

unsigned long long int* address_as_ull = (unsigned long long int*)address;
unsigned long long int old = *address_as_ull, assumed;

do {
assumed = old;
old = atomicCAS(address_as_ull, assumed, __double_as_longlong(val + __longlong_as_double(assumed)));
// Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
}
while (assumed != old);
return __longlong_as_double(old);
}

__global__ void gpu_scale_netforce_kernel(Scalar4 *d_net_force,
Scalar4 *d_net_torque,
Scalar *d_net_virial,
unsigned int net_virial_pitch,
Scalar fac,
unsigned int N)
const unsigned int nwork,
const unsigned int offset)
{
unsigned int idx = blockIdx.x*blockDim.x+threadIdx.x;

if (idx>=N) return;
if (idx>=nwork) return;
idx += offset;

Scalar4 net_force = d_net_force[idx];
net_force.x *= fac;
@@ -37,10 +55,37 @@ void gpu_scale_netforce(Scalar4 *d_net_force,
Scalar *d_net_virial,
unsigned int net_virial_pitch,
Scalar fac,
unsigned int N)
const GPUPartition& gpu_partition,
const unsigned int nghost,
const unsigned int block_size)
{
unsigned int block_size = 256;
gpu_scale_netforce_kernel<<<(N/block_size+1),block_size>>>(d_net_force, d_net_torque, d_net_virial, net_virial_pitch, fac, N);
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
{
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, (const void *)gpu_scale_netforce_kernel);
max_block_size = attr.maxThreadsPerBlock;
}

unsigned int run_block_size = min(block_size, max_block_size);

// iterate over active GPUs in reverse, to end up on first GPU when returning from this function
for (int idev = gpu_partition.getNumActiveGPUs() - 1; idev >= 0; --idev)
{
auto range = gpu_partition.getRangeAndSetGPU(idev);

unsigned int nwork = range.second - range.first;

// process ghosts in final range
if (idev == (int)gpu_partition.getNumActiveGPUs()-1)
nwork += nghost;

// setup the grid to run the kernel
dim3 grid( (nwork/run_block_size) + 1, 1, 1);
dim3 threads(run_block_size, 1, 1);

gpu_scale_netforce_kernel<<<grid, threads>>>(d_net_force, d_net_torque, d_net_virial, net_virial_pitch, fac, nwork, range.first);
}
}

//! Shared memory used in reducing the sums
@@ -51,15 +96,17 @@ extern __shared__ Scalar compute_pe_final_sdata[];

__global__ void gpu_compute_potential_energy_partial(Scalar *d_scratch,
Scalar4 *d_net_force,
unsigned int N,
const unsigned int nwork,
const unsigned int offset,
bool zero_energy)
{
// determine which particle this thread works on
int idx = blockIdx.x * blockDim.x + threadIdx.x;
int work_idx = blockIdx.x * blockDim.x + threadIdx.x;

Scalar my_element; // element of scratch space read in
if (idx < N)
if (work_idx < nwork)
{
const unsigned int idx = work_idx + offset;
Scalar4 net_force = d_net_force[idx];

// compute our contribution to the sum
@@ -137,32 +184,60 @@ __global__ void gpu_compute_potential_energy_final_sums(Scalar *d_scratch,

if (threadIdx.x == 0)
{
*d_sum = final_sum;
#if (__CUDA_ARCH__ >= 600)
atomicAdd_system(d_sum,final_sum);
#else
atomicAdd_double(d_sum,final_sum);
#endif
}
}

void gpu_reduce_potential_energy(Scalar *d_scratch,
Scalar4 *d_net_force,
unsigned int N,
const GPUPartition& gpu_partition,
const unsigned int nghost,
const unsigned int scratch_size,
Scalar *d_sum,
unsigned int n_blocks,
unsigned int block_size,
bool zero_energy)
bool zero_energy,
const unsigned int block_size)
{
dim3 grid(n_blocks, 1, 1);
dim3 threads(block_size, 1, 1);
unsigned int shared_bytes = sizeof(Scalar)*block_size;
static unsigned int max_block_size = UINT_MAX;
if (max_block_size == UINT_MAX)
{
cudaFuncAttributes attr;
cudaFuncGetAttributes(&attr, (const void *)gpu_compute_potential_energy_partial);
max_block_size = attr.maxThreadsPerBlock;
}

unsigned int run_block_size = min(block_size, max_block_size);

gpu_compute_potential_energy_partial<<<grid, threads, shared_bytes>>>(d_scratch, d_net_force, N, zero_energy);
// iterate over active GPUs in reverse, to end up on first GPU when returning from this function
for (int idev = gpu_partition.getNumActiveGPUs() - 1; idev >= 0; --idev)
{
auto range = gpu_partition.getRangeAndSetGPU(idev);

unsigned int nwork = range.second - range.first;

// process ghosts in final range
if (idev == (int)gpu_partition.getNumActiveGPUs()-1)
nwork += nghost;

int final_block_size = 512;
grid = dim3(1, 1, 1);
threads = dim3(final_block_size, 1, 1);
// setup the grid to run the kernel
unsigned int n_blocks = (nwork/run_block_size) + 1;
dim3 grid(n_blocks, 1, 1);
dim3 threads(run_block_size, 1, 1);

shared_bytes = sizeof(Scalar)*final_block_size;
unsigned int shared_bytes = sizeof(Scalar)*run_block_size;
gpu_compute_potential_energy_partial<<<grid, threads, shared_bytes>>>(d_scratch+idev*scratch_size, d_net_force, nwork, range.first, zero_energy);

// run the kernel
gpu_compute_potential_energy_final_sums<<<grid, threads, shared_bytes>>>(d_scratch,
n_blocks,
d_sum);
int final_block_size = 512;
grid = dim3(1, 1, 1);
threads = dim3(final_block_size, 1, 1);
shared_bytes = sizeof(Scalar)*final_block_size;

// run the kernel
gpu_compute_potential_energy_final_sums<<<grid, threads, shared_bytes>>>(d_scratch+idev*scratch_size,
n_blocks,
d_sum);
}
}
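The final-sum kernel needs a double-precision atomic add: on compute capability 6.0 and newer it can use atomicAdd_system, which is visible across devices sharing a managed allocation, while older hardware falls back to a device-scope compare-and-swap loop like the one defined at the top of this file. A minimal sketch of how such a guarded helper is typically written (the names are illustrative, not the file's own):

#include <cuda_runtime.h>

__device__ double my_atomic_add(double *address, double val)
    {
#if (__CUDA_ARCH__ >= 600)
    // system-wide atomic, visible across GPUs accessing the same managed allocation
    return atomicAdd_system(address, val);
#else
    // CAS fallback; the integer comparison avoids an endless loop if *address is NaN
    unsigned long long int *address_as_ull = (unsigned long long int *)address;
    unsigned long long int old = *address_as_ull, assumed;
    do
        {
        assumed = old;
        old = atomicCAS(address_as_ull, assumed,
                        __double_as_longlong(val + __longlong_as_double(assumed)));
        } while (assumed != old);
    return __longlong_as_double(old);
#endif
    }

__global__ void accumulate_kernel(const double *in, unsigned int n, double *sum)
    {
    unsigned int i = blockIdx.x*blockDim.x + threadIdx.x;
    if (i < n)
        my_atomic_add(sum, in[i]);
    }
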
(Diffs for the remaining 2 changed files are not shown.)
