Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

More efficient unweighting using the GPU #642

Draft
wants to merge 4 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions epochX/cudacpp/gg_tt.mad/SubProcesses/fbridge.inc
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,20 @@ C
END SUBROUTINE FBRIDGESEQUENCE_NOMULTICHANNEL
END INTERFACE

C
C Compute the maximum event weight of the batch on the GPU
C - PBRIDGE: the memory address of the C++ Bridge
C - ALL_WGT: the Jacobian+PDF weights from Fortran (array of size nevt)
C - MAX_WGT: maximum observed weight on GPU (output, scalar)
C
      INTERFACE
         SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT(PBRIDGE, ALL_WGT, MAX_WGT)
         INTEGER*8 PBRIDGE
         DOUBLE PRECISION ALL_WGT(*)
C        Fixed: 'DOUBLE_PRECISION' (with underscore) is not a Fortran type
C        keyword; it would have left MAX_WGT implicitly typed (REAL M...)
C        and mismatched against the C 'double*' argument.
         DOUBLE PRECISION MAX_WGT
         END SUBROUTINE FBRIDGECOMPUTEMAXWEIGHT
      END INTERFACE

C
C Retrieve the number of good helicities for helicity filtering in the Bridge.
C - PBRIDGE: the memory address of the C++ Bridge
Expand Down
121 changes: 73 additions & 48 deletions epochX/cudacpp/gg_ttgg.mad/SubProcesses/Bridge.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,41 @@ namespace mg5amcCpu
virtual ~CppObjectInFortran() {}
};

#ifdef __CUDACC__
  /**
   * Multiply each Jacobian+PDF weight by the corresponding squared matrix element, in place.
   * Grid-stride loop: correct for any <<<blocks, threads>>> launch configuration.
   *
   * @param matrixElmSq device array of squared matrix elements (size n)
   * @param weights     device array of Jacobian+PDF weights, overwritten with the final weights (size n)
   * @param n           number of events in the batch
   */
  template<typename T, typename U>
  __global__ void computeFinalWeightKernel( T const* matrixElmSq, U* weights, const std::size_t n )
  {
    const std::size_t stride = static_cast<std::size_t>( blockDim.x ) * gridDim.x;
    for( std::size_t i = threadIdx.x + static_cast<std::size_t>( blockIdx.x ) * blockDim.x; i < n; i += stride )
    {
      weights[i] = weights[i] * matrixElmSq[i];
    }
  }

  // Branchless max usable in device code (std::max is not available in all device contexts)
  template<typename T>
  __device__ __forceinline__ T max( T a, T b )
  {
    return a < b ? b : a;
  }

  /**
   * Single-block reduction: compute the maximum of weights[0..n-1] and store it in weights[0].
   * Only block 0 participates (the result must end up in one place without atomics),
   * so this kernel is best launched with a single block; any extra blocks exit immediately.
   * Assumes non-negative weights (the running maximum starts at 0).
   *
   * @param weights device array of weights (size n); weights[0] is overwritten with the maximum
   * @param n       number of events in the batch
   */
  template<typename T>
  __global__ void computeMaxWeightKernel( T* weights, const std::size_t n )
  {
    // blockDim.x can never exceed 1024 on current hardware, so this static buffer is always large enough
    __shared__ T sharedMaxWeight[1024];

    // Blocks other than 0 do no work: bail out before touching shared memory or the barrier
    if( blockIdx.x != 0 ) return;

    // Per-thread partial maximum over a strided slice of the input
    T maxWeight = 0;
    for( unsigned int i = threadIdx.x; i < n; i += blockDim.x )
    {
      maxWeight = max( maxWeight, weights[i] );
    }
    sharedMaxWeight[threadIdx.x] = maxWeight;
    __syncthreads();

    // Tree reduction in shared memory, O(log blockDim.x) steps.
    // Written to be correct for non-power-of-two block sizes: each round folds the
    // upper half [half, active) onto the lower indices and halves the active range.
    unsigned int active = blockDim.x;
    while( active > 1 )
    {
      const unsigned int half = ( active + 1 ) / 2;
      if( threadIdx.x < active - half )
        sharedMaxWeight[threadIdx.x] = max( sharedMaxWeight[threadIdx.x], sharedMaxWeight[threadIdx.x + half] );
      __syncthreads(); // uniform across block 0: every thread executes every iteration
      active = half;
    }

    if( threadIdx.x == 0 ) weights[0] = sharedMaxWeight[0];
  }
#endif

//--------------------------------------------------------------------------
/**
* A templated class for calling the CUDA/C++ matrix element calculations of the event generation workflow.
Expand Down Expand Up @@ -122,6 +157,14 @@ namespace mg5amcCpu
int* selhel,
int* selcol,
const bool goodHelOnly = false );

/**
* Compute the maximum event weight on the device.
*
* @param jacobianWeights Jacobian and PDF weights from madevent
*/
FORTRANFPTYPE computeMaxWeight( const FORTRANFPTYPE* jacobianWeights );

#else
/**
* Sequence to be executed for the vectorized CPU matrix element calculation
Expand Down Expand Up @@ -168,6 +211,7 @@ namespace mg5amcCpu
DeviceBufferMatrixElements m_devMEs;
DeviceBufferSelectedHelicity m_devSelHel;
DeviceBufferSelectedColor m_devSelCol;
std::unique_ptr<mg5amcGpu::DeviceBuffer<FORTRANFPTYPE, 1U>> m_devJacobianWeights = nullptr;
PinnedHostBufferGs m_hstGs;
PinnedHostBufferRndNumHelicity m_hstRndHel;
PinnedHostBufferRndNumColor m_hstRndCol;
Expand Down Expand Up @@ -315,18 +359,10 @@ namespace mg5amcCpu
//const int thrPerEvt = 1; // AV: try new alg with 1 event per thread... this seems slower
gpuLaunchKernel( dev_transposeMomentaF2C, m_gpublocks * thrPerEvt, m_gputhreads, m_devMomentaF.data(), m_devMomentaC.data(), m_nevt );
}
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) );
memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) );
memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) );
}
else
{
std::copy( gs, gs + m_nevt, m_hstGs.data() );
std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() );
std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() );
}
std::copy( gs, gs + m_nevt, m_hstGs.data() );
std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() );
std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() );

copyDeviceFromHost( m_devGs, m_hstGs );
copyDeviceFromHost( m_devRndHel, m_hstRndHel );
copyDeviceFromHost( m_devRndCol, m_hstRndCol );
Expand All @@ -341,19 +377,26 @@ namespace mg5amcCpu
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
copyHostFromDevice( m_hstSelHel, m_devSelHel );
copyHostFromDevice( m_hstSelCol, m_devSelCol );
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() );
memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() );
}
else
{
std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes );
std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel );
std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol );
}
std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes );
std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel );
std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol );
}

template<typename FORTRANFPTYPE>
FORTRANFPTYPE Bridge<FORTRANFPTYPE>::computeMaxWeight( const FORTRANFPTYPE* jacobianWeights )
{
if( !m_devJacobianWeights ) m_devJacobianWeights = std::make_unique<DeviceBuffer<FORTRANFPTYPE, 1U>>( m_nevt );
checkCuda( cudaMemcpy(m_devJacobianWeights->data(), jacobianWeights, m_nevt*sizeof(FORTRANFPTYPE), cudaMemcpyDefault) );

computeFinalWeightKernel<<<m_gpublocks, m_gputhreads>>>( m_devMEs.data(), m_devJacobianWeights->data(), m_nevt );
computeMaxWeightKernel<<<m_gpublocks, m_gputhreads>>>( m_devJacobianWeights->data(), m_nevt );

FORTRANFPTYPE maxWeight = 0.;
checkCuda( cudaMemcpy( &maxWeight, m_devJacobianWeights->data(), sizeof( FORTRANFPTYPE ), cudaMemcpyDefault ) );

return maxWeight;
}

#endif

#ifndef MGONGPUCPP_GPUIMPL
Expand All @@ -369,18 +412,9 @@ namespace mg5amcCpu
const bool goodHelOnly )
{
hst_transposeMomentaF2C( momenta, m_hstMomentaC.data(), m_nevt );
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( m_hstGs.data(), gs, m_nevt * sizeof( FORTRANFPTYPE ) );
memcpy( m_hstRndHel.data(), rndhel, m_nevt * sizeof( FORTRANFPTYPE ) );
memcpy( m_hstRndCol.data(), rndcol, m_nevt * sizeof( FORTRANFPTYPE ) );
}
else
{
std::copy( gs, gs + m_nevt, m_hstGs.data() );
std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() );
std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() );
}
std::copy( gs, gs + m_nevt, m_hstGs.data() );
std::copy( rndhel, rndhel + m_nevt, m_hstRndHel.data() );
std::copy( rndcol, rndcol + m_nevt, m_hstRndCol.data() );
if( m_nGoodHel < 0 )
{
m_nGoodHel = m_pmek->computeGoodHelicities();
Expand All @@ -389,18 +423,9 @@ namespace mg5amcCpu
if( goodHelOnly ) return;
m_pmek->computeMatrixElements( channelId );
flagAbnormalMEs( m_hstMEs.data(), m_nevt );
if constexpr( std::is_same_v<FORTRANFPTYPE, fptype> )
{
memcpy( mes, m_hstMEs.data(), m_hstMEs.bytes() );
memcpy( selhel, m_hstSelHel.data(), m_hstSelHel.bytes() );
memcpy( selcol, m_hstSelCol.data(), m_hstSelCol.bytes() );
}
else
{
std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes );
std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel );
std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol );
}
std::copy( m_hstMEs.data(), m_hstMEs.data() + m_nevt, mes );
std::copy( m_hstSelHel.data(), m_hstSelHel.data() + m_nevt, selhel );
std::copy( m_hstSelCol.data(), m_hstSelCol.data() + m_nevt, selcol );
}
#endif

Expand Down
25 changes: 23 additions & 2 deletions epochX/cudacpp/gg_ttgg.mad/SubProcesses/P1_gg_ttxgg/auto_dsig1.f
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,7 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
C
DOUBLE PRECISION ALL_PP(0:3,NEXTERNAL,VECSIZE_MEMMAX)
DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX)
DOUBLE PRECISION ALL_JACANDPDF(VECSIZE_MEMMAX)
DOUBLE PRECISION ALL_XBK(2,VECSIZE_MEMMAX)
DOUBLE PRECISION ALL_Q2FACT(2,VECSIZE_MEMMAX)
DOUBLE PRECISION ALL_CM_RAP(VECSIZE_MEMMAX)
Expand Down Expand Up @@ -397,8 +398,15 @@ DOUBLE PRECISION FUNCTION DSIG1_VEC(ALL_PP, ALL_XBK, ALL_Q2FACT,
CALL RANMAR(HEL_RAND(IVEC))
CALL RANMAR(COL_RAND(IVEC))
ENDDO

! Prepare computation of max weight on GPU
! This requires the jacobian and PDF weights
DO IVEC=1,VECSIZE_USED
ALL_JACANDPDF(IVEC) = ALL_WGT(IVEC)*ALL_PD(0,IVEC)
ENDDO

CALL SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
$ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED)
$ ALL_OUT , SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_JACANDPDF)


DO IVEC=1,VECSIZE_USED
Expand Down Expand Up @@ -465,7 +473,7 @@ SUBROUTINE PRINT_ZERO_AMP1()


SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
$ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED)
$ OUT, SELECTED_HEL, SELECTED_COL, VECSIZE_USED, ALL_WGT)
USE OMP_LIB
IMPLICIT NONE

Expand All @@ -482,6 +490,8 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
INTEGER SELECTED_HEL(VECSIZE_MEMMAX)
INTEGER SELECTED_COL(VECSIZE_MEMMAX)
INTEGER VECSIZE_USED
DOUBLE PRECISION ALL_WGT(VECSIZE_MEMMAX)


INTEGER IVEC
INTEGER IEXT
Expand All @@ -506,6 +516,11 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
DOUBLE PRECISION CBYF1
INTEGER*4 NGOODHEL, NTOTHEL

C Pull in twgt to allow for more efficient unweighting
double precision twgt, maxwgt,swgt(maxevents)
integer lun, nw, itmin
common/to_unwgt/twgt, maxwgt, swgt, lun, nw, itmin

INTEGER*4 NWARNINGS
SAVE NWARNINGS
DATA NWARNINGS/0/
Expand Down Expand Up @@ -576,6 +591,12 @@ SUBROUTINE SMATRIX1_MULTI(P_MULTI, HEL_RAND, COL_RAND, CHANNEL,
& SELECTED_HEL2, SELECTED_COL2, .FALSE.) ! do not quit after computing helicities
ENDIF
call counters_smatrix1multi_stop( 0 ) ! cudacppMEs=0

! Compute the maximum weight of this batch. It needs to be
! rescaled by CONV=389379660d0 to be compatible with the scale
! used in the common block to_unwgt
CALL FBRIDGECOMPUTEMAXWEIGHT(FBRIDGE_PBRIDGE, ALL_WGT, twgt)
twgt = twgt * 389379660d0
ENDIF

IF( FBRIDGE_MODE .LT. 0 ) THEN ! (BothQuiet=-1 or BothDebug=-2)
Expand Down
13 changes: 13 additions & 0 deletions epochX/cudacpp/gg_ttgg.mad/SubProcesses/fbridge.cc
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,19 @@ extern "C"
#endif
}

void fbridgecomputemaxweight_( CppObjectInFortran** ppbridge,
const FORTRANFPTYPE* jacobianWeights,
double * maxWeight)
{
#ifdef __CUDACC__
Bridge<FORTRANFPTYPE>* pbridge = static_cast<Bridge<FORTRANFPTYPE>*>( *ppbridge );
*maxWeight = pbridge->computeMaxWeight(jacobianWeights);
#else
std::cerr << __func__ << " only implemented on GPU.\n";
*maxWeight = 0.;
#endif
}

/**
* Execute the matrix-element calculation "sequence" via a Bridge on GPU/CUDA or CUDA/C++, without multi-channel mode.
* This is a C symbol that should be called from the Fortran code (in auto_dsig1.f).
Expand Down
2 changes: 1 addition & 1 deletion epochX/cudacpp/gg_ttgg.mad/SubProcesses/unwgt.f
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ SUBROUTINE unwgt(px,wgt,numproc, ihel, icol, ivec)
c
data idum/-1/
data yran/1d0/
data fudge/10d0/
data fudge/0.5d0/
C-----
C BEGIN CODE
C-----
Expand Down
Loading