Skip to content

Commit

Permalink
GPU-aware MPI: improved data clauses
Browse files Browse the repository at this point in the history
  • Loading branch information
reuterbal committed Aug 24, 2023
1 parent 2aac87e commit ab807d2
Show file tree
Hide file tree
Showing 4 changed files with 31 additions and 34 deletions.
19 changes: 10 additions & 9 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -453,10 +453,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
ZCOMBUFS(:,:) = 0.
ZCOMBUFR(:,:) = 0.
#ifdef ACCGPU
!$ACC END KERNELS
!$ACC END KERNELS
#endif
#ifdef OMPGPU
!$OMP END TARGET
!$OMP END TARGET
#endif

CALL GSTATS(1805,1)
Expand Down Expand Up @@ -553,9 +553,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS)
#elif defined(_CRAYFTN)
!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) &
!$ACC& PRESENT(NGPBLKS,IFLDS,JK_MAX,IGPTRSEND,MYSETW,INDOFF,MYPROC,IGPTROFF,IFLDOFF, &
!$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,PGLAT,KINDEX,PGPUV,PGP2,PGP3A,PGP3B,IUVLEVS, &
!$ACC& IUVPARS,IGP2PARS,IGP3ALEVS,IGP3APARS,IGP3BLEVS,IGP3BPARS)
!$ACC COPYIN(NGPBLKS,IFLDS,JK_MAX,MYSETW,MYPROC) &
!$ACC& PRESENT(IGPTRSEND,INDOFF,IGPTROFF,IFLDOFF, &
!$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,PGLAT,KINDEX,PGPUV,PGP2,PGP3A,PGP3B, &
!$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) !!,IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS)
#endif
#endif
DO JBLK=1,NGPBLKS
Expand Down Expand Up @@ -637,10 +638,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$ACC& PRESENT(IGPTRSEND,LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B, &
!$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS)
#elif defined(_CRAYFTN)
!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(INS,JK_MAX,IJPOS,IFLDA) &
!$ACC& PRESENT(ISEND_FLD_END,NGPBLKS,ISETW,IPOS,IGPTRSEND,LLINDER,LLPGPONLY, &
!$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B,IUVLEVS, &
!$ACC& IUVPARS,IGP2PARS,IGP3ALEVS,IGP3APARS,IGP3BLEVS,IGP3BPARS)
!$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) &
!$ACC COPYIN(NGPBLKS,ISEND_FLD_END,INS,JK_MAX,IJPOS,IFLDA,IPOS,ISETW,LLINDER,LLPGPONLY) &
!$ACC& PRESENT(IGPTRSEND,LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B, &
!$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) !!,IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS)
#endif
#endif
DO JJ=1,ISEND_FLD_END
Expand Down
29 changes: 14 additions & 15 deletions src/trans/gpu/internal/trltog_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -444,10 +444,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
ZCOMBUFS(:,:) = 0.
ZCOMBUFR(:,:) = 0.
#ifdef ACCGPU
!$ACC END KERNELS
!$ACC END KERNELS
#endif
#ifdef OMPGPU
!$OMP END TARGET
!$OMP END TARGET
#endif

#ifdef ACCGPU
Expand Down Expand Up @@ -506,7 +506,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) &
!$OMP& MAP(TO:IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) &
!$OMP& SHARED(NGPBLKS,IFLDS,IGPTRSEND,MYSETW,INDOFF,MYPROC, &
!$OMP& IGPTROFF,LLINDER,KPTRGP,PGP,PGLAT,KINDEX,IFLDOFF,JK_MAX)
!$OMP& LLINDER,KPTRGP,PGP,PGLAT,KINDEX)
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) &
Expand Down Expand Up @@ -550,11 +550,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$ACC& PGLAT,LLGP3A,PGP3A,LLGP3B,PGP3B,KINDEX)
#elif defined(_CRAYFTN)
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) &
!$ACC& COPYIN(IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) &
!$ACC& PRESENT(NGPBLKS,IFLDS,IGPTRSEND,MYSETW,INDOFF,MYPROC, &
!$ACC& LLUV,PGPUV,IUVLEVS,IUVPARS,LLGP2,PGP2,IGP2PARS, &
!$ACC& PGLAT,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS,LLGP3B,PGP3B,&
!$ACC& IGP3BLEVS,IGP3BPARS,KINDEX)
!$ACC& COPYIN(NGPBLKS,IGPTROFF,IFLDOFF,JK_MAX,IFLDS,MYPROC,MYSETW) COLLAPSE(3) &
!$ACC& PRESENT(IGPTRSEND,INDOFF,LLUV,PGPUV,LLGP2,PGP2,IGP2PARS, &
!$ACC& PGLAT,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS,LLGP3B,PGP3B,KINDEX) !!,&
!!!$ACC& IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS)
#endif
#endif
DO JBLK=1,NGPBLKS
Expand Down Expand Up @@ -608,8 +607,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN) COLLAPSE(2) &
!$ACC& PRESENT(KINDEX,INDOFF,ZCOMBUFS,PGLAT)
#elif defined(_CRAYFTN)
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN) COLLAPSE(2) &
!$ACC& PRESENT(KF_FS,KINDEX,INDOFF,ISEND,ZCOMBUFS,INS,PGLAT)
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN,INS,ISEND,KF_FS) COLLAPSE(2) &
!$ACC& PRESENT(KINDEX,INDOFF,ZCOMBUFS,PGLAT)
#endif
#endif
DO JL=1,ILEN
Expand Down Expand Up @@ -756,11 +755,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
!$ACC& LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,LLGP3B,PGP3B)
#elif defined(_CRAYFTN)
!$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IFIRST,ILAST,JI,JK,IFLDT) &
!$ACC& COPYIN(INR,KF_FS,IPOS,JPOS,IFLD,IFLDA,JK_MAX,IRECV_FLD_END) COLLAPSE(3) &
!$ACC& PRESENT(NGPBLKS,IGPTRSEND,ISETW, &
!$ACC& LLINDER,PGP,KPTRGP,ZCOMBUFR,LLPGPONLY,LLUV,PGPUV,IUVLEVS, &
!$ACC& IUVPARS,LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS, &
!$ACC& LLGP3B,PGP3B,IGP3BLEVS,IGP3BPARS)
!$ACC& COPYIN(NGPBLKS,INR,KF_FS,IPOS,JPOS,IFLD,IFLDA,JK_MAX,IRECV_FLD_END,ISETW,LLINDER,LLPGPONLY) COLLAPSE(3) &
!$ACC& PRESENT(IGPTRSEND,PGP,KPTRGP,ZCOMBUFR,LLUV,PGPUV, &
!$ACC& LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS, &
!$ACC& LLGP3B,PGP3B) !,
!!$ACC& IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS)
#endif
#endif
DO JBLK=1,NGPBLKS
Expand Down
3 changes: 2 additions & 1 deletion src/trans/gpu/internal/trltom_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -166,7 +166,8 @@ END FUNCTION ALLTOALLV_CUDAIPC
!$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN)
#elif defined(_CRAYFTN)
!$ACC KERNELS ASYNC(1) DEFAULT(NONE) &
!$ACC& PRESENT(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND)
!$ACC& COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) &
!$ACC& PRESENT(PFBUF,PFBUF_IN)
#endif
#endif
PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND)
Expand Down
14 changes: 5 additions & 9 deletions src/trans/gpu/internal/trmtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -142,8 +142,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD)
#if defined(__NVCOMPILER) || defined(__PGI)
!$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN)
#elif defined(_CRAYFTN)
!$ACC KERNELS ASYNC(1) DEFAULT(NONE) &
!$ACC& PRESENT(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND)
!$ACC KERNELS ASYNC(1) DEFAULT(NONE) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) &
!$ACC& PRESENT(PFBUF,PFBUF_IN)
#endif
#endif
PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND)
Expand All @@ -159,15 +159,11 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD)
CALL GSTATS(410,0)

#ifdef ACCGPU
!$ACC DATA PRESENT(PFBUF_IN, PFBUF)
!$ACC DATA PRESENT(PFBUF_IN,PFBUF) COPYIN(ILENS,ILENR,IOFFS,IOFFR)
!$ACC HOST_DATA USE_DEVICE(PFBUF_IN,PFBUF)
#endif
#ifdef OMPGPU
!$OMP TARGET DATA MAP(PRESENT,ALLOC:PFBUF_IN, PFBUF)
#endif
#ifdef ACCGPU
!$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF)
#endif
#ifdef OMPGPU
!$OMP TARGET DATA USE_DEVICE_PTR(PFBUF_IN, PFBUF)
#endif
CALL GSTATS(807,0)
Expand Down Expand Up @@ -202,7 +198,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD)
!$OMP& SHARED(ISTA,ILEN,PFBUF,PFBUF_IN)
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(ISTA,ILEN,PFBUF,PFBUF_IN)
!$ACC PARALLEL LOOP DEFAULT(NONE) COPYIN(ISTA,ILEN) PRESENT(PFBUF,PFBUF_IN)
#endif
DO J=ISTA,ISTA+ILEN-1
PFBUF(J) = PFBUF_IN(J)
Expand Down

0 comments on commit ab807d2

Please sign in to comment.