diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 6fea6895..9c18de04 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -453,10 +453,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ZCOMBUFS(:,:) = 0. ZCOMBUFR(:,:) = 0. #ifdef ACCGPU - !$ACC END KERNELS + !$ACC END KERNELS #endif #ifdef OMPGPU - !$OMP END TARGET + !$OMP END TARGET #endif CALL GSTATS(1805,1) @@ -553,9 +553,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) #elif defined(_CRAYFTN) !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & - !$ACC& PRESENT(NGPBLKS,IFLDS,JK_MAX,IGPTRSEND,MYSETW,INDOFF,MYPROC,IGPTROFF,IFLDOFF, & - !$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,PGLAT,KINDEX,PGPUV,PGP2,PGP3A,PGP3B,IUVLEVS, & - !$ACC& IUVPARS,IGP2PARS,IGP3ALEVS,IGP3APARS,IGP3BLEVS,IGP3BPARS) + !$ACC COPYIN(NGPBLKS,IFLDS,JK_MAX,MYSETW,MYPROC) & + !$ACC& PRESENT(IGPTRSEND,INDOFF,IGPTROFF,IFLDOFF, & + !$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,PGLAT,KINDEX,PGPUV,PGP2,PGP3A,PGP3B, & + !$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) !!,IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS) #endif #endif DO JBLK=1,NGPBLKS @@ -637,10 +638,10 @@ SUBROUTINE TRGTOL_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC& PRESENT(IGPTRSEND,LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B, & !$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) #elif defined(_CRAYFTN) - !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) COPYIN(INS,JK_MAX,IJPOS,IFLDA) & - !$ACC& PRESENT(ISEND_FLD_END,NGPBLKS,ISETW,IPOS,IGPTRSEND,LLINDER,LLPGPONLY, & - !$ACC& LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B,IUVLEVS, & - !$ACC& IUVPARS,IGP2PARS,IGP3ALEVS,IGP3APARS,IGP3BLEVS,IGP3BPARS) + !$ACC PARALLEL LOOP COLLAPSE(3) DEFAULT(NONE) PRIVATE(JK,JI,IFLDT,IFIRST,ILAST) & + !$ACC COPYIN(NGPBLKS,ISEND_FLD_END,INS,JK_MAX,IJPOS,IFLDA,IPOS,ISETW,LLINDER,LLPGPONLY) & + !$ACC& PRESENT(IGPTRSEND,LLUV,LLGP2,LLGP3A,LLGP3B,ZCOMBUFS,KPTRGP,PGP,PGPUV,PGP2,PGP3A,PGP3B, & + !$ACC& IGP2PARS,IGP3ALEVS,IGP3APARS) !!,IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS) #endif #endif DO JJ=1,ISEND_FLD_END diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 63b5c5b1..c0f3c271 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -444,10 +444,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ZCOMBUFS(:,:) = 0. ZCOMBUFR(:,:) = 0. #ifdef ACCGPU - !$ACC END KERNELS + !$ACC END KERNELS #endif #ifdef OMPGPU - !$OMP END TARGET + !$OMP END TARGET #endif #ifdef ACCGPU @@ -506,7 +506,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & !$OMP& MAP(TO:IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) & !$OMP& SHARED(NGPBLKS,IFLDS,IGPTRSEND,MYSETW,INDOFF,MYPROC, & - !$OMP& IGPTROFF,LLINDER,KPTRGP,PGP,PGLAT,KINDEX,IFLDOFF,JK_MAX) + !$OMP& LLINDER,KPTRGP,PGP,PGLAT,KINDEX) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & @@ -550,11 +550,10 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC& PGLAT,LLGP3A,PGP3A,LLGP3B,PGP3B,KINDEX) #elif defined(_CRAYFTN) !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) & - !$ACC& COPYIN(IGPTROFF,IFLDOFF,JK_MAX) COLLAPSE(3) & - !$ACC& PRESENT(NGPBLKS,IFLDS,IGPTRSEND,MYSETW,INDOFF,MYPROC, & - !$ACC& LLUV,PGPUV,IUVLEVS,IUVPARS,LLGP2,PGP2,IGP2PARS, & - !$ACC& PGLAT,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS,LLGP3B,PGP3B,& - !$ACC& IGP3BLEVS,IGP3BPARS,KINDEX) + !$ACC& COPYIN(NGPBLKS,IGPTROFF,IFLDOFF,JK_MAX,IFLDS,MYPROC,MYSETW) COLLAPSE(3) & + !$ACC& PRESENT(IGPTRSEND,INDOFF,LLUV,PGPUV,LLGP2,PGP2,IGP2PARS, & + !$ACC& PGLAT,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS,LLGP3B,PGP3B,KINDEX) !!,& + !!!$ACC& IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS) #endif #endif DO JBLK=1,NGPBLKS @@ -608,8 +607,8 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN) COLLAPSE(2) & !$ACC& PRESENT(KINDEX,INDOFF,ZCOMBUFS,PGLAT) #elif defined(_CRAYFTN) - !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN) COLLAPSE(2) & - !$ACC& PRESENT(KF_FS,KINDEX,INDOFF,ISEND,ZCOMBUFS,INS,PGLAT) + !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(II) COPYIN(ILEN,INS,ISEND,KF_FS) COLLAPSE(2) & + !$ACC& PRESENT(KINDEX,INDOFF,ZCOMBUFS,PGLAT) #endif #endif DO JL=1,ILEN @@ -756,11 +755,11 @@ SUBROUTINE TRLTOG_CUDAAWARE(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& !$ACC& LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,LLGP3B,PGP3B) #elif defined(_CRAYFTN) !$ACC PARALLEL LOOP DEFAULT(NONE) PRIVATE(IFIRST,ILAST,JI,JK,IFLDT) & - !$ACC& COPYIN(INR,KF_FS,IPOS,JPOS,IFLD,IFLDA,JK_MAX,IRECV_FLD_END) COLLAPSE(3) & - !$ACC& PRESENT(NGPBLKS,IGPTRSEND,ISETW, & - !$ACC& LLINDER,PGP,KPTRGP,ZCOMBUFR,LLPGPONLY,LLUV,PGPUV,IUVLEVS, & - !$ACC& IUVPARS,LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS, & - !$ACC& LLGP3B,PGP3B,IGP3BLEVS,IGP3BPARS) + !$ACC& COPYIN(NGPBLKS,INR,KF_FS,IPOS,JPOS,IFLD,IFLDA,JK_MAX,IRECV_FLD_END,ISETW,LLINDER,LLPGPONLY) COLLAPSE(3) & + !$ACC& PRESENT(IGPTRSEND,PGP,KPTRGP,ZCOMBUFR,LLUV,PGPUV, & + !$ACC& LLGP2,PGP2,IGP2PARS,LLGP3A,PGP3A,IGP3ALEVS,IGP3APARS, & + !$ACC& LLGP3B,PGP3B) !, + !!$ACC& IUVLEVS,IUVPARS,IGP3BLEVS,IGP3BPARS) #endif #endif DO JBLK=1,NGPBLKS diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index bf414da6..5d573dd9 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -166,7 +166,8 @@ END FUNCTION ALLTOALLV_CUDAIPC !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) #elif defined(_CRAYFTN) !$ACC KERNELS ASYNC(1) DEFAULT(NONE) & - !$ACC& PRESENT(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) + !$ACC& COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) & + !$ACC& PRESENT(PFBUF,PFBUF_IN) #endif #endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index e28afea3..f479d93e 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -142,8 +142,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) #if defined(__NVCOMPILER) || defined(__PGI) !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) #elif defined(_CRAYFTN) - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) & - !$ACC& PRESENT(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) + !$ACC KERNELS ASYNC(1) DEFAULT(NONE) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) & + !$ACC& PRESENT(PFBUF,PFBUF_IN) #endif #endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) @@ -159,15 +159,11 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) CALL GSTATS(410,0) #ifdef ACCGPU - !$ACC DATA PRESENT(PFBUF_IN, PFBUF) + !$ACC DATA PRESENT(PFBUF_IN,PFBUF) COPYIN(ILENS,ILENR,IOFFS,IOFFR) + !$ACC HOST_DATA USE_DEVICE(PFBUF_IN,PFBUF) #endif #ifdef OMPGPU !$OMP TARGET DATA MAP(PRESENT,ALLOC:PFBUF_IN, PFBUF) -#endif -#ifdef ACCGPU - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) -#endif -#ifdef OMPGPU !$OMP TARGET DATA USE_DEVICE_PTR(PFBUF_IN, PFBUF) #endif CALL GSTATS(807,0) @@ -202,7 +198,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(PFBUF_IN,PFBUF,KFIELD) !$OMP& SHARED(ISTA,ILEN,PFBUF,PFBUF_IN) #endif #ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(ISTA,ILEN,PFBUF,PFBUF_IN) + !$ACC PARALLEL LOOP DEFAULT(NONE) COPYIN(ISTA,ILEN) PRESENT(PFBUF,PFBUF_IN) #endif DO J=ISTA,ISTA+ILEN-1 PFBUF(J) = PFBUF_IN(J)