diff --git a/src/trans/gpu/internal/ftdir_ctl_mod.F90 b/src/trans/gpu/internal/ftdir_ctl_mod.F90 index 6f0852d5..5d98add9 100755 --- a/src/trans/gpu/internal/ftdir_ctl_mod.F90 +++ b/src/trans/gpu/internal/ftdir_ctl_mod.F90 @@ -72,7 +72,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, & USE FTDIR_MOD ,ONLY : FTDIR use ieee_arithmetic ! - IMPLICIT NONE @@ -91,7 +90,6 @@ END SUBROUTINE cudaProfilerStop END INTERFACE ! Dummy arguments - INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:) INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:) @@ -204,14 +202,14 @@ END SUBROUTINE cudaProfilerStop !$ACC END DATA #endif #else -CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& - &PGP,PGPUV,PGP3A,PGP3B,PGP2) #ifdef ACCGPU -!$ACC UPDATE DEVICE(ZGTF) +!$ACC DATA CREATE(ZGTF) #endif #ifdef OMPGPU -!$OMP TARGET UPDATE TO(ZGTF) +!$OMP TARGET DATA MAP(ALLOC:ZGTF) #endif +CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,& + &PGP,PGPUV,PGP3A,PGP3B,PGP2) #endif CALL GSTATS(158,1) @@ -254,9 +252,11 @@ END SUBROUTINE cudaProfilerStop #ifndef USE_CUDA_AWARE_MPI_FT #ifdef ACCGPU !$ACC UPDATE HOST(FOUBUF_IN) + !$ACC END DATA !!ZGTF #endif #ifdef OMPGPU !$OMP TARGET UPDATE FROM(FOUBUF_IN) + !$OMP END TARGET DATA !!ZGTF #endif #endif diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index cd08fdf3..c87a70c8 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -932,7 +932,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& &ISETV, ISEND, ISENDSET, ITAG, J, JBLK, JFLD, & &JGL, JK, JL, JLOOP, ISETW, IFLD, & &II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, & - &INSEND,INS,INR,IR, iunit + &INSEND,INS,INR,IR, iunit, JKL, JK_MAX ! LOCAL LOGICAL SCALARS LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY @@ -1221,11 +1221,28 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(-1:IRECVCOUNT,INRECV)) CALL GSTATS(1805,1) - + ! Send loop............................................................. ! Copy local contribution - + +#ifdef ACCGPU + !$ACC DATA COPYIN(KPTRGP,INDOFF,KINDEX) + !$ACC DATA COPYIN(PGP) + !$ACC DATA COPYIN(PGPUV,IUVLEVS,IUVPARS) + !$ACC DATA COPYIN(PGP2,IGP2PARS) + !$ACC DATA COPYIN(PGP3A,IGP3ALEVS,IGP3APARS) + !$ACC DATA COPYIN(PGP3B,IGP3BLEVS,IGP3BPARS) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:INDOFF,KINDEX,KPTRGP) + !$OMP TARGET DATA MAP(TO:PGP) + !$OMP TARGET DATA MAP(TO:PGPUV,IUVLEVS,IUVPARS) + !$OMP TARGET DATA MAP(TO:PGP2,IGP2PARS) + !$OMP TARGET DATA MAP(TO:PGP3A,IGP3ALEVS,IGP3APARS) + !$OMP TARGET DATA MAP(TO:PGP3B,IGP3BLEVS,IGP3BPARS) +#endif + IF(ISENDTOT(MYPROC) > 0 )THEN IFLDS = 0 DO JFLD=1,KF_GP @@ -1240,67 +1257,102 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ENDDO IPOS=0 + JK_MAX = 0 DO JBLK=1,NGPBLKS IGPTROFF(JBLK)=IPOS IFIRST = IGPTRSEND(1,JBLK,MYSETW) IF(IFIRST > 0) THEN ILAST = IGPTRSEND(2,JBLK,MYSETW) IPOS=IPOS+ILAST-IFIRST+1 + IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1) ENDIF ENDDO - +#ifdef ACCGPU + !$ACC DATA COPYIN(IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND) +#endif CALL GSTATS(1601,0) -#ifdef NECSX - !$OMP PARALLEL DO SCHEDULE(DYNAMIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) -#else - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST) + IF(LLPGPONLY) THEN +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) #endif - DO JBLK=1,NGPBLKS - IFIRST = IGPTRSEND(1,JBLK,MYSETW) - IF(IFIRST > 0) THEN - ILAST = IGPTRSEND(2,JBLK,MYSETW) - IF(LLPGPONLY) THEN - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 - DO JFLD=1,IFLDS +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) +#endif + DO JBLK=1,NGPBLKS + DO JFLD=1,IFLDS + DO JKL=1, JK_MAX + IFIRST = IGPTRSEND(1,JBLK,MYSETW) + ILAST = IGPTRSEND(2,JBLK,MYSETW) + JK = JKL+IFIRST-1 + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 IFLD = IFLDOFF(JFLD) PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLD,JBLK) - ENDDO + ENDIF ENDDO - ELSE - DO JFLD=1,IFLDS - IFLD = IFLDOFF(JFLD) - IF(LLUV(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + ENDDO + ENDDO +#ifdef OMPGPU + !$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif + ELSE +#ifdef ACCGPU + !$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) +#endif +#ifdef OMPGPU + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK) +#endif + DO JBLK=1,NGPBLKS + DO JFLD=1,IFLDS + DO JKL=1, JK_MAX + IFIRST = IGPTRSEND(1,JBLK,MYSETW) + ILAST = IGPTRSEND(2,JBLK,MYSETW) + JK = JKL+IFIRST-1 + IF(IFIRST > 0 .AND. JK <= ILAST) THEN + IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + IFLD = IFLDOFF(JFLD) + IF(LLUV(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP2(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + ELSEIF(LLGP2(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP3A(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + ELSEIF(LLGP3A(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK) - ENDDO - ELSEIF(LLGP3B(IFLD)) THEN - DO JK=IFIRST,ILAST - IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1 + ELSEIF(LLGP3B(IFLD)) THEN PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK) - ENDDO - ELSE - CALL ABORT_TRANS('TRLTOG_MOD: ERROR') + ENDIF ENDIF ENDDO - ENDIF - ENDIF - ENDDO - !$OMP END PARALLEL DO + ENDDO + ENDDO +#ifdef OMPGPU + !$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO +#endif + ENDIF +#ifdef ACCGPU + !$ACC END DATA !! IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA !! IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND +#endif CALL GSTATS(1601,1) - ENDIF +#ifdef ACCGPU + !$ACC END DATA !! PRESENT(PGP3B) + !$ACC END DATA !! PRESENT(PGP3A) + !$ACC END DATA !! PRESENT(PGP2) + !$ACC END DATA !! PRESENT(PGPUV) + !$ACC END DATA !! PRESENT(PGP) +#endif +#ifdef OMPGPU + !$OMP END TARGET DATA !! PRESENT(PGP3B) + !$OMP END TARGET DATA !! PRESENT(PGP3A) + !$OMP END TARGET DATA !! PRESENT(PGP2) + !$OMP END TARGET DATA !! PRESENT(PGPUV) + !$OMP END TARGET DATA !! PRESENT(PGP) +#endif #ifdef COMVERBOSE call MPI_BARRIER(MPI_COMM_WORLD,IERROR) @@ -1409,6 +1461,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& call MPI_BARRIER(MPI_COMM_WORLD,IERROR) Tc=TIMEF() #endif + ! Receive loop......................................................... DO INR=1,INRECV IR=IR+1 @@ -1455,8 +1508,14 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& CALL GSTATS(1603,0) - - !$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JL,II,JFLD,INR,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END,IPOS) +#ifdef ACCGPU + !$ACC DATA COPYIN(JRECV,ZCOMBUFR,IRECVTOT) + !$ACC PARALLEL LOOP PRIVATE(II,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END) +#endif +#ifdef OMPGPU + !$OMP TARGET DATA MAP(TO:JRECV,ZCOMBUFR,IRECVTOT) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO PRIVATE(II,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END) +#endif DO INR=1,INRECV IRECV=JRECV(INR) ILEN = IRECVTOT(IRECV)/KF_FS @@ -1468,9 +1527,16 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-IRECV_FLD_START)*ILEN,INR) ENDDO ENDDO - ENDDO - !$OMP END PARALLEL DO - + ENDDO +#ifdef ACCGPU + !$ACC END DATA !! JRECV,ZCOMBUFR,IRECVTOT + !$ACC END DATA !! INDOFF KINDEX KPTRGP +#endif +#ifdef OMPGPU + !$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO + !$OMP END TARGET DATA !! JRECV,ZCOMBUFR,IRECVTOT + !$OMP END TARGET DATA !! INDOFF KINDEX KPTRGP +#endif ! this appears to be important (otherwise, old data picked in PGLAT) ! in particular, one would have thought that above ACC copy and update on the ! device is the same as OMP loop + update device command below, but it seems not, and winds still in field index 1 from prev inv_trans !!! @@ -1479,13 +1545,12 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& ! when run as gpnorm on cpu needs to avoid this update call IF(.NOT.LGPNORM) THEN #ifdef ACCGPU - !$ACC UPDATE DEVICE(PGLAT) -#endif -#ifdef OMPGPU - !$OMP TARGET UPDATE TO(PGLAT) +!! All data is updated on device already. PJM 9/05/2023 +! !$ACC UPDATE DEVICE(PGLAT) #endif #ifdef ACCGPU - !$ACC WAIT +!! I don't understand the need for this. PJM 9/05/2023 +! !$ACC WAIT #endif ENDIF @@ -1501,6 +1566,6 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,& IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR) IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) - + END SUBROUTINE TRGTOL END MODULE TRGTOL_MOD