Skip to content

Commit

Permalink
Direct transform optimizations
Browse files Browse the repository at this point in the history
  • Loading branch information
PaulMullowney committed Sep 13, 2023
1 parent e10f104 commit b2d6d47
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 60 deletions.
12 changes: 6 additions & 6 deletions src/trans/gpu/internal/ftdir_ctl_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,6 @@ SUBROUTINE FTDIR_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS, &
USE FTDIR_MOD ,ONLY : FTDIR
use ieee_arithmetic
!

IMPLICIT NONE


Expand All @@ -91,7 +90,6 @@ END SUBROUTINE cudaProfilerStop
END INTERFACE

! Dummy arguments

INTEGER(KIND=JPIM),INTENT(IN) :: KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETUV(:)
INTEGER(KIND=JPIM) ,OPTIONAL, INTENT(IN) :: KVSETSC(:)
Expand Down Expand Up @@ -204,14 +202,14 @@ END SUBROUTINE cudaProfilerStop
!$ACC END DATA
#endif
#else
CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,&
&PGP,PGPUV,PGP3A,PGP3B,PGP2)
#ifdef ACCGPU
!$ACC UPDATE DEVICE(ZGTF)
!$ACC DATA CREATE(ZGTF)
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE TO(ZGTF)
!$OMP TARGET DATA MAP(ALLOC:ZGTF)
#endif
CALL TRGTOL(ZGTF,KF_FS,KF_GP,KF_SCALARS_G,IVSET,KPTRGP,&
&PGP,PGPUV,PGP3A,PGP3B,PGP2)
#endif

CALL GSTATS(158,1)
Expand Down Expand Up @@ -254,9 +252,11 @@ END SUBROUTINE cudaProfilerStop
#ifndef USE_CUDA_AWARE_MPI_FT
#ifdef ACCGPU
!$ACC UPDATE HOST(FOUBUF_IN)
!$ACC END DATA !!ZGTF
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE FROM(FOUBUF_IN)
!$OMP END TARGET DATA !!ZGTF
#endif
#endif

Expand Down
173 changes: 119 additions & 54 deletions src/trans/gpu/internal/trgtol_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -932,7 +932,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
&ISETV, ISEND, ISENDSET, ITAG, J, JBLK, JFLD, &
&JGL, JK, JL, JLOOP, ISETW, IFLD, &
&II,INDOFFX,IBUFLENS,IBUFLENR,INRECV, IPROC,IFLDS, &
&INSEND,INS,INR,IR, iunit
&INSEND,INS,INR,IR, iunit, JKL, JK_MAX

! LOCAL LOGICAL SCALARS
LOGICAL :: LLPGPUV,LLPGP3A,LLPGP3B,LLPGP2,LLPGPONLY
Expand Down Expand Up @@ -1221,11 +1221,28 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
IF (IBUFLENR > 0) ALLOCATE(ZCOMBUFR(-1:IRECVCOUNT,INRECV))

CALL GSTATS(1805,1)

! Send loop.............................................................

! Copy local contribution


#ifdef ACCGPU
!$ACC DATA COPYIN(KPTRGP,INDOFF,KINDEX)
!$ACC DATA COPYIN(PGP)
!$ACC DATA COPYIN(PGPUV,IUVLEVS,IUVPARS)
!$ACC DATA COPYIN(PGP2,IGP2PARS)
!$ACC DATA COPYIN(PGP3A,IGP3ALEVS,IGP3APARS)
!$ACC DATA COPYIN(PGP3B,IGP3BLEVS,IGP3BPARS)
#endif
#ifdef OMPGPU
!$OMP TARGET DATA MAP(TO:INDOFF,KINDEX,KPTRGP)
!$OMP TARGET DATA MAP(TO:PGP)
!$OMP TARGET DATA MAP(TO:PGPUV,IUVLEVS,IUVPARS)
!$OMP TARGET DATA MAP(TO:PGP2,IGP2PARS)
!$OMP TARGET DATA MAP(TO:PGP3A,IGP3ALEVS,IGP3APARS)
!$OMP TARGET DATA MAP(TO:PGP3B,IGP3BLEVS,IGP3BPARS)
#endif

IF(ISENDTOT(MYPROC) > 0 )THEN
IFLDS = 0
DO JFLD=1,KF_GP
Expand All @@ -1240,67 +1257,102 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
ENDDO

IPOS=0
JK_MAX = 0
DO JBLK=1,NGPBLKS
IGPTROFF(JBLK)=IPOS
IFIRST = IGPTRSEND(1,JBLK,MYSETW)
IF(IFIRST > 0) THEN
ILAST = IGPTRSEND(2,JBLK,MYSETW)
IPOS=IPOS+ILAST-IFIRST+1
IF (JK_MAX<(ILAST-IFIRST+1)) JK_MAX = (ILAST-IFIRST+1)
ENDIF
ENDDO

#ifdef ACCGPU
!$ACC DATA COPYIN(IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND)
#endif
#ifdef OMPGPU
!$OMP TARGET DATA MAP(TO:IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND)
#endif
CALL GSTATS(1601,0)
#ifdef NECSX
!$OMP PARALLEL DO SCHEDULE(DYNAMIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST)
#else
!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JFLD,JBLK,JK,IFLD,IPOS,IFIRST,ILAST)
IF(LLPGPONLY) THEN
#ifdef ACCGPU
!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK)
#endif
DO JBLK=1,NGPBLKS
IFIRST = IGPTRSEND(1,JBLK,MYSETW)
IF(IFIRST > 0) THEN
ILAST = IGPTRSEND(2,JBLK,MYSETW)
IF(LLPGPONLY) THEN
DO JK=IFIRST,ILAST
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
DO JFLD=1,IFLDS
#ifdef OMPGPU
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK)
#endif
DO JBLK=1,NGPBLKS
DO JFLD=1,IFLDS
DO JKL=1, JK_MAX
IFIRST = IGPTRSEND(1,JBLK,MYSETW)
ILAST = IGPTRSEND(2,JBLK,MYSETW)
JK = JKL+IFIRST-1
IF(IFIRST > 0 .AND. JK <= ILAST) THEN
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
IFLD = IFLDOFF(JFLD)
PGLAT(JFLD,KINDEX(IPOS)) = PGP(JK,IFLD,JBLK)
ENDDO
ENDIF
ENDDO
ELSE
DO JFLD=1,IFLDS
IFLD = IFLDOFF(JFLD)
IF(LLUV(IFLD)) THEN
DO JK=IFIRST,ILAST
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
ENDDO
ENDDO
#ifdef OMPGPU
!$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
ELSE
#ifdef ACCGPU
!$ACC PARALLEL LOOP COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK)
#endif
#ifdef OMPGPU
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO COLLAPSE(3) PRIVATE(IPOS,IFIRST,ILAST,IFLD,JK)
#endif
DO JBLK=1,NGPBLKS
DO JFLD=1,IFLDS
DO JKL=1, JK_MAX
IFIRST = IGPTRSEND(1,JBLK,MYSETW)
ILAST = IGPTRSEND(2,JBLK,MYSETW)
JK = JKL+IFIRST-1
IF(IFIRST > 0 .AND. JK <= ILAST) THEN
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
IFLD = IFLDOFF(JFLD)
IF(LLUV(IFLD)) THEN
PGLAT(JFLD,KINDEX(IPOS)) = PGPUV(JK,IUVLEVS(IFLD),IUVPARS(IFLD),JBLK)
ENDDO
ELSEIF(LLGP2(IFLD)) THEN
DO JK=IFIRST,ILAST
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
ELSEIF(LLGP2(IFLD)) THEN
PGLAT(JFLD,KINDEX(IPOS)) = PGP2(JK,IGP2PARS(IFLD),JBLK)
ENDDO
ELSEIF(LLGP3A(IFLD)) THEN
DO JK=IFIRST,ILAST
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
ELSEIF(LLGP3A(IFLD)) THEN
PGLAT(JFLD,KINDEX(IPOS)) = PGP3A(JK,IGP3ALEVS(IFLD),IGP3APARS(IFLD),JBLK)
ENDDO
ELSEIF(LLGP3B(IFLD)) THEN
DO JK=IFIRST,ILAST
IPOS = INDOFF(MYPROC)+IGPTROFF(JBLK)+JK-IFIRST+1
ELSEIF(LLGP3B(IFLD)) THEN
PGLAT(JFLD,KINDEX(IPOS)) = PGP3B(JK,IGP3BLEVS(IFLD),IGP3BPARS(IFLD),JBLK)
ENDDO
ELSE
CALL ABORT_TRANS('TRLTOG_MOD: ERROR')
ENDIF
ENDIF
ENDDO
ENDIF
ENDIF
ENDDO
!$OMP END PARALLEL DO
ENDDO
ENDDO
#ifdef OMPGPU
!$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO
#endif
ENDIF
#ifdef ACCGPU
!$ACC END DATA !! IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND
#endif
#ifdef OMPGPU
!$OMP END TARGET DATA !! IFLDOFF,IGPTROFF,LLGP2,LLGP3A,LLGP3B,LLUV,IGPTRSEND
#endif
CALL GSTATS(1601,1)

ENDIF
#ifdef ACCGPU
!$ACC END DATA !! PRESENT(PGP3B)
!$ACC END DATA !! PRESENT(PGP3A)
!$ACC END DATA !! PRESENT(PGP2)
!$ACC END DATA !! PRESENT(PGPUV)
!$ACC END DATA !! PRESENT(PGP)
#endif
#ifdef OMPGPU
!$OMP END TARGET DATA !! PRESENT(PGP3B)
!$OMP END TARGET DATA !! PRESENT(PGP3A)
!$OMP END TARGET DATA !! PRESENT(PGP2)
!$OMP END TARGET DATA !! PRESENT(PGPUV)
!$OMP END TARGET DATA !! PRESENT(PGP)
#endif

#ifdef COMVERBOSE
call MPI_BARRIER(MPI_COMM_WORLD,IERROR)
Expand Down Expand Up @@ -1409,6 +1461,7 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
call MPI_BARRIER(MPI_COMM_WORLD,IERROR)
Tc=TIMEF()
#endif

! Receive loop.........................................................
DO INR=1,INRECV
IR=IR+1
Expand Down Expand Up @@ -1455,8 +1508,14 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&

CALL GSTATS(1603,0)


!$OMP PARALLEL DO SCHEDULE(STATIC) PRIVATE(JL,II,JFLD,INR,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END,IPOS)
#ifdef ACCGPU
!$ACC DATA COPYIN(JRECV,ZCOMBUFR,IRECVTOT)
!$ACC PARALLEL LOOP PRIVATE(II,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END)
#endif
#ifdef OMPGPU
!$OMP TARGET DATA MAP(TO:JRECV,ZCOMBUFR,IRECVTOT)
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO PRIVATE(II,IRECV,ILEN,IRECV_FLD_START,IRECV_FLD_END)
#endif
DO INR=1,INRECV
IRECV=JRECV(INR)
ILEN = IRECVTOT(IRECV)/KF_FS
Expand All @@ -1468,9 +1527,16 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
PGLAT(JFLD,II) = ZCOMBUFR(JL+(JFLD-IRECV_FLD_START)*ILEN,INR)
ENDDO
ENDDO
ENDDO
!$OMP END PARALLEL DO

ENDDO
#ifdef ACCGPU
!$ACC END DATA !! JRECV,ZCOMBUFR,IRECVTOT
!$ACC END DATA !! INDOFF KINDEX KPTRGP
#endif
#ifdef OMPGPU
!$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO
!$OMP END TARGET DATA !! JRECV,ZCOMBUFR,IRECVTOT
!$OMP END TARGET DATA !! INDOFF KINDEX KPTRGP
#endif
! this appears to be important (otherwise, old data picked in PGLAT)
! in particular, one would have thought that above ACC copy and update on the
! device is the same as OMP loop + update device command below, but it seems not, and winds still in field index 1 from prev inv_trans !!!
Expand All @@ -1479,13 +1545,12 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
! when run as gpnorm on cpu needs to avoid this update call
IF(.NOT.LGPNORM) THEN
#ifdef ACCGPU
!$ACC UPDATE DEVICE(PGLAT)
#endif
#ifdef OMPGPU
!$OMP TARGET UPDATE TO(PGLAT)
!! All data is updated on device already. PJM 9/05/2023
! !$ACC UPDATE DEVICE(PGLAT)
#endif
#ifdef ACCGPU
!$ACC WAIT
!! I don't understand the need for this. PJM 9/05/2023
! !$ACC WAIT
#endif
ENDIF

Expand All @@ -1501,6 +1566,6 @@ SUBROUTINE TRGTOL(PGLAT,KF_FS,KF_GP,KF_SCALARS_G,KVSET,KPTRGP,&
IF (IBUFLENR > 0) DEALLOCATE(ZCOMBUFR)

IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE)

END SUBROUTINE TRGTOL
END MODULE TRGTOL_MOD

0 comments on commit b2d6d47

Please sign in to comment.